On Tue, 2006-09-19 at 13:13 +1000, Rusty Russell wrote: > Hi Jeremy, all, > > Sorry this took so long, spent last week in Japan at OSDL conf then > netconf. After several false starts, I ended up with a very simple > implementation, which clashes significantly with your work since then > 8(. I've pushed the patches anyway, but it's going to be significant > work for me to re-merge them, so I wanted your feedback first. OK, here's a patch against 2.6.18-rc6-mm2. Tested on UP and SMP. Crashes on hotplugging CPU, but crashes in same way as before the patch 8). Replace PDA with per-cpu section, and put GDT in per-cpu section. This patch uses the "gs" segment register which Jeremy Fitzhardinge freed up for kernel use, for the per-cpu section. This means that instead of having a special per-cpu struct which we can access in a single instruction, any per-cpu variable can be accessed in a single instruction. In addition, it avoids introducing the concept of a "pda" into the kernel, in favour of the well-known "percpu" concept. So, arch-specific code (eg. smp_processor_id()) can use x86_write_percpu()/x86_read_percpu() directly. Generic code expects an lvalue from __get_cpu_var(), but it takes two instructions to get the address of a per-cpu variable (still not bad). Ideally, we could use the __thread extension, and GCC would then generate optimal code when an lvalue isn't needed, however, the linker wants to use a negative offset within the gs register, which cannot be used with Xen (or any similar hypervisor), because it requires a 4GB segment, which would allow the OS to access the hypervisor memory. As an additional simplification, the GDT is placed directly in a per-cpu variable, rather than allocated dynamically. This is optimal for the UP case (previously, we made a copy even here), and significantly simplifies the code. 
It's a little unusual to have asm access a per-cpu var, but it is only done early at boot, where the per-cpu GDT is sitting in the to-be-discarded section. More cleanups/optimizations are possible: 1) Don't save/restore %gs on UP. The cost is measurable, and we don't use it. 2) Remove early_smp_processor_id(), by setting up the per-cpu processor_id field correctly before starting a CPU. 3) Similarly, get rid of early_current(). 4) Implement cpu_local_* in terms of x86_read_percpu etc. Signed-off-by: Rusty Russell <rusty at rustcorp.com.au> diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/dontdiff --minimal linux-2.6.18-rc6-mm2/arch/i386/kernel/cpu/common.c working-2.6.18-rc6-mm2-pda-to-percpu/arch/i386/kernel/cpu/common.c --- linux-2.6.18-rc6-mm2/arch/i386/kernel/cpu/common.c 2006-09-19 14:54:22.000000000 +1000 +++ working-2.6.18-rc6-mm2-pda-to-percpu/arch/i386/kernel/cpu/common.c 2006-09-19 15:27:29.000000000 +1000 @@ -19,18 +19,14 @@ #include <asm/apic.h> #include <mach_apic.h> #endif -#include <asm/pda.h> #include "cpu.h" -DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); -EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr); - DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]); EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack); -struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly; -EXPORT_SYMBOL(_cpu_pda); +DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); static int cachesize_override __cpuinitdata = -1; static int disable_x86_fxsr __cpuinitdata; @@ -592,141 +587,10 @@ void __init early_cpu_init(void) struct pt_regs * __devinit idle_regs(struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); - regs->xgs = __KERNEL_PDA; + regs->xgs = __KERNEL_PERCPU; return regs; } -__cpuinit int alloc_gdt(int cpu) -{ - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); - struct desc_struct *gdt; - struct i386_pda *pda; - - gdt = (struct desc_struct *)cpu_gdt_descr->address; - pda = 
cpu_pda(cpu); - - /* - * This is a horrible hack to allocate the GDT. The problem - * is that cpu_init() is called really early for the boot CPU - * (and hence needs bootmem) but much later for the secondary - * CPUs, when bootmem will have gone away - */ - if (NODE_DATA(0)->bdata->node_bootmem_map) { - BUG_ON(gdt != NULL || pda != NULL); - - gdt = alloc_bootmem_pages(PAGE_SIZE); - pda = alloc_bootmem(sizeof(*pda)); - /* alloc_bootmem(_pages) panics on failure, so no check */ - - memset(gdt, 0, PAGE_SIZE); - memset(pda, 0, sizeof(*pda)); - } else { - /* GDT and PDA might already have been allocated if - this is a CPU hotplug re-insertion. */ - if (gdt == NULL) - gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL); - - if (pda == NULL) - pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu)); - - if (unlikely(!gdt || !pda)) { - free_pages((unsigned long)gdt, 0); - kfree(pda); - return 0; - } - } - - cpu_gdt_descr->address = (unsigned long)gdt; - cpu_pda(cpu) = pda; - - return 1; -} - -static __cpuinit void pda_init(int cpu, struct task_struct *curr) -{ - struct i386_pda *pda = cpu_pda(cpu); - - memset(pda, 0, sizeof(*pda)); - - pda->cpu_number = cpu; - pda->pcurrent = curr; - - printk("cpu %d current %p\n", cpu, curr); -} - -static inline void set_kernel_gs(void) -{ - /* Set %gs for this CPU's PDA. Memory clobber is to create a - barrier with respect to any PDA operations, so the compiler - doesn't move any before here. */ - asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory"); -} - -/* Initialize the CPU's GDT and PDA */ -static __cpuinit void init_gdt(void) -{ - int cpu = early_smp_processor_id(); - struct task_struct *curr = early_current(); - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); - __u32 stk16_off = (__u32)&per_cpu(cpu_16bit_stack, cpu); - struct desc_struct *gdt; - struct i386_pda *pda; - - /* For non-boot CPUs, the GDT and PDA should already have been - allocated. 
*/ - if (!alloc_gdt(cpu)) { - printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu); - for (;;) - local_irq_enable(); - } - - gdt = (struct desc_struct *)cpu_gdt_descr->address; - pda = cpu_pda(cpu); - - BUG_ON(gdt == NULL || pda == NULL); - - /* - * Initialize the per-CPU GDT with the boot GDT, - * and set up the GDT descriptor: - */ - memcpy(gdt, cpu_gdt_table, GDT_SIZE); - cpu_gdt_descr->size = GDT_SIZE - 1; - - /* Set up GDT entry for 16bit stack */ - *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |= - ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) | - ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) | - (CPU_16BIT_STACK_SIZE - 1); - - pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a, - (u32 *)&gdt[GDT_ENTRY_PDA].b, - (unsigned long)pda, sizeof(*pda) - 1, - 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */ - - load_gdt(cpu_gdt_descr); - set_kernel_gs(); - - /* Do this once everything GDT-related has been set up. */ - pda_init(cpu, curr); -} - -/* Set up a very early PDA for the boot CPU so that smp_processor_id() - and current will work. */ -void __init smp_setup_processor_id(void) -{ - static __initdata struct i386_pda boot_pda; - - pack_descriptor((u32 *)&cpu_gdt_table[GDT_ENTRY_PDA].a, - (u32 *)&cpu_gdt_table[GDT_ENTRY_PDA].b, - (unsigned long)&boot_pda, sizeof(struct i386_pda) - 1, - 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */ - - boot_pda.pcurrent = early_current(); - - /* Set %gs for this CPU's PDA */ - set_kernel_gs(); -} - /* * cpu_init() initializes state that is per-CPU. 
Some data is already * initialized (naturally) in the bootstrap process, such as the GDT @@ -740,15 +604,27 @@ void __cpuinit cpu_init(void) struct tss_struct * t = &per_cpu(init_tss, cpu); struct thread_struct *thread = &curr->thread; + struct desc_struct *gdt; + u32 stk16_off; if (cpu_test_and_set(cpu, cpu_initialized)) { printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); for (;;) local_irq_enable(); } - /* Init the GDT and PDA early, before calling printk(), - since it may end up using the PDA indirectly. */ - init_gdt(); + /* Complete percpu area setup early, before calling printk(), + since it may end up using it indirectly. */ + setup_percpu_for_this_cpu(cpu); + /* FIXME: Always the idle thread, can get rid of early_current. */ + __get_cpu_var(current_task) = curr; + + /* Set up GDT entry for 16bit stack */ + stk16_off = (u32)&__get_cpu_var(cpu_16bit_stack); + gdt = __get_cpu_var(cpu_gdt_table); + *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |= + ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) | + ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) | + (CPU_16BIT_STACK_SIZE - 1); printk(KERN_INFO "Initializing CPU#%d\n", cpu); diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/dontdiff --minimal linux-2.6.18-rc6-mm2/arch/i386/kernel/entry.S working-2.6.18-rc6-mm2-pda-to-percpu/arch/i386/kernel/entry.S --- linux-2.6.18-rc6-mm2/arch/i386/kernel/entry.S 2006-09-19 14:54:23.000000000 +1000 +++ working-2.6.18-rc6-mm2-pda-to-percpu/arch/i386/kernel/entry.S 2006-09-19 15:26:28.000000000 +1000 @@ -125,7 +125,7 @@ VM_MASK = 0x00020000 movl $(__USER_DS), %edx; \ movl %edx, %ds; \ movl %edx, %es; \ - movl $(__KERNEL_PDA), %edx; \ + movl $(__KERNEL_PERCPU), %edx; \ movl %edx, %gs #define RESTORE_INT_REGS \ @@ -638,7 +638,7 @@ error_code: movl $(__USER_DS), %ecx movl %ecx, %ds movl %ecx, %es - movl $(__KERNEL_PDA), %ecx + movl $(__KERNEL_PERCPU), %ecx movl %ecx, %gs movl %esp,%eax # pt_regs pointer call *%edi diff -urpN --exclude TAGS -X 
/home/rusty/devel/kernel/kernel-patches/dontdiff --minimal linux-2.6.18-rc6-mm2/arch/i386/kernel/head.S working-2.6.18-rc6-mm2-pda-to-percpu/arch/i386/kernel/head.S --- linux-2.6.18-rc6-mm2/arch/i386/kernel/head.S 2006-09-19 14:54:23.000000000 +1000 +++ working-2.6.18-rc6-mm2-pda-to-percpu/arch/i386/kernel/head.S 2006-09-19 15:23:48.000000000 +1000 @@ -302,7 +302,7 @@ is386: movl $2,%ecx # set MP movl %eax,%cr0 call check_x87 - lgdt cpu_gdt_descr + lgdt per_cpu__cpu_gdt_descr lidt idt_descr ljmp $(__KERNEL_CS),$1f 1: movl $(__KERNEL_DS),%eax # reload all the segment registers @@ -523,12 +523,6 @@ idt_descr: .word IDT_ENTRIES*8-1 # idt contains 256 entries .long idt_table -# boot GDT descriptor (later on used by CPU#0): - .word 0 # 32 bit align gdt_desc.address -cpu_gdt_descr: - .word GDT_ENTRIES*8-1 - .long cpu_gdt_table - /* * The boot_gdt_table must mirror the equivalent in setup.S and is * used only for booting. @@ -539,55 +533,3 @@ ENTRY(boot_gdt_table) .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */ .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */ -/* - * The Global Descriptor Table contains 28 quadwords, per-CPU. 
- */ - .align L1_CACHE_BYTES -ENTRY(cpu_gdt_table) - .quad 0x0000000000000000 /* NULL descriptor */ - .quad 0x0000000000000000 /* 0x0b reserved */ - .quad 0x0000000000000000 /* 0x13 reserved */ - .quad 0x0000000000000000 /* 0x1b reserved */ - .quad 0x0000000000000000 /* 0x20 unused */ - .quad 0x0000000000000000 /* 0x28 unused */ - .quad 0x0000000000000000 /* 0x33 TLS entry 1 */ - .quad 0x0000000000000000 /* 0x3b TLS entry 2 */ - .quad 0x0000000000000000 /* 0x43 TLS entry 3 */ - .quad 0x0000000000000000 /* 0x4b reserved */ - .quad 0x0000000000000000 /* 0x53 reserved */ - .quad 0x0000000000000000 /* 0x5b reserved */ - - .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */ - .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */ - .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */ - .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */ - - .quad 0x0000000000000000 /* 0x80 TSS descriptor */ - .quad 0x0000000000000000 /* 0x88 LDT descriptor */ - - /* - * Segments used for calling PnP BIOS have byte granularity. - * They code segments and data segments have fixed 64k limits, - * the transfer segment sizes are set at run time. - */ - .quad 0x00409a000000ffff /* 0x90 32-bit code */ - .quad 0x00009a000000ffff /* 0x98 16-bit code */ - .quad 0x000092000000ffff /* 0xa0 16-bit data */ - .quad 0x0000920000000000 /* 0xa8 16-bit data */ - .quad 0x0000920000000000 /* 0xb0 16-bit data */ - - /* - * The APM segments have byte granularity and their bases - * are set at run time. All have 64k limits. 
- */ - .quad 0x00409a000000ffff /* 0xb8 APM CS code */ - .quad 0x00009a000000ffff /* 0xc0 APM CS 16 code (16 bit) */ - .quad 0x004092000000ffff /* 0xc8 APM DS data */ - - .quad 0x0000920000000000 /* 0xd0 - ESPFIX 16-bit SS */ - .quad 0x0000000000000000 /* 0xd8 - PDA */ - .quad 0x0000000000000000 /* 0xe0 - unused */ - .quad 0x0000000000000000 /* 0xe8 - unused */ - .quad 0x0000000000000000 /* 0xf0 - unused */ - .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */ - diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/dontdiff --minimal linux-2.6.18-rc6-mm2/arch/i386/kernel/process.c working-2.6.18-rc6-mm2-pda-to-percpu/arch/i386/kernel/process.c --- linux-2.6.18-rc6-mm2/arch/i386/kernel/process.c 2006-09-19 14:54:24.000000000 +1000 +++ working-2.6.18-rc6-mm2-pda-to-percpu/arch/i386/kernel/process.c 2006-09-19 15:26:28.000000000 +1000 @@ -38,6 +38,7 @@ #include <linux/ptrace.h> #include <linux/random.h> #include <linux/personality.h> +#include <linux/percpu.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -56,7 +57,6 @@ #include <asm/tlbflush.h> #include <asm/cpu.h> -#include <asm/pda.h> asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -345,7 +345,7 @@ int kernel_thread(int (*fn)(void *), voi regs.xds = __USER_DS; regs.xes = __USER_DS; - regs.xgs = __KERNEL_PDA; + regs.xgs = __KERNEL_PERCPU; regs.orig_eax = -1; regs.eip = (unsigned long) kernel_thread_helper; regs.xcs = __KERNEL_CS | get_kernel_rpl(); @@ -684,7 +684,7 @@ struct task_struct fastcall * __switch_t if (unlikely(prev->fs | next->fs)) loadsegment(fs, next->fs); - write_pda(pcurrent, next_p); + x86_write_percpu(current_task, next_p); /* * Restore IOPL if needed. 
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/dontdiff --minimal linux-2.6.18-rc6-mm2/arch/i386/kernel/setup.c working-2.6.18-rc6-mm2-pda-to-percpu/arch/i386/kernel/setup.c --- linux-2.6.18-rc6-mm2/arch/i386/kernel/setup.c 2006-09-19 14:54:24.000000000 +1000 +++ working-2.6.18-rc6-mm2-pda-to-percpu/arch/i386/kernel/setup.c 2006-09-19 15:26:28.000000000 +1000 @@ -1470,6 +1470,52 @@ void __init setup_arch(char **cmdline_p) tsc_init(); } +/* + * The Global Descriptor Table contains 28 quadwords, per-CPU. + */ +__attribute__((aligned(L1_CACHE_BYTES))) +DEFINE_PER_CPU(struct desc_struct, cpu_gdt_table[GDT_ENTRIES]) = +{ + /* kernel 4GB code at 0x00000000 */ + [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 }, + /* kernel 4GB data at 0x00000000 */ + [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 }, + /* user 4GB code at 0x00000000 */ + [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 }, + /* user 4GB data at 0x00000000 */ + [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 }, + /* + * Segments used for calling PnP BIOS have byte granularity. + * They code segments and data segments have fixed 64k limits, + * the transfer segment sizes are set at run time. + */ + [GDT_ENTRY_PNPBIOS_BASE] = + { 0x0000ffff, 0x00409a00 }, /* 32-bit code */ + { 0x0000ffff, 0x00009a00 }, /* 16-bit code */ + { 0x0000ffff, 0x00009200 }, /* 16-bit data */ + { 0x00000000, 0x00009200 }, /* 16-bit data */ + { 0x00000000, 0x00009200 }, /* 16-bit data */ + + /* + * The APM segments have byte granularity and their bases + * are set at run time. All have 64k limits. + */ + [GDT_ENTRY_APMBIOS_BASE] = + { 0x0000ffff, 0x00409a00 }, /* APM CS code */ + { 0x0000ffff, 0x00009a00 }, /* APM CS 16 code (16 bit) */ + { 0x0000ffff, 0x00409200 }, /* APM DS data */ + + /* ESPFIX 16-bit SS */ + [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00009200 }, + /* FIXME: We save/restore %gs even on UP: fix entry.S. 
*/ + [GDT_ENTRY_PERCPU] = { 0x0000ffff, 0x00cf9200 }, +}; + +/* Early in boot we use the master per-cpu gdt_table directly. */ +DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr) += { .size = GDT_ENTRIES*8-1, .address = (long)&per_cpu__cpu_gdt_table }; +EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr); + static __init int add_pcspkr(void) { struct platform_device *pd; diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/dontdiff --minimal linux-2.6.18-rc6-mm2/arch/i386/kernel/smpboot.c working-2.6.18-rc6-mm2-pda-to-percpu/arch/i386/kernel/smpboot.c --- linux-2.6.18-rc6-mm2/arch/i386/kernel/smpboot.c 2006-09-19 14:54:24.000000000 +1000 +++ working-2.6.18-rc6-mm2-pda-to-percpu/arch/i386/kernel/smpboot.c 2006-09-19 15:26:28.000000000 +1000 @@ -60,6 +60,9 @@ /* Set if we find a B stepping CPU */ static int __devinitdata smp_b_stepping; +DEFINE_PER_CPU(unsigned int, processor_id); +EXPORT_PER_CPU_SYMBOL(processor_id); + /* Number of siblings per CPU package */ int smp_num_siblings = 1; #ifdef CONFIG_X86_HT @@ -104,6 +107,9 @@ EXPORT_SYMBOL(x86_cpu_to_apicid); u8 apicid_2_node[MAX_APICID]; +DEFINE_PER_CPU(unsigned long, this_cpu_off); +EXPORT_PER_CPU_SYMBOL(this_cpu_off); + /* * Trampoline 80x86 program as an array. */ @@ -934,14 +940,6 @@ static int __devinit do_boot_cpu(int api unsigned long start_eip; unsigned short nmi_high = 0, nmi_low = 0; - /* Pre-allocate the CPU's GDT and PDA so it doesn't have to do - any memory allocation during the delicate CPU-bringup - phase. */ - if (!alloc_gdt(cpu)) { - printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu); - return -1; /* ? 
*/ - } - ++cpucount; alternatives_smp_switch(1); @@ -1072,7 +1070,6 @@ static int __cpuinit __smp_prepare_cpu(i struct warm_boot_cpu_info info; struct work_struct task; int apicid, ret; - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); apicid = x86_cpu_to_apicid[cpu]; if (apicid == BAD_APICID) { @@ -1080,18 +1077,6 @@ static int __cpuinit __smp_prepare_cpu(i goto exit; } - /* - * the CPU isn't initialized at boot time, allocate gdt table here. - * cpu_init will initialize it - */ - if (!cpu_gdt_descr->address) { - cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL); - if (!cpu_gdt_descr->address) - printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu); - ret = -ENOMEM; - goto exit; - } - info.complete = &done; info.apicid = apicid; info.cpu = cpu; @@ -1330,6 +1315,37 @@ static void __init smp_boot_cpus(unsigne synchronize_tsc_bp(); } +static inline void set_kernel_gs(void) +{ + /* Set %gs for this CPU's per-cpu area. Memory clobber is to create a + barrier with respect to any per-cpu operations, so the compiler + doesn't move any before here. */ + asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PERCPU) : "memory"); +} + +static __cpuinit void setup_percpu_descriptor(struct desc_struct *gdt, + unsigned long per_cpu_off) +{ + unsigned limit, flags; + + limit = (1 << 20); + flags = 0x8; /* 4k granularity */ + + /* present read-write data segment */ + pack_descriptor((u32 *)&gdt->a, (u32 *)&gdt->b, + per_cpu_off, limit - 1, + 0x80 | DESCTYPE_S | 0x2, flags); +} + +/* Set up a very early per-cpu for the boot CPU so that smp_processor_id() + and current will work. */ +void __init smp_setup_processor_id(void) +{ + /* We use the per-cpu template area (__per_cpu_offset[0] == 0). */ + __per_cpu_offset[0] = 0; + setup_percpu_for_this_cpu(0); +} + /* These are wrappers to interface to the new boot process. Someone who understands all this stuff should rewrite it properly. 
--RR 15/Jul/02 */ void __init smp_prepare_cpus(unsigned int max_cpus) @@ -1340,8 +1356,26 @@ void __init smp_prepare_cpus(unsigned in smp_boot_cpus(max_cpus); } +/* Be careful not to use %gs references until this is setup: needs to + * be done on this CPU. */ +void __init setup_percpu_for_this_cpu(unsigned int cpu) +{ + struct desc_struct *gdt = per_cpu(cpu_gdt_table, cpu); + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); + + per_cpu(processor_id, cpu) = cpu; + per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu]; + setup_percpu_descriptor(&gdt[GDT_ENTRY_PERCPU], __per_cpu_offset[cpu]); + cpu_gdt_descr->address = (unsigned long)gdt; + cpu_gdt_descr->size = GDT_SIZE - 1; + load_gdt(cpu_gdt_descr); + set_kernel_gs(); +} + void __devinit smp_prepare_boot_cpu(void) { + setup_percpu_for_this_cpu(0); + cpu_set(smp_processor_id(), cpu_online_map); cpu_set(smp_processor_id(), cpu_callout_map); cpu_set(smp_processor_id(), cpu_present_map); diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/dontdiff --minimal linux-2.6.18-rc6-mm2/include/asm-i386/current.h working-2.6.18-rc6-mm2-pda-to-percpu/include/asm-i386/current.h --- linux-2.6.18-rc6-mm2/include/asm-i386/current.h 2006-09-19 14:55:55.000000000 +1000 +++ working-2.6.18-rc6-mm2-pda-to-percpu/include/asm-i386/current.h 2006-09-19 15:26:28.000000000 +1000 @@ -2,7 +2,7 @@ #define _I386_CURRENT_H #include <linux/thread_info.h> -#include <asm/pda.h> +#include <asm/percpu.h> struct task_struct; @@ -11,11 +11,7 @@ static __always_inline struct task_struc return current_thread_info()->task; } -static __always_inline struct task_struct *get_current(void) -{ - return read_pda(pcurrent); -} - -#define current get_current() +DECLARE_PER_CPU(struct task_struct *, current_task); +#define current x86_read_percpu(current_task) #endif /* !(_I386_CURRENT_H) */ diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/dontdiff --minimal linux-2.6.18-rc6-mm2/include/asm-i386/desc.h 
working-2.6.18-rc6-mm2-pda-to-percpu/include/asm-i386/desc.h --- linux-2.6.18-rc6-mm2/include/asm-i386/desc.h 2006-09-19 14:55:55.000000000 +1000 +++ working-2.6.18-rc6-mm2-pda-to-percpu/include/asm-i386/desc.h 2006-09-19 15:23:48.000000000 +1000 @@ -14,8 +14,8 @@ #include <asm/mmu.h> -extern struct desc_struct cpu_gdt_table[GDT_ENTRIES]; - +DECLARE_PER_CPU(struct desc_struct, cpu_gdt_table[GDT_ENTRIES]); +DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]); struct Xgt_desc_struct { @@ -25,8 +25,6 @@ struct Xgt_desc_struct { } __attribute__ ((packed)); extern struct Xgt_desc_struct idt_descr; -DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); - static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) { diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/dontdiff --minimal linux-2.6.18-rc6-mm2/include/asm-i386/pda.h working-2.6.18-rc6-mm2-pda-to-percpu/include/asm-i386/pda.h --- linux-2.6.18-rc6-mm2/include/asm-i386/pda.h 2006-09-19 14:55:56.000000000 +1000 +++ working-2.6.18-rc6-mm2-pda-to-percpu/include/asm-i386/pda.h 1970-01-01 10:00:00.000000000 +1000 @@ -1,68 +0,0 @@ -#ifndef _I386_PDA_H -#define _I386_PDA_H - -struct i386_pda -{ - struct task_struct *pcurrent; /* current process */ - int cpu_number; -}; - -extern struct i386_pda *_cpu_pda[]; - -#define cpu_pda(i) (_cpu_pda[i]) - -#define pda_offset(field) offsetof(struct i386_pda, field) - -extern void __bad_pda_field(void); - -extern struct i386_pda _proxy_pda; - -#define pda_to_op(op,field,val) \ - do { \ - typedef typeof(_proxy_pda.field) T__; \ - if (0) { T__ tmp__; tmp__ = (val); } \ - switch (sizeof(_proxy_pda.field)) { \ - case 2: \ - asm(op "w %1,%%gs:%c2" \ - : "+m" (_proxy_pda.field) \ - :"ri" ((T__)val), \ - "i"(pda_offset(field))); \ - break; \ - case 4: \ - asm(op "l %1,%%gs:%c2" \ - : "+m" (_proxy_pda.field) \ - :"ri" ((T__)val), \ - "i"(pda_offset(field))); \ - break; \ - default: 
__bad_pda_field(); \ - } \ - } while (0) - -#define pda_from_op(op,field) \ - ({ \ - typeof(_proxy_pda.field) ret__; \ - switch (sizeof(_proxy_pda.field)) { \ - case 2: \ - asm(op "w %%gs:%c1,%0" \ - : "=r" (ret__) \ - : "i" (pda_offset(field)), \ - "m" (_proxy_pda.field)); \ - break; \ - case 4: \ - asm(op "l %%gs:%c1,%0" \ - : "=r" (ret__) \ - : "i" (pda_offset(field)), \ - "m" (_proxy_pda.field)); \ - break; \ - default: __bad_pda_field(); \ - } \ - ret__; }) - - -#define read_pda(field) pda_from_op("mov",field) -#define write_pda(field,val) pda_to_op("mov",field,val) -#define add_pda(field,val) pda_to_op("add",field,val) -#define sub_pda(field,val) pda_to_op("sub",field,val) -#define or_pda(field,val) pda_to_op("or",field,val) - -#endif /* _I386_PDA_H */ diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/dontdiff --minimal linux-2.6.18-rc6-mm2/include/asm-i386/percpu.h working-2.6.18-rc6-mm2-pda-to-percpu/include/asm-i386/percpu.h --- linux-2.6.18-rc6-mm2/include/asm-i386/percpu.h 2004-02-04 14:44:44.000000000 +1100 +++ working-2.6.18-rc6-mm2-pda-to-percpu/include/asm-i386/percpu.h 2006-09-19 15:26:28.000000000 +1000 @@ -1,6 +1,107 @@ #ifndef __ARCH_I386_PERCPU__ #define __ARCH_I386_PERCPU__ +#ifdef CONFIG_SMP +/* Same as generic implementation except for optimized local access. */ +#define __GENERIC_PER_CPU + +/* This is used for other cpus to find our section. */ +extern unsigned long __per_cpu_offset[NR_CPUS]; + +/* Separate out the type, so (int[3], foo) works. */ +#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name +#define DEFINE_PER_CPU(type, name) \ + __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name + +/* We can use this directly for local CPU (faster). 
*/ +DECLARE_PER_CPU(unsigned long, this_cpu_off); + +/* var is in discarded region: offset to particular copy we want */ +#define per_cpu(var, cpu) (*({ \ + extern int simple_indentifier_##var(void); \ + RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu]); })) + +#define __raw_get_cpu_var(var) (*({ \ + extern int simple_indentifier_##var(void); \ + RELOC_HIDE(&per_cpu__##var, x86_read_percpu(this_cpu_off)); \ +})) + +#define __get_cpu_var(var) __raw_get_cpu_var(var) + +/* A macro to avoid #include hell... */ +#define percpu_modcopy(pcpudst, src, size) \ +do { \ + unsigned int __i; \ + for_each_possible_cpu(__i) \ + memcpy((pcpudst)+__per_cpu_offset[__i], \ + (src), (size)); \ +} while (0) + +#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var) +#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var) + +/* gs segment starts at (positive) offset == __per_cpu_offset[cpu] */ +#define __percpu_seg "%%gs:" +#else /* !SMP */ #include <asm-generic/percpu.h> +#define __percpu_seg "" +#endif /* SMP */ + +/* For arch-specific code, we can use direct single-insn ops (they + * don't give an lvalue though). 
*/ +extern void __bad_percpu_size(void); + +#define percpu_to_op(op,var,val) \ + do { \ + typedef typeof(var) T__; \ + if (0) { T__ tmp__; tmp__ = (val); } \ + switch (sizeof(var)) { \ + case 1: \ + asm(op "b %1,"__percpu_seg"%0" \ + : "+m" (var) \ + :"ri" ((T__)val)); \ + break; \ + case 2: \ + asm(op "w %1,"__percpu_seg"%0" \ + : "+m" (var) \ + :"ri" ((T__)val)); \ + break; \ + case 4: \ + asm(op "l %1,"__percpu_seg"%0" \ + : "+m" (var) \ + :"ri" ((T__)val)); \ + break; \ + default: __bad_percpu_size(); \ + } \ + } while (0) + +#define percpu_from_op(op,var) \ + ({ \ + typeof(var) ret__; \ + switch (sizeof(var)) { \ + case 1: \ + asm(op "b "__percpu_seg"%1,%0" \ + : "=r" (ret__) \ + : "m" (var)); \ + break; \ + case 2: \ + asm(op "w "__percpu_seg"%1,%0" \ + : "=r" (ret__) \ + : "m" (var)); \ + break; \ + case 4: \ + asm(op "l "__percpu_seg"%1,%0" \ + : "=r" (ret__) \ + : "m" (var)); \ + break; \ + default: __bad_percpu_size(); \ + } \ + ret__; }) + +#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var) +#define x86_write_percpu(var,val) percpu_to_op("mov", per_cpu__##var, val) +#define x86_add_percpu(var,val) percpu_to_op("add", per_cpu__##var, val) +#define x86_sub_percpu(var,val) percpu_to_op("sub", per_cpu__##var, val) +#define x86_or_percpu(var,val) percpu_to_op("or", per_cpu__##var, val) #endif /* __ARCH_I386_PERCPU__ */ diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/dontdiff --minimal linux-2.6.18-rc6-mm2/include/asm-i386/processor.h working-2.6.18-rc6-mm2-pda-to-percpu/include/asm-i386/processor.h --- linux-2.6.18-rc6-mm2/include/asm-i386/processor.h 2006-09-19 14:55:56.000000000 +1000 +++ working-2.6.18-rc6-mm2-pda-to-percpu/include/asm-i386/processor.h 2006-09-19 15:26:28.000000000 +1000 @@ -473,7 +473,7 @@ struct thread_struct { .vm86_info = NULL, \ .sysenter_cs = __KERNEL_CS, \ .io_bitmap_ptr = NULL, \ - .gs = __KERNEL_PDA, \ + .gs = __KERNEL_PERCPU, \ } /* @@ -728,6 +728,5 @@ extern void select_idle_routine(const st 
extern unsigned long boot_option_idle_override; extern void enable_sep_cpu(void); extern int sysenter_setup(void); -extern int alloc_gdt(int cpu); #endif /* __ASM_I386_PROCESSOR_H */ diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/dontdiff --minimal linux-2.6.18-rc6-mm2/include/asm-i386/segment.h working-2.6.18-rc6-mm2-pda-to-percpu/include/asm-i386/segment.h --- linux-2.6.18-rc6-mm2/include/asm-i386/segment.h 2006-09-19 14:55:56.000000000 +1000 +++ working-2.6.18-rc6-mm2-pda-to-percpu/include/asm-i386/segment.h 2006-09-19 15:26:28.000000000 +1000 @@ -39,7 +39,7 @@ * 25 - APM BIOS support * * 26 - ESPFIX small SS - * 27 - PDA [ per-cpu private data area ] + * 27 - PERCPU [ offset segment for per-cpu area ] * 28 - unused * 29 - unused * 30 - unused @@ -74,8 +74,8 @@ #define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14) #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8) -#define GDT_ENTRY_PDA (GDT_ENTRY_KERNEL_BASE + 15) -#define __KERNEL_PDA (GDT_ENTRY_PDA * 8) +#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15) +#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8) #define GDT_ENTRY_DOUBLEFAULT_TSS 31 diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/dontdiff --minimal linux-2.6.18-rc6-mm2/include/asm-i386/smp.h working-2.6.18-rc6-mm2-pda-to-percpu/include/asm-i386/smp.h --- linux-2.6.18-rc6-mm2/include/asm-i386/smp.h 2006-09-19 14:55:56.000000000 +1000 +++ working-2.6.18-rc6-mm2-pda-to-percpu/include/asm-i386/smp.h 2006-09-19 15:27:59.000000000 +1000 @@ -8,7 +8,7 @@ #include <linux/kernel.h> #include <linux/threads.h> #include <linux/cpumask.h> -#include <asm/pda.h> +#include <asm/percpu.h> #endif #ifdef CONFIG_X86_LOCAL_APIC @@ -59,7 +59,8 @@ extern void cpu_uninit(void); * from the initial startup. We map APIC_BASE very early in page_setup(), * so this is correct in the x86 case. 
*/ -#define raw_smp_processor_id() (read_pda(cpu_number)) +DECLARE_PER_CPU(unsigned int, processor_id); +#define raw_smp_processor_id() (x86_read_percpu(processor_id)) /* This is valid from the very earliest point in boot that we care about. */ #define early_smp_processor_id() (current_thread_info()->cpu) @@ -93,6 +94,8 @@ extern int __cpu_disable(void); extern void __cpu_die(unsigned int cpu); extern unsigned int num_processors; +void setup_percpu_for_this_cpu(unsigned int cpu); + #endif /* !__ASSEMBLY__ */ #else /* CONFIG_SMP */ @@ -100,6 +103,7 @@ extern unsigned int num_processors; #define safe_smp_processor_id() 0 #define cpu_physical_id(cpu) boot_cpu_physical_apicid #define early_smp_processor_id() 0 +#define setup_percpu_for_this_cpu(cpu) #define NO_PROC_ID 0xFF /* No processor magic marker */ diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/dontdiff --minimal linux-2.6.18-rc6-mm2/include/asm-i386/unwind.h working-2.6.18-rc6-mm2-pda-to-percpu/include/asm-i386/unwind.h --- linux-2.6.18-rc6-mm2/include/asm-i386/unwind.h 2006-09-19 14:55:56.000000000 +1000 +++ working-2.6.18-rc6-mm2-pda-to-percpu/include/asm-i386/unwind.h 2006-09-19 15:26:28.000000000 +1000 @@ -65,7 +65,7 @@ static inline void arch_unw_init_blocked info->regs.xss = __KERNEL_DS; info->regs.xds = __USER_DS; info->regs.xes = __USER_DS; - info->regs.xgs = __KERNEL_PDA; + info->regs.xgs = __KERNEL_PERCPU; } extern asmlinkage int arch_unwind_init_running(struct unwind_frame_info *, -- Help! Save Australia from the worst of the DMCA: http://linux.org.au/law