We have our GDT in a page-sized per-cpu structure, gdt_page. On x86_64 kernel, GDT is 128 bytes - only ~3% of that page is used. It is page-sized because of paravirt. Hypervisors need to know when GDT is changed, so they remap it read-only and handle write faults. If it's not in its own page, other writes nearby will cause those faults too. In other words, we need GDT to live in a separate page only if CONFIG_HYPERVISOR_GUEST=y. This patch reduces GDT alignment to cacheline-aligned if CONFIG_HYPERVISOR_GUEST is not set. Patch also renames gdt_page to cpu_gdt (mimicking naming of existing cpu_tss per-cpu variable), since now it is not always a full page. $ objdump -x vmlinux | grep .data..percpu | sort Before: (offset) (size) 0000000000000000 w O .data..percpu 0000000000004000 irq_stack_union 0000000000004000 w O .data..percpu 0000000000005000 exception_stacks 0000000000009000 w O .data..percpu 0000000000001000 gdt_page <<< HERE 000000000000a000 w O .data..percpu 0000000000000008 espfix_waddr 000000000000a008 w O .data..percpu 0000000000000008 espfix_stack ... 0000000000019398 g .data..percpu 0000000000000000 __per_cpu_end After: 0000000000000000 w O .data..percpu 0000000000004000 irq_stack_union 0000000000004000 w O .data..percpu 0000000000005000 exception_stacks 0000000000009000 w O .data..percpu 0000000000000008 espfix_waddr 0000000000009008 w O .data..percpu 0000000000000008 espfix_stack ... 0000000000013c80 w O .data..percpu 0000000000000040 cyc2ns 0000000000013cc0 w O .data..percpu 00000000000022c0 cpu_tss 0000000000015f80 w O .data..percpu 0000000000000080 cpu_gdt <<< HERE 0000000000016000 w O .data..percpu 0000000000000018 cpu_tlbstate ... 0000000000018418 g .data..percpu 0000000000000000 __per_cpu_end Run-tested on a 144 CPU machine. Signed-off-by: Denys Vlasenko <dvlasenk@xxxxxxxxxx> CC: Ingo Molnar <mingo@xxxxxxxxxx> CC: H. Peter Anvin <hpa@xxxxxxxxx> CC: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx> CC: Boris Ostrovsky <boris.ostrovsky@xxxxxxxxxx> CC: David Vrabel <david.vrabel@xxxxxxxxxx> CC: Joerg Roedel <joro@xxxxxxxxxx> CC: Gleb Natapov <gleb@xxxxxxxxxx> CC: Paolo Bonzini <pbonzini@xxxxxxxxxx> CC: kvm@xxxxxxxxxxxxxxx CC: x86@xxxxxxxxxx CC: linux-kernel@xxxxxxxxxxxxxxx --- arch/x86/entry/entry_32.S | 2 +- arch/x86/include/asm/desc.h | 16 +++++++++++----- arch/x86/kernel/cpu/common.c | 10 ++++++++-- arch/x86/kernel/cpu/perf_event.c | 2 +- arch/x86/kernel/head_32.S | 4 ++-- arch/x86/kernel/head_64.S | 2 +- arch/x86/kernel/vmlinux.lds.S | 2 +- arch/x86/tools/relocs.c | 2 +- arch/x86/xen/enlighten.c | 4 ++-- 9 files changed, 28 insertions(+), 16 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index b2909bf..bc6ae1c 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -429,7 +429,7 @@ ldt_ss: * compensating for the offset by changing to the ESPFIX segment with * a base address that matches for the difference. */ -#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) +#define GDT_ESPFIX_SS PER_CPU_VAR(cpu_gdt) + (GDT_ENTRY_ESPFIX_SS * 8) mov %esp, %edx /* load kernel esp */ mov PT_OLDESP(%esp), %eax /* load userspace esp */ mov %dx, %ax /* eax: new kernel esp */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 4e10d73..76de300 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -39,15 +39,21 @@ extern gate_desc idt_table[]; extern struct desc_ptr debug_idt_descr; extern gate_desc debug_idt_table[]; -struct gdt_page { +struct cpu_gdt { struct desc_struct gdt[GDT_ENTRIES]; -} __attribute__((aligned(PAGE_SIZE))); - -DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); +} +#ifdef CONFIG_HYPERVISOR_GUEST +/* Xen et al want GDT to have its own page. They remap it read-only */ +__attribute__((aligned(PAGE_SIZE))); +DECLARE_PER_CPU_PAGE_ALIGNED(struct cpu_gdt, cpu_gdt); +#else +____cacheline_aligned; +DECLARE_PER_CPU_ALIGNED(struct cpu_gdt, cpu_gdt); +#endif static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) { - return per_cpu(gdt_page, cpu).gdt; + return per_cpu(cpu_gdt, cpu).gdt; } #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index de22ea7..6b90785 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -92,7 +92,13 @@ static const struct cpu_dev default_cpu = { static const struct cpu_dev *this_cpu = &default_cpu; -DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { +#ifdef CONFIG_HYPERVISOR_GUEST +/* Xen et al want GDT to have its own page. They remap it read-only */ +DEFINE_PER_CPU_PAGE_ALIGNED(struct cpu_gdt, cpu_gdt) = +#else +DEFINE_PER_CPU_ALIGNED(struct cpu_gdt, cpu_gdt) = +#endif +{ .gdt = { #ifdef CONFIG_X86_64 /* * We need valid kernel segments for data and code in long mode too @@ -144,7 +150,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { GDT_STACK_CANARY_INIT #endif } }; -EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); +EXPORT_PER_CPU_SYMBOL_GPL(cpu_gdt); static int __init x86_mpx_setup(char *s) { diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 66dd3fe9..41ebc94 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2198,7 +2198,7 @@ static unsigned long get_segment_base(unsigned int segment) if (idx > GDT_ENTRIES) return 0; - desc = raw_cpu_ptr(gdt_page.gdt) + idx; + desc = raw_cpu_ptr(cpu_gdt.gdt) + idx; } return get_desc_base(desc); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 0e2d96f..33c9d10 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -521,7 +521,7 @@ setup_once: * relocation. Manually set base address in stack canary * segment descriptor. */ - movl $gdt_page,%eax + movl $cpu_gdt,%eax movl $stack_canary,%ecx movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) shrl $16, %ecx @@ -758,7 +758,7 @@ idt_descr: .word 0 # 32 bit align gdt_desc.address ENTRY(early_gdt_descr) .word GDT_ENTRIES*8-1 - .long gdt_page /* Overwritten for secondary CPUs */ + .long cpu_gdt /* Overwritten for secondary CPUs */ /* * The boot_gdt must mirror the equivalent in setup.S and is diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 1d40ca8..5a80e8c 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -510,7 +510,7 @@ NEXT_PAGE(level1_fixmap_pgt) early_gdt_descr: .word GDT_ENTRIES*8-1 early_gdt_descr_base: - .quad INIT_PER_CPU_VAR(gdt_page) + .quad INIT_PER_CPU_VAR(cpu_gdt) ENTRY(phys_base) /* This must match the first entry in level2_kernel_pgt */ diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 74e4bf1..198e46b 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -348,7 +348,7 @@ SECTIONS * for the boot processor. */ #define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load -INIT_PER_CPU(gdt_page); +INIT_PER_CPU(cpu_gdt); INIT_PER_CPU(irq_stack_union); /* diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 0c2fae8..5a1da82 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -736,7 +736,7 @@ static void percpu_init(void) * * The "gold" linker incorrectly associates: * init_per_cpu__irq_stack_union - * init_per_cpu__gdt_page + * init_per_cpu__cpu_gdt */ static int is_percpu_sym(ElfW(Sym) *sym, const char *symname) { diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 30d12af..baecfc6 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1592,10 +1592,10 @@ asmlinkage __visible void __init xen_start_kernel(void) /* * The only reliable way to retain the initial address of the - * percpu gdt_page is to remember it here, so we can go and + * percpu cpu_gdt is to remember it here, so we can go and * mark it RW later, when the initial percpu area is freed. */ - xen_initial_gdt = &per_cpu(gdt_page, 0); + xen_initial_gdt = &per_cpu(cpu_gdt, 0); xen_smp_init(); -- 1.8.1.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html