NAK. We really should map the GDT read-only on all 64-bit systems,
since we can't hide the address from SGDT. Same with the IDT.
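
For context on why the address can't be hidden: SGDT stores the GDTR
(limit + base) to memory, and it is not a privileged instruction on
x86, so any user-space program can read the GDT's linear address. A
minimal sketch, assuming pre-UMIP x86-64 hardware (the file name is
hypothetical):

	/* sgdt_demo.c: build with gcc -O2 sgdt_demo.c -o sgdt_demo.
	 * On pre-UMIP x86-64 this runs unprivileged and prints the
	 * linear base address of the current CPU's GDT. */
	#include <stdio.h>
	#include <inttypes.h>

	struct __attribute__((packed)) desc_ptr {
		uint16_t limit;
		uint64_t base;	/* linear base address of the GDT */
	};

	int main(void)
	{
		struct desc_ptr gdtr;

		/* SGDT writes the 10-byte GDTR (2-byte limit, 8-byte
		 * base) to memory and executes fine from ring 3. */
		asm volatile("sgdt %0" : "=m" (gdtr));

		printf("GDT base: 0x%016" PRIx64 ", limit: %u\n",
		       gdtr.base, (unsigned int)gdtr.limit);
		return 0;
	}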
On September 26, 2015 11:00:40 AM PDT, Denys Vlasenko <dvlasenk@xxxxxxxxxx> wrote:
>We have our GDT in a page-sized per-cpu structure, gdt_page.
>
>On an x86_64 kernel, the GDT is 128 bytes - only ~3% of that page is used.
>
>It is page-sized because of paravirt. Hypervisors need to know when the
>GDT is changed, so they remap it read-only and handle write faults.
>If it's not in its own page, other writes nearby will cause those
>faults too.
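
To illustrate the write-fault point: under Xen PV the guest's GDT
frames are mapped read-only, so the paravirt write_gdt_entry hook
cannot simply store the new descriptor; it asks the hypervisor to do
the update. A simplified sketch modeled on xen_write_gdt_entry() in
arch/x86/xen/enlighten.c -- the LDT/TSS special cases and preemption
handling are trimmed here:

	static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
					const void *desc, int type)
	{
		/*
		 * The GDT page is mapped read-only by the hypervisor,
		 * so a direct store would fault.  Translate the entry's
		 * virtual address to a machine address and have Xen
		 * update the descriptor on the guest's behalf.
		 */
		xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);

		if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
			BUG();
	}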
>In other words, we need the GDT to live in a separate page
>only if CONFIG_HYPERVISOR_GUEST=y.
>
>This patch reduces GDT alignment to cacheline-aligned
>if CONFIG_HYPERVISOR_GUEST is not set.
>
>The patch also renames gdt_page to cpu_gdt (mimicking the naming of
>the existing cpu_tss per-cpu variable), since it is no longer always
>a full page.
>
> $ objdump -x vmlinux | grep .data..percpu | sort
>Before:
> (offset)                          (size)
> 0000000000000000 w O .data..percpu 0000000000004000 irq_stack_union
> 0000000000004000 w O .data..percpu 0000000000005000 exception_stacks
> 0000000000009000 w O .data..percpu 0000000000001000 gdt_page  <<< HERE
> 000000000000a000 w O .data..percpu 0000000000000008 espfix_waddr
> 000000000000a008 w O .data..percpu 0000000000000008 espfix_stack
> ...
> 0000000000019398 g    .data..percpu 0000000000000000 __per_cpu_end
>After:
> 0000000000000000 w O .data..percpu 0000000000004000 irq_stack_union
> 0000000000004000 w O .data..percpu 0000000000005000 exception_stacks
> 0000000000009000 w O .data..percpu 0000000000000008 espfix_waddr
> 0000000000009008 w O .data..percpu 0000000000000008 espfix_stack
> ...
> 0000000000013c80 w O .data..percpu 0000000000000040 cyc2ns
> 0000000000013cc0 w O .data..percpu 00000000000022c0 cpu_tss
> 0000000000015f80 w O .data..percpu 0000000000000080 cpu_gdt  <<< HERE
> 0000000000016000 w O .data..percpu 0000000000000018 cpu_tlbstate
> ...
> 0000000000018418 g    .data..percpu 0000000000000000 __per_cpu_end
>
>Run-tested on a 144-CPU machine.
>
>Signed-off-by: Denys Vlasenko <dvlasenk@xxxxxxxxxx>
>CC: Ingo Molnar <mingo@xxxxxxxxxx>
>CC: H. Peter Anvin <hpa@xxxxxxxxx>
>CC: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
>CC: Boris Ostrovsky <boris.ostrovsky@xxxxxxxxxx>
>CC: David Vrabel <david.vrabel@xxxxxxxxxx>
>CC: Joerg Roedel <joro@xxxxxxxxxx>
>CC: Gleb Natapov <gleb@xxxxxxxxxx>
>CC: Paolo Bonzini <pbonzini@xxxxxxxxxx>
>CC: kvm@xxxxxxxxxxxxxxx
>CC: x86@xxxxxxxxxx
>CC: linux-kernel@xxxxxxxxxxxxxxx
>---
> arch/x86/entry/entry_32.S        |  2 +-
> arch/x86/include/asm/desc.h      | 16 +++++++++++-----
> arch/x86/kernel/cpu/common.c     | 10 ++++++++--
> arch/x86/kernel/cpu/perf_event.c |  2 +-
> arch/x86/kernel/head_32.S        |  4 ++--
> arch/x86/kernel/head_64.S        |  2 +-
> arch/x86/kernel/vmlinux.lds.S    |  2 +-
> arch/x86/tools/relocs.c          |  2 +-
> arch/x86/xen/enlighten.c         |  4 ++--
> 9 files changed, 28 insertions(+), 16 deletions(-)
>
>diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
>index b2909bf..bc6ae1c 100644
>--- a/arch/x86/entry/entry_32.S
>+++ b/arch/x86/entry/entry_32.S
>@@ -429,7 +429,7 @@ ldt_ss:
> * compensating for the offset by changing to the ESPFIX segment with
> * a base address that matches for the difference.
> */
>-#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
>+#define GDT_ESPFIX_SS PER_CPU_VAR(cpu_gdt) + (GDT_ENTRY_ESPFIX_SS * 8)
> mov %esp, %edx /* load kernel esp */
> mov PT_OLDESP(%esp), %eax /* load userspace esp */
> mov %dx, %ax /* eax: new kernel esp */
>diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
>index 4e10d73..76de300 100644
>--- a/arch/x86/include/asm/desc.h
>+++ b/arch/x86/include/asm/desc.h
>@@ -39,15 +39,21 @@ extern gate_desc idt_table[];
> extern struct desc_ptr debug_idt_descr;
> extern gate_desc debug_idt_table[];
>
>-struct gdt_page {
>+struct cpu_gdt {
> struct desc_struct gdt[GDT_ENTRIES];
>-} __attribute__((aligned(PAGE_SIZE)));
>-
>-DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
>+}
>+#ifdef CONFIG_HYPERVISOR_GUEST
>+/* Xen et al want GDT to have its own page. They remap it read-only */
>+__attribute__((aligned(PAGE_SIZE)));
>+DECLARE_PER_CPU_PAGE_ALIGNED(struct cpu_gdt, cpu_gdt);
>+#else
>+____cacheline_aligned;
>+DECLARE_PER_CPU_ALIGNED(struct cpu_gdt, cpu_gdt);
>+#endif
>
> static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
> {
>- return per_cpu(gdt_page, cpu).gdt;
>+ return per_cpu(cpu_gdt, cpu).gdt;
> }
>
> #ifdef CONFIG_X86_64
>diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
>index de22ea7..6b90785 100644
>--- a/arch/x86/kernel/cpu/common.c
>+++ b/arch/x86/kernel/cpu/common.c
>@@ -92,7 +92,13 @@ static const struct cpu_dev default_cpu = {
>
> static const struct cpu_dev *this_cpu = &default_cpu;
>
>-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
>+#ifdef CONFIG_HYPERVISOR_GUEST
>+/* Xen et al want GDT to have its own page. They remap it read-only */
>+DEFINE_PER_CPU_PAGE_ALIGNED(struct cpu_gdt, cpu_gdt) =
>+#else
>+DEFINE_PER_CPU_ALIGNED(struct cpu_gdt, cpu_gdt) =
>+#endif
>+{ .gdt = {
> #ifdef CONFIG_X86_64
> /*
> * We need valid kernel segments for data and code in long mode too
>@@ -144,7 +150,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
> GDT_STACK_CANARY_INIT
> #endif
> } };
>-EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
>+EXPORT_PER_CPU_SYMBOL_GPL(cpu_gdt);
>
> static int __init x86_mpx_setup(char *s)
> {
>diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
>index 66dd3fe9..41ebc94 100644
>--- a/arch/x86/kernel/cpu/perf_event.c
>+++ b/arch/x86/kernel/cpu/perf_event.c
>@@ -2198,7 +2198,7 @@ static unsigned long get_segment_base(unsigned int segment)
> if (idx > GDT_ENTRIES)
> return 0;
>
>- desc = raw_cpu_ptr(gdt_page.gdt) + idx;
>+ desc = raw_cpu_ptr(cpu_gdt.gdt) + idx;
> }
>
> return get_desc_base(desc);
>diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
>index 0e2d96f..33c9d10 100644
>--- a/arch/x86/kernel/head_32.S
>+++ b/arch/x86/kernel/head_32.S
>@@ -521,7 +521,7 @@ setup_once:
> * relocation. Manually set base address in stack canary
> * segment descriptor.
> */
>- movl $gdt_page,%eax
>+ movl $cpu_gdt,%eax
> movl $stack_canary,%ecx
> movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
> shrl $16, %ecx
>@@ -758,7 +758,7 @@ idt_descr:
> .word 0 # 32 bit align gdt_desc.address
> ENTRY(early_gdt_descr)
> .word GDT_ENTRIES*8-1
>- .long gdt_page /* Overwritten for secondary CPUs */
>+ .long cpu_gdt /* Overwritten for secondary CPUs */
>
> /*
> * The boot_gdt must mirror the equivalent in setup.S and is
>diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
>index 1d40ca8..5a80e8c 100644
>--- a/arch/x86/kernel/head_64.S
>+++ b/arch/x86/kernel/head_64.S
>@@ -510,7 +510,7 @@ NEXT_PAGE(level1_fixmap_pgt)
> early_gdt_descr:
> .word GDT_ENTRIES*8-1
> early_gdt_descr_base:
>- .quad INIT_PER_CPU_VAR(gdt_page)
>+ .quad INIT_PER_CPU_VAR(cpu_gdt)
>
> ENTRY(phys_base)
> /* This must match the first entry in level2_kernel_pgt */
>diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
>index 74e4bf1..198e46b 100644
>--- a/arch/x86/kernel/vmlinux.lds.S
>+++ b/arch/x86/kernel/vmlinux.lds.S
>@@ -348,7 +348,7 @@ SECTIONS
> * for the boot processor.
> */
> #define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load
>-INIT_PER_CPU(gdt_page);
>+INIT_PER_CPU(cpu_gdt);
> INIT_PER_CPU(irq_stack_union);
>
> /*
>diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
>index 0c2fae8..5a1da82 100644
>--- a/arch/x86/tools/relocs.c
>+++ b/arch/x86/tools/relocs.c
>@@ -736,7 +736,7 @@ static void percpu_init(void)
> *
> * The "gold" linker incorrectly associates:
> * init_per_cpu__irq_stack_union
>- * init_per_cpu__gdt_page
>+ * init_per_cpu__cpu_gdt
> */
> static int is_percpu_sym(ElfW(Sym) *sym, const char *symname)
> {
>diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
>index 30d12af..baecfc6 100644
>--- a/arch/x86/xen/enlighten.c
>+++ b/arch/x86/xen/enlighten.c
>@@ -1592,10 +1592,10 @@ asmlinkage __visible void __init xen_start_kernel(void)
>
> /*
> * The only reliable way to retain the initial address of the
>- * percpu gdt_page is to remember it here, so we can go and
>+ * percpu cpu_gdt is to remember it here, so we can go and
> * mark it RW later, when the initial percpu area is freed.
> */
>- xen_initial_gdt = &per_cpu(gdt_page, 0);
>+ xen_initial_gdt = &per_cpu(cpu_gdt, 0);
>
> xen_smp_init();
>

-- 
Sent from my Android device with K-9 Mail. Please excuse my brevity.