Re: [PATCH] x86: Use entire page for the per-cpu GDT only if paravirt-enabled

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



NAK.  We really should map the GDT read-only on all 64 bit systems, since we can't hide the address from SLDT.  Same with the IDT.

On September 26, 2015 11:00:40 AM PDT, Denys Vlasenko <dvlasenk@xxxxxxxxxx> wrote:
>We have our GDT in a page-sized per-cpu structure, gdt_page.
>
>On x86_64 kernel, GDT is 128 bytes - only ~3% of that page is used.
>
>It is page-sized because of paravirt. Hypervisors need to know when
>GDT is changed, so they remap it read-only and handle write faults.
>If it's not in its own page, other writes nearby will cause
>those faults too.
>
>In other words, we need GDT to live in a separate page
>only if CONFIG_HYPERVISOR_GUEST=y.
>
>This patch reduces GDT alignment to cacheline-aligned
>if CONFIG_HYPERVISOR_GUEST is not set.
>
>Patch also renames gdt_page to cpu_gdt (mimicking naming of existing
>cpu_tss per-cpu variable), since now it is not always a full page.
>
>    $ objdump -x vmlinux | grep .data..percpu | sort
>Before:
>    (offset)                                (size)
>0000000000000000  w    O .data..percpu  0000000000004000 irq_stack_union
>0000000000004000  w    O .data..percpu  0000000000005000 exception_stacks
>0000000000009000  w    O .data..percpu  0000000000001000 gdt_page  <<< HERE
>  000000000000a000  w    O .data..percpu  0000000000000008 espfix_waddr
>  000000000000a008  w    O .data..percpu  0000000000000008 espfix_stack
>    ...
> 0000000000019398 g       .data..percpu  0000000000000000 __per_cpu_end
>After:
>0000000000000000  w    O .data..percpu  0000000000004000 irq_stack_union
>0000000000004000  w    O .data..percpu  0000000000005000 exception_stacks
>  0000000000009000  w    O .data..percpu  0000000000000008 espfix_waddr
>  0000000000009008  w    O .data..percpu  0000000000000008 espfix_stack
>    ...
>    0000000000013c80  w    O .data..percpu  0000000000000040 cyc2ns
>    0000000000013cc0  w    O .data..percpu  00000000000022c0 cpu_tss
>0000000000015f80  w    O .data..percpu  0000000000000080 cpu_gdt  <<< HERE
>  0000000000016000  w    O .data..percpu  0000000000000018 cpu_tlbstate
>    ...
> 0000000000018418 g       .data..percpu  0000000000000000 __per_cpu_end
>
>Run-tested on a 144 CPU machine.
>
>Signed-off-by: Denys Vlasenko <dvlasenk@xxxxxxxxxx>
>CC: Ingo Molnar <mingo@xxxxxxxxxx>
>CC: H. Peter Anvin <hpa@xxxxxxxxx>
>CC: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
>CC: Boris Ostrovsky <boris.ostrovsky@xxxxxxxxxx>
>CC: David Vrabel <david.vrabel@xxxxxxxxxx>
>CC: Joerg Roedel <joro@xxxxxxxxxx>
>CC: Gleb Natapov <gleb@xxxxxxxxxx>
>CC: Paolo Bonzini <pbonzini@xxxxxxxxxx>
>CC: kvm@xxxxxxxxxxxxxxx
>CC: x86@xxxxxxxxxx
>CC: linux-kernel@xxxxxxxxxxxxxxx
>---
> arch/x86/entry/entry_32.S        |  2 +-
> arch/x86/include/asm/desc.h      | 16 +++++++++++-----
> arch/x86/kernel/cpu/common.c     | 10 ++++++++--
> arch/x86/kernel/cpu/perf_event.c |  2 +-
> arch/x86/kernel/head_32.S        |  4 ++--
> arch/x86/kernel/head_64.S        |  2 +-
> arch/x86/kernel/vmlinux.lds.S    |  2 +-
> arch/x86/tools/relocs.c          |  2 +-
> arch/x86/xen/enlighten.c         |  4 ++--
> 9 files changed, 28 insertions(+), 16 deletions(-)
>
>diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
>index b2909bf..bc6ae1c 100644
>--- a/arch/x86/entry/entry_32.S
>+++ b/arch/x86/entry/entry_32.S
>@@ -429,7 +429,7 @@ ldt_ss:
>  * compensating for the offset by changing to the ESPFIX segment with
>  * a base address that matches for the difference.
>  */
>-#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
>+#define GDT_ESPFIX_SS PER_CPU_VAR(cpu_gdt) + (GDT_ENTRY_ESPFIX_SS * 8)
> 	mov	%esp, %edx			/* load kernel esp */
> 	mov	PT_OLDESP(%esp), %eax		/* load userspace esp */
> 	mov	%dx, %ax			/* eax: new kernel esp */
>diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
>index 4e10d73..76de300 100644
>--- a/arch/x86/include/asm/desc.h
>+++ b/arch/x86/include/asm/desc.h
>@@ -39,15 +39,21 @@ extern gate_desc idt_table[];
> extern struct desc_ptr debug_idt_descr;
> extern gate_desc debug_idt_table[];
> 
>-struct gdt_page {
>+struct cpu_gdt {
> 	struct desc_struct gdt[GDT_ENTRIES];
>-} __attribute__((aligned(PAGE_SIZE)));
>-
>-DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
>+}
>+#ifdef CONFIG_HYPERVISOR_GUEST
>+/* Xen et al want GDT to have its own page. They remap it read-only */
>+__attribute__((aligned(PAGE_SIZE)));
>+DECLARE_PER_CPU_PAGE_ALIGNED(struct cpu_gdt, cpu_gdt);
>+#else
>+____cacheline_aligned;
>+DECLARE_PER_CPU_ALIGNED(struct cpu_gdt, cpu_gdt);
>+#endif
> 
> static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
> {
>-	return per_cpu(gdt_page, cpu).gdt;
>+	return per_cpu(cpu_gdt, cpu).gdt;
> }
> 
> #ifdef CONFIG_X86_64
>diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
>index de22ea7..6b90785 100644
>--- a/arch/x86/kernel/cpu/common.c
>+++ b/arch/x86/kernel/cpu/common.c
>@@ -92,7 +92,13 @@ static const struct cpu_dev default_cpu = {
> 
> static const struct cpu_dev *this_cpu = &default_cpu;
> 
>-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
>+#ifdef CONFIG_HYPERVISOR_GUEST
>+/* Xen et al want GDT to have its own page. They remap it read-only */
>+DEFINE_PER_CPU_PAGE_ALIGNED(struct cpu_gdt, cpu_gdt) =
>+#else
>+DEFINE_PER_CPU_ALIGNED(struct cpu_gdt, cpu_gdt) =
>+#endif
>+{ .gdt = {
> #ifdef CONFIG_X86_64
> 	/*
> 	 * We need valid kernel segments for data and code in long mode too
>@@ -144,7 +150,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
> 	GDT_STACK_CANARY_INIT
> #endif
> } };
>-EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
>+EXPORT_PER_CPU_SYMBOL_GPL(cpu_gdt);
> 
> static int __init x86_mpx_setup(char *s)
> {
>diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
>index 66dd3fe9..41ebc94 100644
>--- a/arch/x86/kernel/cpu/perf_event.c
>+++ b/arch/x86/kernel/cpu/perf_event.c
>@@ -2198,7 +2198,7 @@ static unsigned long get_segment_base(unsigned int segment)
> 		if (idx > GDT_ENTRIES)
> 			return 0;
> 
>-		desc = raw_cpu_ptr(gdt_page.gdt) + idx;
>+		desc = raw_cpu_ptr(cpu_gdt.gdt) + idx;
> 	}
> 
> 	return get_desc_base(desc);
>diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
>index 0e2d96f..33c9d10 100644
>--- a/arch/x86/kernel/head_32.S
>+++ b/arch/x86/kernel/head_32.S
>@@ -521,7 +521,7 @@ setup_once:
> 	 * relocation.  Manually set base address in stack canary
> 	 * segment descriptor.
> 	 */
>-	movl $gdt_page,%eax
>+	movl $cpu_gdt,%eax
> 	movl $stack_canary,%ecx
> 	movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
> 	shrl $16, %ecx
>@@ -758,7 +758,7 @@ idt_descr:
> 	.word 0				# 32 bit align gdt_desc.address
> ENTRY(early_gdt_descr)
> 	.word GDT_ENTRIES*8-1
>-	.long gdt_page			/* Overwritten for secondary CPUs */
>+	.long cpu_gdt			/* Overwritten for secondary CPUs */
> 
> /*
>  * The boot_gdt must mirror the equivalent in setup.S and is
>diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
>index 1d40ca8..5a80e8c 100644
>--- a/arch/x86/kernel/head_64.S
>+++ b/arch/x86/kernel/head_64.S
>@@ -510,7 +510,7 @@ NEXT_PAGE(level1_fixmap_pgt)
> early_gdt_descr:
> 	.word	GDT_ENTRIES*8-1
> early_gdt_descr_base:
>-	.quad	INIT_PER_CPU_VAR(gdt_page)
>+	.quad	INIT_PER_CPU_VAR(cpu_gdt)
> 
> ENTRY(phys_base)
> 	/* This must match the first entry in level2_kernel_pgt */
>diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
>index 74e4bf1..198e46b 100644
>--- a/arch/x86/kernel/vmlinux.lds.S
>+++ b/arch/x86/kernel/vmlinux.lds.S
>@@ -348,7 +348,7 @@ SECTIONS
>  * for the boot processor.
>  */
> #define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load
>-INIT_PER_CPU(gdt_page);
>+INIT_PER_CPU(cpu_gdt);
> INIT_PER_CPU(irq_stack_union);
> 
> /*
>diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
>index 0c2fae8..5a1da82 100644
>--- a/arch/x86/tools/relocs.c
>+++ b/arch/x86/tools/relocs.c
>@@ -736,7 +736,7 @@ static void percpu_init(void)
>  *
>  * The "gold" linker incorrectly associates:
>  *	init_per_cpu__irq_stack_union
>- *	init_per_cpu__gdt_page
>+ *	init_per_cpu__cpu_gdt
>  */
> static int is_percpu_sym(ElfW(Sym) *sym, const char *symname)
> {
>diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
>index 30d12af..baecfc6 100644
>--- a/arch/x86/xen/enlighten.c
>+++ b/arch/x86/xen/enlighten.c
>@@ -1592,10 +1592,10 @@ asmlinkage __visible void __init xen_start_kernel(void)
> 
> 	/*
> 	 * The only reliable way to retain the initial address of the
>-	 * percpu gdt_page is to remember it here, so we can go and
>+	 * percpu cpu_gdt is to remember it here, so we can go and
> 	 * mark it RW later, when the initial percpu area is freed.
> 	 */
>-	xen_initial_gdt = &per_cpu(gdt_page, 0);
>+	xen_initial_gdt = &per_cpu(cpu_gdt, 0);
> 
> 	xen_smp_init();
> 

-- 
Sent from my Android device with K-9 Mail. Please excuse my brevity.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux