[PATCH] x86: Use entire page for the per-cpu GDT only if paravirt-enabled

Denys Vlasenko <dvlasenk@xxxxxxxxxx> · Sat, 26 Sep 2015 20:00:40 +0200

We have our GDT in a page-sized per-cpu structure, gdt_page.

On x86_64 kernel, GDT is 128 bytes - only ~3% of that page is used.

It is page-sized because of paravirt. Hypervisors need to know when
GDT is changed, so they remap it read-only and handle write faults.
If it's not in its own page, other writes nearby will cause
those faults too.

In other words, we need GDT to live in a separate page
only if CONFIG_HYPERVISOR_GUEST=y.

This patch reduces GDT alignment to cacheline-aligned
if CONFIG_HYPERVISOR_GUEST is not set.

Patch also renames gdt_page to cpu_gdt (mimicking naming of existing
cpu_tss per-cpu variable), since now it is not always a full page.

    $ objdump -x vmlinux | grep .data..percpu | sort
Before:
    (offset)                                (size)
    0000000000000000  w    O .data..percpu  0000000000004000 irq_stack_union
    0000000000004000  w    O .data..percpu  0000000000005000 exception_stacks
    0000000000009000  w    O .data..percpu  0000000000001000 gdt_page  <<< HERE
    000000000000a000  w    O .data..percpu  0000000000000008 espfix_waddr
    000000000000a008  w    O .data..percpu  0000000000000008 espfix_stack
    ...
    0000000000019398 g       .data..percpu  0000000000000000 __per_cpu_end
After:
    0000000000000000  w    O .data..percpu  0000000000004000 irq_stack_union
    0000000000004000  w    O .data..percpu  0000000000005000 exception_stacks
    0000000000009000  w    O .data..percpu  0000000000000008 espfix_waddr
    0000000000009008  w    O .data..percpu  0000000000000008 espfix_stack
    ...
    0000000000013c80  w    O .data..percpu  0000000000000040 cyc2ns
    0000000000013cc0  w    O .data..percpu  00000000000022c0 cpu_tss
    0000000000015f80  w    O .data..percpu  0000000000000080 cpu_gdt  <<< HERE
    0000000000016000  w    O .data..percpu  0000000000000018 cpu_tlbstate
    ...
    0000000000018418 g       .data..percpu  0000000000000000 __per_cpu_end

Run-tested on a 144 CPU machine.

Signed-off-by: Denys Vlasenko <dvlasenk@xxxxxxxxxx>
CC: Ingo Molnar <mingo@xxxxxxxxxx>
CC: H. Peter Anvin <hpa@xxxxxxxxx>
CC: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
CC: Boris Ostrovsky <boris.ostrovsky@xxxxxxxxxx>
CC: David Vrabel <david.vrabel@xxxxxxxxxx>
CC: Joerg Roedel <joro@xxxxxxxxxx>
CC: Gleb Natapov <gleb@xxxxxxxxxx>
CC: Paolo Bonzini <pbonzini@xxxxxxxxxx>
CC: kvm@xxxxxxxxxxxxxxx
CC: x86@xxxxxxxxxx
CC: linux-kernel@xxxxxxxxxxxxxxx
---
 arch/x86/entry/entry_32.S        |  2 +-
 arch/x86/include/asm/desc.h      | 16 +++++++++++-----
 arch/x86/kernel/cpu/common.c     | 10 ++++++++--
 arch/x86/kernel/cpu/perf_event.c |  2 +-
 arch/x86/kernel/head_32.S        |  4 ++--
 arch/x86/kernel/head_64.S        |  2 +-
 arch/x86/kernel/vmlinux.lds.S    |  2 +-
 arch/x86/tools/relocs.c          |  2 +-
 arch/x86/xen/enlighten.c         |  4 ++--
 9 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index b2909bf..bc6ae1c 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -429,7 +429,7 @@ ldt_ss:
  * compensating for the offset by changing to the ESPFIX segment with
  * a base address that matches for the difference.
  */
-#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
+#define GDT_ESPFIX_SS PER_CPU_VAR(cpu_gdt) + (GDT_ENTRY_ESPFIX_SS * 8)
 	mov	%esp, %edx			/* load kernel esp */
 	mov	PT_OLDESP(%esp), %eax		/* load userspace esp */
 	mov	%dx, %ax			/* eax: new kernel esp */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 4e10d73..76de300 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -39,15 +39,21 @@ extern gate_desc idt_table[];
 extern struct desc_ptr debug_idt_descr;
 extern gate_desc debug_idt_table[];
 
-struct gdt_page {
+struct cpu_gdt {
 	struct desc_struct gdt[GDT_ENTRIES];
-} __attribute__((aligned(PAGE_SIZE)));
-
-DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
+}
+#ifdef CONFIG_HYPERVISOR_GUEST
+/* Xen et al want GDT to have its own page. They remap it read-only */
+__attribute__((aligned(PAGE_SIZE)));
+DECLARE_PER_CPU_PAGE_ALIGNED(struct cpu_gdt, cpu_gdt);
+#else
+____cacheline_aligned;
+DECLARE_PER_CPU_ALIGNED(struct cpu_gdt, cpu_gdt);
+#endif
 
 static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
 {
-	return per_cpu(gdt_page, cpu).gdt;
+	return per_cpu(cpu_gdt, cpu).gdt;
 }
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index de22ea7..6b90785 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -92,7 +92,13 @@ static const struct cpu_dev default_cpu = {
 
 static const struct cpu_dev *this_cpu = &default_cpu;
 
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+#ifdef CONFIG_HYPERVISOR_GUEST
+/* Xen et al want GDT to have its own page. They remap it read-only */
+DEFINE_PER_CPU_PAGE_ALIGNED(struct cpu_gdt, cpu_gdt) =
+#else
+DEFINE_PER_CPU_ALIGNED(struct cpu_gdt, cpu_gdt) =
+#endif
+{ .gdt = {
 #ifdef CONFIG_X86_64
 	/*
 	 * We need valid kernel segments for data and code in long mode too
@@ -144,7 +150,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 	GDT_STACK_CANARY_INIT
 #endif
 } };
-EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
+EXPORT_PER_CPU_SYMBOL_GPL(cpu_gdt);
 
 static int __init x86_mpx_setup(char *s)
 {
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 66dd3fe9..41ebc94 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -2198,7 +2198,7 @@ static unsigned long get_segment_base(unsigned int segment)
 		if (idx > GDT_ENTRIES)
 			return 0;
 
-		desc = raw_cpu_ptr(gdt_page.gdt) + idx;
+		desc = raw_cpu_ptr(cpu_gdt.gdt) + idx;
 	}
 
 	return get_desc_base(desc);
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 0e2d96f..33c9d10 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -521,7 +521,7 @@ setup_once:
 	 * relocation.  Manually set base address in stack canary
 	 * segment descriptor.
 	 */
-	movl $gdt_page,%eax
+	movl $cpu_gdt,%eax
 	movl $stack_canary,%ecx
 	movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
 	shrl $16, %ecx
@@ -758,7 +758,7 @@ idt_descr:
 	.word 0				# 32 bit align gdt_desc.address
 ENTRY(early_gdt_descr)
 	.word GDT_ENTRIES*8-1
-	.long gdt_page			/* Overwritten for secondary CPUs */
+	.long cpu_gdt			/* Overwritten for secondary CPUs */
 
 /*
  * The boot_gdt must mirror the equivalent in setup.S and is
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 1d40ca8..5a80e8c 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -510,7 +510,7 @@ NEXT_PAGE(level1_fixmap_pgt)
 early_gdt_descr:
 	.word	GDT_ENTRIES*8-1
 early_gdt_descr_base:
-	.quad	INIT_PER_CPU_VAR(gdt_page)
+	.quad	INIT_PER_CPU_VAR(cpu_gdt)
 
 ENTRY(phys_base)
 	/* This must match the first entry in level2_kernel_pgt */
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 74e4bf1..198e46b 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -348,7 +348,7 @@ SECTIONS
  * for the boot processor.
  */
 #define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load
-INIT_PER_CPU(gdt_page);
+INIT_PER_CPU(cpu_gdt);
 INIT_PER_CPU(irq_stack_union);
 
 /*
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index 0c2fae8..5a1da82 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -736,7 +736,7 @@ static void percpu_init(void)
  *
  * The "gold" linker incorrectly associates:
  *	init_per_cpu__irq_stack_union
- *	init_per_cpu__gdt_page
+ *	init_per_cpu__cpu_gdt
  */
 static int is_percpu_sym(ElfW(Sym) *sym, const char *symname)
 {
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 30d12af..baecfc6 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1592,10 +1592,10 @@ asmlinkage __visible void __init xen_start_kernel(void)
 
 	/*
 	 * The only reliable way to retain the initial address of the
-	 * percpu gdt_page is to remember it here, so we can go and
+	 * percpu cpu_gdt is to remember it here, so we can go and
 	 * mark it RW later, when the initial percpu area is freed.
 	 */
-	xen_initial_gdt = &per_cpu(gdt_page, 0);
+	xen_initial_gdt = &per_cpu(cpu_gdt, 0);
 
 	xen_smp_init();
 
-- 
1.8.1.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html