[ The patch is based on my boot-time switching patchset and would not apply directly to current upstream, but I would appreciate early feedback. ] This patch addresses a shortcoming in the current boot process on machines that support 5-level paging. If the bootloader enables 64-bit mode with 4-level paging, we need to switch over to 5-level paging. The switching requires disabling paging. It works fine if the kernel itself is loaded below 4G. If the bootloader puts the kernel above 4G (not sure if anybody does this), we would lose control as soon as paging is disabled, as the code becomes unreachable. This patch implements a trampoline in lower memory to handle this situation. Apart from the trampoline itself, we also need a place to store the top-level page table in lower memory, as we don't have a way to load a 64-bit value into CR3 from 32-bit mode. We only really need 8 bytes there, as we only use the very first entry of the page table, but we allocate a whole page anyway. We cannot have the code in the same page because there's a hazard that a CPU would read the page table speculatively and get confused seeing garbage. We only need the memory for a very short time, until the main kernel image sets up its own page tables. Signed-off-by: Kirill A. 
Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx> --- arch/x86/boot/compressed/head_64.S | 83 ++++++++++++++++++++++-------------- arch/x86/boot/compressed/pagetable.c | 45 +++++++++++++++++++ arch/x86/boot/compressed/pagetable.h | 16 +++++++ 3 files changed, 111 insertions(+), 33 deletions(-) create mode 100644 arch/x86/boot/compressed/pagetable.h diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index cefe4958fda9..7e806a55ea1c 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -32,6 +32,7 @@ #include <asm/processor-flags.h> #include <asm/asm-offsets.h> #include <asm/bootparam.h> +#include "pagetable.h" /* * Locally defined symbols should be marked hidden: @@ -288,8 +289,22 @@ ENTRY(startup_64) leaq boot_stack_end(%rbx), %rsp #ifdef CONFIG_X86_5LEVEL - /* Preserve RBX across CPUID */ - movq %rbx, %r8 +/* + * We need trampoline in lower memory switch from 4- to 5-level paging for + * cases when bootloader put kernel above 4G, but didn't enable 5-level paging + * for us. + * + * We also have to have top page table in lower memory as we don't have a way + * to load 64-bit value into CR3 from 32-bit mode. We only need 8-bytes there + * as we only use the very first entry of the page table, but we allocate whole + * page anyway. We cannot have the code in the same because, there's hazard + * that a CPU would read page table speculatively and get confused seeing + * garbage. + */ + + /* Preserve RBX and ESP */ + movq %rbx, %r15 + movq %rsp, %r14 /* Check if leaf 7 is supported */ xorl %eax, %eax @@ -307,9 +322,6 @@ ENTRY(startup_64) andl $(1 << 16), %ecx jz lvl5 - /* Restore RBX */ - movq %r8, %rbx - /* Check if 5-level paging has already been enabled */ movq %cr4, %rax testl $X86_CR4_LA57, %eax @@ -323,34 +335,34 @@ ENTRY(startup_64) * long mode would trigger #GP. So we need to switch off long mode * first. * - * NOTE: This is not going to work if bootloader put us above 4G - * limit. 
+ * We use trampoline in lower memory to handle situation when + * bootloader put the kernel image above 4G. * * The first step is go into compatibility mode. */ - /* Clear additional page table */ - leaq lvl5_pgtable(%rbx), %rdi - xorq %rax, %rax - movq $(PAGE_SIZE/8), %rcx - rep stosq + /* + * Find sitable place for trampoline. + * The address will be stored in RBX. + */ + call place_trampoline + movq %rax, %rbx /* - * Setup current CR3 as the first and only entry in a new top level - * page table. + * Load address of lvl5 into RDI. + * It will be used to return address from trampoline. */ - movq %cr3, %rdi - leaq 0x7 (%rdi), %rax - movq %rax, lvl5_pgtable(%rbx) + leaq lvl5(%rip), %rdi /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */ pushq $__KERNEL32_CS - leaq compatible_mode(%rip), %rax + leaq LVL5_TRAMPOLINE_CODE_OFF(%rbx), %rax pushq %rax lretq lvl5: - /* Restore RBX */ - movq %r8, %rbx + /* Restore RBX and ESP */ + movq %r15, %rbx + movq %r14, %rsp #endif /* Zero EFLAGS */ @@ -488,21 +500,24 @@ relocated: */ jmp *%rax - .code32 #ifdef CONFIG_X86_5LEVEL -compatible_mode: + .code32 +ENTRY(lvl5_trampoline_src) /* Setup data and stack segments */ movl $__KERNEL_DS, %eax movl %eax, %ds movl %eax, %ss + /* Setup new stack at the end of trampoline memory */ + leal LVL5_TRAMPOLINE_STACK_END (%ebx), %esp + /* Disable paging */ movl %cr0, %eax btrl $X86_CR0_PG_BIT, %eax movl %eax, %cr0 /* Point CR3 to 5-level paging */ - leal lvl5_pgtable(%ebx), %eax + leal (%ebx), %eax movl %eax, %cr3 /* Enable PAE and LA57 mode */ @@ -510,23 +525,29 @@ compatible_mode: orl $(X86_CR4_PAE | X86_CR4_LA57), %eax movl %eax, %cr4 - /* Calculate address we are running at */ - call 1f -1: popl %edi - subl $1b, %edi + /* Calculate address of lvl5_enabled once we are in trampoline */ + leal lvl5_enabled - lvl5_trampoline_src + LVL5_TRAMPOLINE_CODE_OFF (%ebx), %eax /* Prepare stack for far return to Long Mode */ pushl $__KERNEL_CS - leal lvl5(%edi), %eax - push %eax + 
pushl %eax /* Enable paging back */ movl $(X86_CR0_PG | X86_CR0_PE), %eax movl %eax, %cr0 lret + + .code64 +lvl5_enabled: + /* Return from trampoline */ + jmp *%rdi + + /* Bound size of trampoline code */ + .org lvl5_trampoline_src + LVL5_TRAMPOLINE_CODE_SIZE #endif + .code32 no_longmode: /* This isn't an x86-64 CPU so hang */ 1: @@ -584,7 +605,3 @@ boot_stack_end: .balign 4096 pgtable: .fill BOOT_PGT_SIZE, 1, 0 -#ifdef CONFIG_X86_5LEVEL -lvl5_pgtable: - .fill PAGE_SIZE, 1, 0 -#endif diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c index f1aa43854bed..4f9d7bfca94b 100644 --- a/arch/x86/boot/compressed/pagetable.c +++ b/arch/x86/boot/compressed/pagetable.c @@ -23,6 +23,8 @@ #undef CONFIG_AMD_MEM_ENCRYPT #include "misc.h" +#include "pagetable.h" +#include "../string.h" /* These actually do the work of building the kernel identity maps. */ #include <asm/init.h> @@ -149,3 +151,46 @@ void finalize_identity_maps(void) { write_cr3(top_level_pgt); } + +#ifdef CONFIG_X86_5LEVEL + +#define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */ +#define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */ + +asmlinkage __visible unsigned long *place_trampoline() +{ + unsigned long bios_start, ebda_start, trampoline_start, *trampoline; + + /* Based on reserve_bios_regions() */ + + ebda_start = *(unsigned short *)0x40e << 4; + bios_start = *(unsigned short *)0x413 << 10; + + if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX) + bios_start = BIOS_START_MAX; + + if (ebda_start > BIOS_START_MIN && ebda_start < bios_start) + bios_start = ebda_start; + + /* Place trampoline below end of low memory, aligned to 4k */ + trampoline_start = bios_start - LVL5_TRAMPOLINE_SIZE; + trampoline_start = round_down(trampoline_start, PAGE_SIZE); + + trampoline = (unsigned long *)trampoline_start; + + /* Clear trampoline memory first */ + memset(trampoline, 0, LVL5_TRAMPOLINE_SIZE); + + /* Copy trampoline code in place */ + memcpy(trampoline 
+ LVL5_TRAMPOLINE_CODE_OFF / sizeof(unsigned long), + &lvl5_trampoline_src, LVL5_TRAMPOLINE_CODE_SIZE); + + /* + * Setup current CR3 as the first and the only entry in a new top level + * page table. + */ + trampoline[0] = __read_cr3() + _PAGE_TABLE_NOENC; + + return trampoline; +} +#endif diff --git a/arch/x86/boot/compressed/pagetable.h b/arch/x86/boot/compressed/pagetable.h new file mode 100644 index 000000000000..22b05f1e22ea --- /dev/null +++ b/arch/x86/boot/compressed/pagetable.h @@ -0,0 +1,16 @@ +#ifndef BOOT_COMPRESSED_PAGETABLE_H +#define BOOT_COMPRESSED_PAGETABLE_H + +#define LVL5_TRAMPOLINE_SIZE (2 * PAGE_SIZE) + +#define LVL5_TRAMPOLINE_CODE_OFF PAGE_SIZE +#define LVL5_TRAMPOLINE_CODE_SIZE 0x40 + +#define LVL5_TRAMPOLINE_STACK_END LVL5_TRAMPOLINE_SIZE + +#ifndef __ASSEMBLER__ + +extern void (*lvl5_trampoline_src)(void *return_ptr); + +#endif /* __ASSEMBLER__ */ +#endif /* BOOT_COMPRESSED_PAGETABLE_H */ -- 2.14.2 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>