In preparation for updating the EFI stub boot flow to avoid the bare metal decompressor code altogether, implement the support code for switching between 4 and 5 levels of paging before jumping to the kernel proper. Signed-off-by: Ard Biesheuvel <ardb@xxxxxxxxxx> --- drivers/firmware/efi/libstub/efi-stub-helper.c | 4 + drivers/firmware/efi/libstub/x86-stub.c | 145 ++++++++++++++++++++ 2 files changed, 149 insertions(+) diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index 1e0203d74691ffcc..fc5f3b4c45e91401 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -16,6 +16,8 @@ #include "efistub.h" +extern bool efi_no5lvl; + bool efi_nochunk; bool efi_nokaslr = !IS_ENABLED(CONFIG_RANDOMIZE_BASE); bool efi_novamap; @@ -73,6 +75,8 @@ efi_status_t efi_parse_options(char const *cmdline) efi_loglevel = CONSOLE_LOGLEVEL_QUIET; } else if (!strcmp(param, "noinitrd")) { efi_noinitrd = true; + } else if (IS_ENABLED(CONFIG_X86_64) && !strcmp(param, "no5lvl")) { + efi_no5lvl = true; } else if (!strcmp(param, "efi") && val) { efi_nochunk = parse_option_str(val, "nochunk"); efi_novamap |= parse_option_str(val, "novamap"); diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index e136c94037dda8d3..7b8717cbb96a1246 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -760,6 +760,139 @@ static efi_status_t exit_boot(struct boot_params *boot_params, void *handle) return EFI_SUCCESS; } +#ifdef CONFIG_X86_64 +bool efi_no5lvl; + +static const struct desc_struct gdt[] = { + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), +}; + +static void (*la57_toggle)(void *cr3, void *gdt); + +static void __naked tmpl_toggle(void *cr3, void *gdt) +{ + /* + * This is template code that will be copied into a 32-bit addressable + * buffer, allowing us to drop to 32-bit mode with paging disabled, + * which is required to be able to toggle the CR4.LA57 bit. + * + * The first MOVB instruction is only there to capture the size of the + * sequence, and implicitly, the offset to the LJMP's immediate, which + * will be populated with the correct absolute address after copying. + */ + asm("0: movb $(4f - .), %%al \n\t" + " lgdt (%%rsi) \n\t" + " movw %[ds], %%ax \n\t" + " movw %%ax, %%ds \n\t" + " movw %%ax, %%ss \n\t" + " leaq 2f(%%rip), %%rax \n\t" + " pushq %[cs32] \n\t" + " pushq %%rax \n\t" + " lretq \n\t" + "1: retq \n\t" + " .code32 \n\t" + "2: movl %%cr0, %%eax \n\t" + " btrl %[pg], %%eax \n\t" + " movl %%eax, %%cr0 \n\t" + " jmp 3f \n\t" + "3: movl %%cr4, %%ecx \n\t" + " btcl %[la57], %%ecx \n\t" + " movl %%ecx, %%cr4 \n\t" + " movl %%edi, %%cr3 \n\t" + " btsl %[pg], %%eax \n\t" + " movl %%eax, %%cr0 \n\t" + " ljmpl %[cs], $(1b - 0b) \n\t" + "4: .code64" + : + : [cs32] "i"(__KERNEL32_CS), + [cs] "i"(__KERNEL_CS), + [ds] "i"(__KERNEL_DS), + [pg] "i"(X86_CR0_PG_BIT), + [la57] "i"(X86_CR4_LA57_BIT)); +} + +/* + * Enabling (or disabling) 5 level paging is tricky, because it can only be + * done from 32-bit mode with paging disabled. This means not only that the + * code itself must be running from 32-bit addressable physical memory, but + * also that the root page table must be 32-bit addressable, as we cannot + * program a 64-bit value into CR3 when running in 32-bit mode. + */ +static efi_status_t efi_setup_5level_paging(void) +{ + const u8 tmpl_size = ((u8 *)tmpl_toggle)[1]; + efi_status_t status; + u8 *la57_code; + + if (!efi_is_64bit()) + return EFI_SUCCESS; + + /* check for 5 level paging support */ + if (native_cpuid_eax(0) < 7 || + !(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) + return EFI_SUCCESS; + + /* allocate some 32-bit addressable memory for code and a page table */ + status = efi_allocate_pages(2 * PAGE_SIZE, (unsigned long *)&la57_code, + U32_MAX); + if (status != EFI_SUCCESS) + return status; + + la57_toggle = memcpy(la57_code, tmpl_toggle, tmpl_size); + memset(la57_code + tmpl_size, 0x90, PAGE_SIZE - tmpl_size); + + /* + * To avoid having to allocate a 32-bit addressable stack, we use a + * ljmp to switch back to long mode. However, this takes an absolute + * address, so we have to poke it in at runtime. The dummy MOVB + * instruction at the beginning can be used to locate the immediate. + */ + *(u32 *)&la57_code[tmpl_size - 6] += (unsigned long)la57_code; + + adjust_memory_range_protection((unsigned long)la57_code, PAGE_SIZE); + + return EFI_SUCCESS; +} + +static void efi_5level_switch(void) +{ + bool want_la57 = IS_ENABLED(CONFIG_X86_5LEVEL) && !efi_no5lvl; + bool have_la57 = native_read_cr4() & X86_CR4_LA57; + bool need_toggle = want_la57 ^ have_la57; + u64 *pgt = (void *)la57_toggle + PAGE_SIZE; + u64 *cr3 = (u64 *)__native_read_cr3(); + struct desc_ptr desc; + u64 *new_cr3; + + if (!la57_toggle || !need_toggle) + return; + + if (!have_la57) { + /* + * We are going to enable 5 level paging, so we need to + * allocate a root level page from the 32-bit addressable + * physical region, and plug the existing hierarchy into it. + */ + new_cr3 = memset(pgt, 0, PAGE_SIZE); + new_cr3[0] = (u64)cr3 | _PAGE_TABLE_NOENC; + } else { + // take the new root table pointer from the current entry #0 + new_cr3 = (u64 *)(cr3[0] & PAGE_MASK); + + // copy the new root level table if it is not 32-bit addressable + if ((u64)new_cr3 > U32_MAX) + new_cr3 = memcpy(pgt, new_cr3, PAGE_SIZE); + } + + desc.size = sizeof(gdt) - 1; + desc.address = (u64)gdt; + + la57_toggle(new_cr3, &desc); +} +#endif + /* * On success, we return the address of startup_32, which has potentially been * relocated by efi_relocate_kernel. @@ -792,6 +925,14 @@ asmlinkage unsigned long efi_main(efi_handle_t handle, (get_efi_config_table(ACPI_20_TABLE_GUID) ?: get_efi_config_table(ACPI_TABLE_GUID)); +#ifdef CONFIG_X86_64 + status = efi_setup_5level_paging(); + if (status != EFI_SUCCESS) { + efi_err("efi_setup_5level_paging() failed!\n"); + goto fail; + } +#endif + /* * If the kernel isn't already loaded at a suitable address, * relocate it. @@ -910,6 +1051,10 @@ asmlinkage unsigned long efi_main(efi_handle_t handle, goto fail; } +#ifdef CONFIG_X86_64 + efi_5level_switch(); +#endif + return bzimage_addr; fail: efi_err("efi_main() failed!\n"); -- 2.39.2