On 7 January 2015 at 12:06, Leif Lindholm <leif.lindholm@xxxxxxxxxx> wrote: > On Mon, Dec 22, 2014 at 10:59:02AM +0000, Ard Biesheuvel wrote: >> In order to support kexec, the kernel needs to be able to deal with the >> state of the UEFI firmware after SetVirtualAddressMap() has been called. >> To avoid having separate code paths for non-kexec and kexec, let's move >> the call to SetVirtualAddressMap() to the stub: this will guarantee us >> that it will only be called once (since the stub is not executed during >> kexec), and ensures that the UEFI state is identical between kexec and >> normal boot. >> >> This implies that the layout of the virtual mapping needs to be created >> by the stub as well. All regions are rounded up to a naturally aligned >> multiple of 64 KB (for compatibility with 64k pages kernels) and recorded >> in the UEFI memory map. The kernel proper reads those values and installs >> the mappings in a dedicated set of page tables that are swapped in during >> UEFI Runtime Services calls. >> >> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@xxxxxxxxxx> >> --- >> arch/arm64/include/asm/efi.h | 20 +++- >> arch/arm64/kernel/efi.c | 223 ++++++++++++++++++++----------------- >> arch/arm64/kernel/setup.c | 1 + >> drivers/firmware/efi/libstub/fdt.c | 137 ++++++++++++++++++++++- >> 4 files changed, 270 insertions(+), 111 deletions(-) >> >> diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h >> index 71291253114f..6cc668a378c5 100644 >> --- a/arch/arm64/include/asm/efi.h >> +++ b/arch/arm64/include/asm/efi.h >> @@ -7,28 +7,36 @@ >> #ifdef CONFIG_EFI >> extern void efi_init(void); >> extern void efi_idmap_init(void); >> +extern void efi_virtmap_init(void); >> #else >> #define efi_init() >> #define efi_idmap_init() >> +#define efi_virtmap_init >> #endif >> >> #define efi_call_virt(f, ...) 
\ >> ({ \ >> - efi_##f##_t *__f = efi.systab->runtime->f; \ >> + efi_##f##_t *__f; \ >> efi_status_t __s; \ >> \ >> - kernel_neon_begin(); \ >> + kernel_neon_begin(); /* disables preemption */ \ > > Nitpick: adding comment to otherwise untouched source line. > >> + efi_virtmap_load(); \ >> + __f = efi.systab->runtime->f; \ >> __s = __f(__VA_ARGS__); \ >> + efi_virtmap_unload(); \ >> kernel_neon_end(); \ >> __s; \ >> }) >> >> #define __efi_call_virt(f, ...) \ >> ({ \ >> - efi_##f##_t *__f = efi.systab->runtime->f; \ >> + efi_##f##_t *__f; \ >> \ >> - kernel_neon_begin(); \ >> + kernel_neon_begin(); /* disables preemption */ \ > > Same nitpick. > Is there anything wrong with that? Would you prefer the comment to be on a separate line? >> + efi_virtmap_load(); \ >> + __f = efi.systab->runtime->f; \ >> __f(__VA_ARGS__); \ >> + efi_virtmap_unload(); \ >> kernel_neon_end(); \ >> }) >> >> @@ -45,5 +53,9 @@ extern void efi_idmap_init(void); >> #define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__) >> >> #define EFI_ALLOC_ALIGN SZ_64K >> +#define EFI_VIRTMAP EFI_ARCH_1 >> + >> +void efi_virtmap_load(void); >> +void efi_virtmap_unload(void); >> >> #endif /* _ASM_EFI_H */ >> diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c >> index 6fac253bc783..2ebe67ffb629 100644 >> --- a/arch/arm64/kernel/efi.c >> +++ b/arch/arm64/kernel/efi.c >> @@ -11,25 +11,30 @@ >> * >> */ >> >> +#include <linux/atomic.h> >> #include <linux/dmi.h> >> #include <linux/efi.h> >> #include <linux/export.h> >> #include <linux/memblock.h> >> +#include <linux/mm_types.h> >> #include <linux/bootmem.h> >> #include <linux/of.h> >> #include <linux/of_fdt.h> >> +#include <linux/rbtree.h> >> +#include <linux/rwsem.h> >> #include <linux/sched.h> >> #include <linux/slab.h> >> +#include <linux/spinlock.h> >> >> #include <asm/cacheflush.h> >> #include <asm/efi.h> >> #include <asm/tlbflush.h> >> #include <asm/mmu_context.h> >> +#include <asm/mmu.h> >> +#include <asm/pgtable.h> >> >> struct 
efi_memory_map memmap; >> >> -static efi_runtime_services_t *runtime; >> - >> static u64 efi_system_table; >> >> static int uefi_debug __initdata; >> @@ -69,9 +74,33 @@ static void __init efi_setup_idmap(void) >> } >> } >> >> +/* >> + * Translate an EFI virtual address into a physical address: this is necessary, >> + * as some data members of the EFI system table are virtually remapped after >> + * SetVirtualAddressMap() has been called. >> + */ >> +static phys_addr_t efi_to_phys(unsigned long addr) >> +{ >> + efi_memory_desc_t *md; >> + >> + for_each_efi_memory_desc(&memmap, md) { >> + if (!(md->attribute & EFI_MEMORY_RUNTIME)) >> + continue; >> + if (md->virt_addr == 0) >> + /* no virtual mapping has been installed by the stub */ >> + break; >> + if (md->virt_addr <= addr && >> + (addr - md->virt_addr) < (md->num_pages << EFI_PAGE_SHIFT)) >> + return md->phys_addr + addr - md->virt_addr; >> + } >> + return addr; >> +} >> + >> static int __init uefi_init(void) >> { >> efi_char16_t *c16; >> + void *config_tables; >> + u64 table_size; >> char vendor[100] = "unknown"; >> int i, retval; >> >> @@ -99,7 +128,7 @@ static int __init uefi_init(void) >> efi.systab->hdr.revision & 0xffff); >> >> /* Show what we know for posterity */ >> - c16 = early_memremap(efi.systab->fw_vendor, >> + c16 = early_memremap(efi_to_phys(efi.systab->fw_vendor), >> sizeof(vendor)); >> if (c16) { >> for (i = 0; i < (int) sizeof(vendor) - 1 && *c16; ++i) >> @@ -112,8 +141,14 @@ static int __init uefi_init(void) >> efi.systab->hdr.revision >> 16, >> efi.systab->hdr.revision & 0xffff, vendor); >> >> - retval = efi_config_init(NULL); >> + table_size = sizeof(efi_config_table_64_t) * efi.systab->nr_tables; >> + config_tables = early_memremap(efi_to_phys(efi.systab->tables), >> + table_size); >> + >> + retval = efi_config_parse_tables(config_tables, >> + efi.systab->nr_tables, NULL); >> >> + early_memunmap(config_tables, table_size); >> out: >> early_memunmap(efi.systab, sizeof(efi_system_table_t)); >> 
return retval; >> @@ -328,51 +363,9 @@ void __init efi_idmap_init(void) >> efi_setup_idmap(); >> } >> >> -static int __init remap_region(efi_memory_desc_t *md, void **new) >> -{ >> - u64 paddr, vaddr, npages, size; >> - >> - paddr = md->phys_addr; >> - npages = md->num_pages; >> - memrange_efi_to_native(&paddr, &npages); >> - size = npages << PAGE_SHIFT; >> - >> - if (is_normal_ram(md)) >> - vaddr = (__force u64)ioremap_cache(paddr, size); >> - else >> - vaddr = (__force u64)ioremap(paddr, size); >> - >> - if (!vaddr) { >> - pr_err("Unable to remap 0x%llx pages @ %p\n", >> - npages, (void *)paddr); >> - return 0; >> - } >> - >> - /* adjust for any rounding when EFI and system pagesize differs */ >> - md->virt_addr = vaddr + (md->phys_addr - paddr); >> - >> - if (uefi_debug) >> - pr_info(" EFI remap 0x%012llx => %p\n", >> - md->phys_addr, (void *)md->virt_addr); >> - >> - memcpy(*new, md, memmap.desc_size); >> - *new += memmap.desc_size; >> - >> - return 1; >> -} >> - >> -/* >> - * Switch UEFI from an identity map to a kernel virtual map >> - */ > > No function description at all? Seems I was a bit lazy there. > Arguably this function could change name now as well, since UEFI will > already be in virtual mode. arm64_enable_runtime_map()? 
> OK >> static int __init arm64_enter_virtual_mode(void) >> { >> - efi_memory_desc_t *md; >> - phys_addr_t virtmap_phys; >> - void *virtmap, *virt_md; >> - efi_status_t status; >> u64 mapsize; >> - int count = 0; >> - unsigned long flags; >> >> if (!efi_enabled(EFI_BOOT)) { >> pr_info("EFI services will not be available.\n"); >> @@ -395,79 +388,28 @@ static int __init arm64_enter_virtual_mode(void) >> >> efi.memmap = &memmap; >> >> - /* Map the runtime regions */ >> - virtmap = kmalloc(mapsize, GFP_KERNEL); >> - if (!virtmap) { >> - pr_err("Failed to allocate EFI virtual memmap\n"); >> - return -1; >> - } >> - virtmap_phys = virt_to_phys(virtmap); >> - virt_md = virtmap; >> - >> - for_each_efi_memory_desc(&memmap, md) { >> - if (!(md->attribute & EFI_MEMORY_RUNTIME)) >> - continue; >> - if (!remap_region(md, &virt_md)) >> - goto err_unmap; >> - ++count; >> - } >> - >> - efi.systab = (__force void *)efi_lookup_mapped_addr(efi_system_table); >> + efi.systab = (__force void *)ioremap_cache(efi_system_table, >> + sizeof(efi_system_table_t)); >> if (!efi.systab) { >> - /* >> - * If we have no virtual mapping for the System Table at this >> - * point, the memory map doesn't cover the physical offset where >> - * it resides. This means the System Table will be inaccessible >> - * to Runtime Services themselves once the virtual mapping is >> - * installed. 
>> - */ >> - pr_err("Failed to remap EFI System Table -- buggy firmware?\n"); >> - goto err_unmap; >> + pr_err("Failed to remap EFI System Table\n"); >> + return -1; >> } >> set_bit(EFI_SYSTEM_TABLES, &efi.flags); >> >> - local_irq_save(flags); >> - cpu_switch_mm(idmap_pg_dir, &init_mm); >> - >> - /* Call SetVirtualAddressMap with the physical address of the map */ >> - runtime = efi.systab->runtime; >> - efi.set_virtual_address_map = runtime->set_virtual_address_map; >> - >> - status = efi.set_virtual_address_map(count * memmap.desc_size, >> - memmap.desc_size, >> - memmap.desc_version, >> - (efi_memory_desc_t *)virtmap_phys); >> - cpu_set_reserved_ttbr0(); >> - flush_tlb_all(); >> - local_irq_restore(flags); >> - >> - kfree(virtmap); >> - >> free_boot_services(); >> >> - if (status != EFI_SUCCESS) { >> - pr_err("Failed to set EFI virtual address map! [%lx]\n", >> - status); >> + if (!efi_enabled(EFI_VIRTMAP)) { >> + pr_err("No UEFI virtual mapping was installed -- runtime services will not be available\n"); >> return -1; >> } >> >> /* Set up runtime services function pointers */ >> - runtime = efi.systab->runtime; >> efi_native_runtime_setup(); >> set_bit(EFI_RUNTIME_SERVICES, &efi.flags); >> >> efi.runtime_version = efi.systab->hdr.revision; >> >> return 0; >> - >> -err_unmap: >> - /* unmap all mappings that succeeded: there are 'count' of those */ >> - for (virt_md = virtmap; count--; virt_md += memmap.desc_size) { >> - md = virt_md; >> - iounmap((__force void __iomem *)md->virt_addr); >> - } >> - kfree(virtmap); >> - return -1; >> } >> early_initcall(arm64_enter_virtual_mode); >> >> @@ -484,3 +426,78 @@ static int __init arm64_dmi_init(void) >> return 0; >> } >> core_initcall(arm64_dmi_init); >> + >> +static pgd_t efi_pgd[PTRS_PER_PGD] __page_aligned_bss; >> + >> +static struct mm_struct efi_mm = { >> + .mm_rb = RB_ROOT, >> + .pgd = efi_pgd, >> + .mm_users = ATOMIC_INIT(2), >> + .mm_count = ATOMIC_INIT(1), >> + .mmap_sem = __RWSEM_INITIALIZER(efi_mm.mmap_sem), 
>> + .page_table_lock = __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock), >> + .mmlist = LIST_HEAD_INIT(efi_mm.mmlist), >> + INIT_MM_CONTEXT(efi_mm) >> +}; >> + >> +static void efi_set_pgd(struct mm_struct *mm) >> +{ >> + cpu_switch_mm(mm->pgd, mm); >> + flush_tlb_all(); >> + if (icache_is_aivivt()) >> + __flush_icache_all(); >> +} >> + >> +void efi_virtmap_load(void) >> +{ >> + WARN_ON(preemptible()); >> + efi_set_pgd(&efi_mm); >> +} >> + >> +void efi_virtmap_unload(void) >> +{ >> + efi_set_pgd(current->active_mm); >> +} >> + >> +void __init efi_virtmap_init(void) >> +{ >> + efi_memory_desc_t *md; >> + >> + if (!efi_enabled(EFI_BOOT)) >> + return; >> + >> + for_each_efi_memory_desc(&memmap, md) { >> + u64 paddr, npages, size; >> + pgprot_t prot; >> + >> + if (!(md->attribute & EFI_MEMORY_RUNTIME)) >> + continue; >> + if (WARN(md->virt_addr == 0, >> + "UEFI virtual mapping incomplete or missing -- no entry found for 0x%llx\n", >> + md->phys_addr)) >> + return; >> + >> + paddr = md->phys_addr; >> + npages = md->num_pages; >> + memrange_efi_to_native(&paddr, &npages); >> + size = npages << PAGE_SHIFT; >> + >> + pr_info(" EFI remap 0x%012llx => %p\n", >> + md->phys_addr, (void *)md->virt_addr); >> + >> + /* >> + * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be >> + * executable, everything else can be mapped with the XN bits >> + * set. 
>> + */ >> + if (!is_normal_ram(md)) >> + prot = __pgprot(PROT_DEVICE_nGnRE); >> + else if (md->type == EFI_RUNTIME_SERVICES_CODE) >> + prot = PAGE_KERNEL_EXEC; >> + else >> + prot = PAGE_KERNEL; >> + >> + create_pgd_mapping(&efi_mm, paddr, md->virt_addr, size, prot); >> + } >> + set_bit(EFI_VIRTMAP, &efi.flags); >> +} >> diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c >> index b80991166754..d8390f507da0 100644 >> --- a/arch/arm64/kernel/setup.c >> +++ b/arch/arm64/kernel/setup.c >> @@ -402,6 +402,7 @@ void __init setup_arch(char **cmdline_p) >> request_standard_resources(); >> >> efi_idmap_init(); >> + efi_virtmap_init(); > > Could these two be merged together into one function? > Say efi_memmap_init()? > Well, I decided to do it like this because efi_idmap_init() gets removed in its entirety (including this invocation) in a subsequent patch. >> >> unflatten_device_tree(); >> >> diff --git a/drivers/firmware/efi/libstub/fdt.c b/drivers/firmware/efi/libstub/fdt.c >> index c846a9608cbd..76bc8abf41d1 100644 >> --- a/drivers/firmware/efi/libstub/fdt.c >> +++ b/drivers/firmware/efi/libstub/fdt.c >> @@ -167,6 +167,94 @@ fdt_set_fail: >> #define EFI_FDT_ALIGN EFI_PAGE_SIZE >> #endif >> >> +static efi_status_t get_memory_map(efi_system_table_t *sys_table_arg, >> + efi_memory_desc_t **map, >> + unsigned long *map_size, >> + unsigned long *desc_size, >> + u32 *desc_ver, unsigned long *key_ptr) >> +{ >> + efi_status_t status; >> + >> + /* >> + * Call get_memory_map() with 0 size to retrieve the size of the >> + * required allocation. >> + */ >> + *map_size = 0; >> + status = efi_call_early(get_memory_map, map_size, NULL, >> + key_ptr, desc_size, desc_ver); >> + if (status != EFI_BUFFER_TOO_SMALL) >> + return EFI_LOAD_ERROR; >> + >> + /* >> + * Add an additional efi_memory_desc_t to map_size because we're doing >> + * an allocation which may be in a new descriptor region. 
Then double it >> + * to give us some scratch space to prepare the input virtmap to give >> + * to SetVirtualAddressMap(). Note that this is EFI_LOADER_DATA memory, >> + * and the kernel memblock_reserve()'s only the size of the actual >> + * memory map, so the scratch space is freed again automatically. >> + */ >> + *map_size += *desc_size; >> + status = efi_call_early(allocate_pool, EFI_LOADER_DATA, >> + *map_size * 2, (void **)map); >> + if (status != EFI_SUCCESS) >> + return status; >> + >> + status = efi_call_early(get_memory_map, map_size, *map, >> + key_ptr, desc_size, desc_ver); >> + if (status != EFI_SUCCESS) >> + efi_call_early(free_pool, *map); >> + return status; >> +} >> + >> +/* >> + * This is the base address at which to start allocating virtual memory ranges >> + * for UEFI Runtime Services. This is a userland range so that we can use any >> + * allocation we choose, and eliminate the risk of a conflict after kexec. >> + */ >> +#define EFI_RT_VIRTUAL_BASE 0x40000000 >> + >> +static void update_memory_map(efi_memory_desc_t *memory_map, >> + unsigned long map_size, unsigned long desc_size, >> + int *count) >> +{ >> + u64 efi_virt_base = EFI_RT_VIRTUAL_BASE; >> + efi_memory_desc_t *out = (void *)memory_map + map_size; >> + int l; >> + >> + for (l = 0; l < map_size; l += desc_size) { >> + efi_memory_desc_t *in = (void *)memory_map + l; >> + u64 paddr, size; >> + >> + if (!(in->attribute & EFI_MEMORY_RUNTIME)) >> + continue; >> + >> + /* >> + * Make the mapping compatible with 64k pages: this allows >> + * a 4k page size kernel to kexec a 64k page size kernel and >> + * vice versa. >> + */ >> + paddr = round_down(in->phys_addr, SZ_64K); >> + size = round_up(in->num_pages * EFI_PAGE_SIZE + >> + in->phys_addr - paddr, SZ_64K); >> + >> + /* >> + * Avoid wasting memory on PTEs by choosing a virtual base that >> + * is compatible with section mappings if this region has the >> + * appropriate size and physical alignment. 
(Sections are 2 MB >> + * on 4k granule kernels) >> + */ >> + if (IS_ALIGNED(in->phys_addr, SZ_2M) && size >= SZ_2M) >> + efi_virt_base = round_up(efi_virt_base, SZ_2M); >> + >> + in->virt_addr = efi_virt_base + in->phys_addr - paddr; >> + efi_virt_base += size; >> + >> + memcpy(out, in, desc_size); >> + out = (void *)out + desc_size; >> + ++*count; >> + } >> +} >> + >> /* >> * Allocate memory for a new FDT, then add EFI, commandline, and >> * initrd related fields to the FDT. This routine increases the >> @@ -196,6 +284,7 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table, >> efi_memory_desc_t *memory_map; >> unsigned long new_fdt_size; >> efi_status_t status; >> + int runtime_entry_count = 0; >> >> /* >> * Estimate size of new FDT, and allocate memory for it. We >> @@ -216,8 +305,8 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table, >> * we can get the memory map key needed for >> * exit_boot_services(). >> */ >> - status = efi_get_memory_map(sys_table, &memory_map, &map_size, >> - &desc_size, &desc_ver, &mmap_key); >> + status = get_memory_map(sys_table, &memory_map, &map_size, >> + &desc_size, &desc_ver, &mmap_key); >> if (status != EFI_SUCCESS) >> goto fail_free_new_fdt; >> >> @@ -248,12 +337,52 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table, >> } >> } >> >> + /* >> + * Update the memory map with virtual addresses. 
The function will also >> + * populate the spare second half of the memory_map allocation with >> + * copies of just the EFI_MEMORY_RUNTIME entries so that we can pass it >> + * straight into SetVirtualAddressMap() >> + */ >> + update_memory_map(memory_map, map_size, desc_size, >> + &runtime_entry_count); >> + >> + pr_efi(sys_table, >> + "Exiting boot services and installing virtual address map...\n"); >> + >> /* Now we are ready to exit_boot_services.*/ >> status = sys_table->boottime->exit_boot_services(handle, mmap_key); >> >> + if (status == EFI_SUCCESS) { >> + efi_set_virtual_address_map_t *svam; >> >> - if (status == EFI_SUCCESS) >> - return status; >> + /* Install the new virtual address map */ >> + svam = sys_table->runtime->set_virtual_address_map; >> + status = svam(runtime_entry_count * desc_size, desc_size, >> + desc_ver, (void *)memory_map + map_size); >> + >> + /* >> + * We are beyond the point of no return here, so if the call to >> + * SetVirtualAddressMap() failed, we need to signal that to the >> + * incoming kernel but proceed normally otherwise. >> + */ >> + if (status != EFI_SUCCESS) { >> + int l; >> + >> + /* >> + * Set the virtual address field of all >> + * EFI_MEMORY_RUNTIME entries to 0. This will signal >> + * the incoming kernel that no virtual translation has >> + * been installed. >> + */ >> + for (l = 0; l < map_size; l += desc_size) { >> + efi_memory_desc_t *p = (void *)memory_map + l; >> + >> + if (p->attribute & EFI_MEMORY_RUNTIME) >> + p->virt_addr = 0; >> + } >> + } >> + return EFI_SUCCESS; >> + } >> >> pr_efi_err(sys_table, "Exit boot services failed.\n"); >> >> -- >> 1.8.3.2 > > Apart from this, and other comments in the thread, looks good. > > / > Leif > -- To unsubscribe from this list: send the line "unsubscribe linux-efi" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html