From: Ashish Kalra <ashish.kalra@xxxxxxx>

SNP guests allocate shared buffers to perform I/O. This is done by
allocating pages normally from the buddy allocator and converting them
to shared with set_memory_decrypted().

The second, kexec-ed, kernel has no idea what memory was converted this
way. It only sees E820_TYPE_RAM. Accessing shared memory via a private
mapping causes unrecoverable RMP page faults.

On kexec, walk the direct mapping and convert all shared memory back to
private. This makes all RAM private again, so the second kernel can use
it normally. Additionally, for SNP guests, convert all pages of the bss
decrypted section back to private.

The conversion occurs in two steps: stopping new conversions and
unsharing all memory. For a normal kexec, conversions are stopped while
scheduling is still functional, which allows waiting for any ongoing
conversions to finish. The second step is carried out when all CPUs
except one are inactive and interrupts are disabled, which prevents
conflicts with any code that may still access shared memory.

Reviewed-by: Tom Lendacky <thomas.lendacky@xxxxxxx>
Signed-off-by: Ashish Kalra <ashish.kalra@xxxxxxx>
---
 arch/x86/coco/sev/core.c      | 148 ++++++++++++++++++++++++++++++++++
 arch/x86/include/asm/sev.h    |   4 +
 arch/x86/mm/mem_encrypt_amd.c |   2 +
 3 files changed, 154 insertions(+)

diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index 082d61d85dfc..0c90a8a74a88 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -1010,6 +1010,154 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t end)
 	set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
 }
 
+static void set_pte_enc(pte_t *kpte, int level, void *va)
+{
+	unsigned long pfn;
+	pgprot_t new_prot;
+
+	prep_set_clr_pte_enc(kpte, level, 1, va, &pfn, NULL, NULL, &new_prot);
+	set_pte_enc_mask(kpte, pfn, new_prot);
+}
+
+static bool make_pte_private(pte_t *pte, unsigned long addr, int pages, int level)
+{
+	struct sev_es_runtime_data *data;
+	struct ghcb *ghcb;
+	int cpu;
+
+	/*
+	 * Ensure that all the per-CPU GHCBs are made private only at the
+	 * end of the unshare loop, so that the optimized GHCB protocol
+	 * keeps being used instead of forcing the switch to the MSR
+	 * protocol until the very end.
+	 */
+	for_each_possible_cpu(cpu) {
+		data = per_cpu(runtime_data, cpu);
+		ghcb = &data->ghcb_page;
+		/* Check whether the GHCB is part of this PMD range */
+		if ((unsigned long)ghcb >= addr &&
+		    (unsigned long)ghcb <= (addr + (pages * PAGE_SIZE)))
+			return true;
+	}
+
+	set_pte_enc(pte, level, (void *)addr);
+	snp_set_memory_private(addr, pages);
+
+	return true;
+}
+
+static void unshare_all_bss_decrypted_memory(void)
+{
+	unsigned long vaddr, vaddr_end;
+	unsigned int level;
+	unsigned int npages;
+	pte_t *pte;
+
+	vaddr = (unsigned long)__start_bss_decrypted;
+	vaddr_end = (unsigned long)__start_bss_decrypted_unused;
+	npages = (vaddr_end - vaddr) >> PAGE_SHIFT;
+	for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) {
+		pte = lookup_address(vaddr, &level);
+		if (!pte || !pte_decrypted(*pte) || pte_none(*pte))
+			continue;
+
+		set_pte_enc(pte, level, (void *)vaddr);
+	}
+	vaddr = (unsigned long)__start_bss_decrypted;
+	snp_set_memory_private(vaddr, npages);
+}
+
+static void unshare_all_memory(void)
+{
+	unsigned long addr, end;
+
+	/*
+	 * Walk the direct mapping and convert all shared memory back to private.
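+	 * This runs from snp_kexec_finish(), after snp_kexec_begin() has
+	 * stopped new conversions and all CPUs but one have been quiesced
+	 * with interrupts disabled, so the walk cannot race with concurrent
+	 * private<->shared conversions.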
+	 */
+
+	addr = PAGE_OFFSET;
+	end = PAGE_OFFSET + get_max_mapped();
+
+	while (addr < end) {
+		unsigned long size;
+		unsigned int level;
+		pte_t *pte;
+
+		pte = lookup_address(addr, &level);
+		size = page_level_size(level);
+
+		if (pte && pte_decrypted(*pte) && !pte_none(*pte)) {
+			int pages = size / PAGE_SIZE;
+
+			if (!make_pte_private(pte, addr, pages, level)) {
+				pr_err("Failed to unshare range %#lx-%#lx\n",
+				       addr, addr + size);
+			}
+		}
+		addr += size;
+	}
+
+	unshare_all_bss_decrypted_memory();
+
+	__flush_tlb_all();
+}
+
+/* Stop new private<->shared conversions */
+void snp_kexec_begin(void)
+{
+	if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+		return;
+
+	if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+		return;
+	/*
+	 * The crash kernel reaches here with interrupts disabled: it can't
+	 * wait for conversions to finish.
+	 *
+	 * If a race happened, just report and proceed.
+	 */
+	if (!set_memory_enc_stop_conversion())
+		pr_warn("Failed to stop shared<->private conversions\n");
+}
+
+/* Walk the direct mapping and convert all shared memory back to private */
+void snp_kexec_finish(void)
+{
+	struct sev_es_runtime_data *data;
+	unsigned int level, cpu;
+	unsigned long size;
+	struct ghcb *ghcb;
+	pte_t *pte;
+
+	if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+		return;
+
+	if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+		return;
+
+	unshare_all_memory();
+
+	/*
+	 * Switch to using the MSR protocol to change the per-CPU GHCBs to
+	 * private. Once all the per-CPU GHCBs have been switched back to
+	 * private, no further GHCB calls to the hypervisor can be made
+	 * beyond this point until the kexec-ed kernel starts running.
+	 */
+	boot_ghcb = NULL;
+	sev_cfg.ghcbs_initialized = false;
+
+	for_each_possible_cpu(cpu) {
+		data = per_cpu(runtime_data, cpu);
+		ghcb = &data->ghcb_page;
+		pte = lookup_address((unsigned long)ghcb, &level);
+		size = page_level_size(level);
+		set_pte_enc(pte, level, (void *)ghcb);
+		snp_set_memory_private((unsigned long)ghcb, (size / PAGE_SIZE));
+	}
+}
+
 static int snp_set_vmsa(void *va, void *caa, int apic_id, bool make_vmsa)
 {
 	int ret;
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index 4f3fd913aadb..4f1a6d1e3f4c 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -352,6 +352,8 @@ int prep_set_clr_pte_enc(pte_t *kpte, int level, int enc, void *va,
 			 unsigned long *ret_pfn, unsigned long *ret_pa,
 			 unsigned long *ret_size, pgprot_t *ret_new_prot);
 void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot);
+void snp_kexec_finish(void);
+void snp_kexec_begin(void);
 
 #else	/* !CONFIG_AMD_MEM_ENCRYPT */
 
@@ -393,6 +395,8 @@ prep_set_clr_pte_enc(pte_t *kpte, int level, int enc, void *va,
 		     unsigned long *ret_pfn, unsigned long *ret_pa,
 		     unsigned long *ret_size, pgprot_t *ret_new_prot) { }
 static inline void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot) { }
+static inline void snp_kexec_finish(void) { }
+static inline void snp_kexec_begin(void) { }
 
 #endif	/* CONFIG_AMD_MEM_ENCRYPT */
 
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
index 42a35040aaf9..dec24bb08b09 100644
--- a/arch/x86/mm/mem_encrypt_amd.c
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -498,6 +498,8 @@ void __init sme_early_init(void)
 	x86_platform.guest.enc_status_change_finish = amd_enc_status_change_finish;
 	x86_platform.guest.enc_tlb_flush_required = amd_enc_tlb_flush_required;
 	x86_platform.guest.enc_cache_flush_required = amd_enc_cache_flush_required;
+	x86_platform.guest.enc_kexec_begin = snp_kexec_begin;
+	x86_platform.guest.enc_kexec_finish = snp_kexec_finish;
 
 	/*
 	 * AMD-SEV-ES intercepts the RDMSR to read the X2APIC ID in the
-- 
2.34.1
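
The two-step protocol described in the commit message can be pictured
with a short sketch of a caller. This is illustrative only and not part
of the patch: example_kexec_shutdown() and stop_other_cpus_example() are
hypothetical placeholders for the real kexec shutdown path, while
snp_kexec_begin() and snp_kexec_finish() are the hooks added above.

/*
 * Hypothetical caller, for illustration only: it shows the required
 * ordering of the two steps, not actual kernel code.
 */
static void example_kexec_shutdown(void)
{
	/*
	 * Step 1: scheduling still works here, so any in-flight
	 * private<->shared conversions can finish; new conversions are
	 * refused from this point on.
	 */
	snp_kexec_begin();

	/* Hypothetical stand-in for quiescing all CPUs but this one. */
	stop_other_cpus_example();
	local_irq_disable();

	/*
	 * Step 2: no other context can touch shared memory anymore, so
	 * the direct-map walk can convert it all back to private.
	 */
	snp_kexec_finish();
}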