On 5/25/2017 11:17 PM, Xunlei Pang wrote:
> On 04/19/2017 at 05:21 AM, Tom Lendacky wrote:
>> Provide support so that kexec can be used to boot a kernel when SME is
>> enabled.
>>
>> Support is needed to allocate pages for kexec without encryption. This
>> is needed in order to be able to reboot in the kernel in the same manner
>> as originally booted.
>
> Hi Tom,
>
> Looks like kdump will break. I didn't see similar handling for the kdump
> cases in the kernel, see kimage_alloc_crash_control_pages(),
> kimage_load_crash_segment(), etc.
>
> We need to support kdump with SME. The kdump kernel/initramfs/purgatory/
> elfcorehdr/etc. are all loaded into the reserved memory (see crashkernel=X)
> by the userspace kexec-tools. I think a straightforward way would be to
> mark the whole reserved memory range as not encrypted before loading all
> the kexec segments for kdump; I guess we can handle this easily in
> arch_kexec_unprotect_crashkres().

Yes, that would work.
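Something along these lines is what I would expect that to look like. This
is only an untested sketch to confirm we mean the same thing; it assumes
the set_memory_decrypted()/sme_active() interfaces from this series and
the existing crashk_res resource, and the call may need to move so that
the range is decrypted before kexec-tools loads any of the segments:

	void arch_kexec_unprotect_crashkres(void)
	{
		unsigned long vaddr;
		unsigned int npages;

		kexec_mark_crashkres(false);

		if (!sme_active() || !crashk_res.end)
			return;

		/*
		 * Mark the whole crashkernel reserved range as decrypted
		 * before any of the kdump segments are loaded into it.
		 */
		vaddr = (unsigned long)__va(crashk_res.start);
		npages = DIV_ROUND_UP(resource_size(&crashk_res), PAGE_SIZE);
		set_memory_decrypted(vaddr, npages);
	}

Whether arch_kexec_protect_crashkres() then needs a matching
set_memory_encrypted() call, and at what point, is something I'll have
to look into.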
> 
> Moreover, now that "elfcorehdr=X" is left as decrypted, it needs to be
> remapped to the encrypted data.

This is an area that I'm not familiar with, so I don't completely
understand the flow with regard to where/when/how the ELF headers are
copied and what needs to be done. Can you elaborate a bit on this?

Thanks,
Tom

> 
> Regards,
> Xunlei
> 
>>
>> Additionally, when shutting down all of the CPUs we need to be sure to
>> flush the caches and then halt. This is needed when booting from a state
>> where SME was not active into a state where SME is active (or vice-versa).
>> Without these steps, it is possible for cache lines to exist for the same
>> physical location but tagged both with and without the encryption bit. This
>> can cause random memory corruption when caches are flushed depending on
>> which cacheline is written last.
>>
>> Signed-off-by: Tom Lendacky <thomas.lendacky at amd.com>
>> ---
>>  arch/x86/include/asm/init.h          |  1 +
>>  arch/x86/include/asm/irqflags.h      |  5 +++++
>>  arch/x86/include/asm/kexec.h         |  8 ++++++++
>>  arch/x86/include/asm/pgtable_types.h |  1 +
>>  arch/x86/kernel/machine_kexec_64.c   | 35 +++++++++++++++++++++++++++++++++-
>>  arch/x86/kernel/process.c            | 26 +++++++++++++++++++++++--
>>  arch/x86/mm/ident_map.c              | 11 +++++++----
>>  include/linux/kexec.h                | 14 ++++++++++++++
>>  kernel/kexec_core.c                  |  7 +++++++
>>  9 files changed, 101 insertions(+), 7 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
>> index 737da62..b2ec511 100644
>> --- a/arch/x86/include/asm/init.h
>> +++ b/arch/x86/include/asm/init.h
>> @@ -6,6 +6,7 @@ struct x86_mapping_info {
>>  	void *context;			/* context for alloc_pgt_page */
>>  	unsigned long pmd_flag;		/* page flag for PMD entry */
>>  	unsigned long offset;		/* ident mapping offset */
>> +	unsigned long kernpg_flag;	/* kernel pagetable flag override */
>>  };
>>
>>  int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
>> diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
>> index ac7692d..38b5920 100644
>> --- a/arch/x86/include/asm/irqflags.h
>> +++ b/arch/x86/include/asm/irqflags.h
>> @@ -58,6 +58,11 @@ static inline __cpuidle void native_halt(void)
>>  	asm volatile("hlt": : :"memory");
>>  }
>>
>> +static inline __cpuidle void native_wbinvd_halt(void)
>> +{
>> +	asm volatile("wbinvd; hlt" : : : "memory");
>> +}
>> +
>>  #endif
>>
>>  #ifdef CONFIG_PARAVIRT
>> diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
>> index 70ef205..e8183ac 100644
>> --- a/arch/x86/include/asm/kexec.h
>> +++ b/arch/x86/include/asm/kexec.h
>> @@ -207,6 +207,14 @@ struct kexec_entry64_regs {
>>  	uint64_t r15;
>>  	uint64_t rip;
>>  };
>> +
>> +extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages,
>> +				       gfp_t gfp);
>> +#define arch_kexec_post_alloc_pages arch_kexec_post_alloc_pages
>> +
>> +extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages);
>> +#define arch_kexec_pre_free_pages arch_kexec_pre_free_pages
>> +
>>  #endif
>>
>>  typedef void crash_vmclear_fn(void);
>> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
>> index ce8cb1c..0f326f4 100644
>> --- a/arch/x86/include/asm/pgtable_types.h
>> +++ b/arch/x86/include/asm/pgtable_types.h
>> @@ -213,6 +213,7 @@ enum page_cache_mode {
>>  #define PAGE_KERNEL		__pgprot(__PAGE_KERNEL | _PAGE_ENC)
>>  #define PAGE_KERNEL_RO		__pgprot(__PAGE_KERNEL_RO | _PAGE_ENC)
>>  #define PAGE_KERNEL_EXEC	__pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC)
>> +#define PAGE_KERNEL_EXEC_NOENC	__pgprot(__PAGE_KERNEL_EXEC)
>>  #define PAGE_KERNEL_RX		__pgprot(__PAGE_KERNEL_RX | _PAGE_ENC)
>>  #define PAGE_KERNEL_NOCACHE	__pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC)
>>  #define PAGE_KERNEL_LARGE	__pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC)
>> diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
>> index 085c3b3..11c0ca9 100644
>> --- a/arch/x86/kernel/machine_kexec_64.c
>> +++ b/arch/x86/kernel/machine_kexec_64.c
>> @@ -86,7 +86,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
>>  		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
>>  	}
>>  	pte = pte_offset_kernel(pmd, vaddr);
>> -	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
>> +	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC));
>>  	return 0;
>>  err:
>>  	free_transition_pgtable(image);
>> @@ -114,6 +114,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
>>  		.alloc_pgt_page	= alloc_pgt_page,
>>  		.context	= image,
>>  		.pmd_flag	= __PAGE_KERNEL_LARGE_EXEC,
>> +		.kernpg_flag	= _KERNPG_TABLE_NOENC,
>>  	};
>>  	unsigned long mstart, mend;
>>  	pgd_t *level4p;
>> @@ -597,3 +598,35 @@ void arch_kexec_unprotect_crashkres(void)
>>  {
>>  	kexec_mark_crashkres(false);
>>  }
>> +
>> +int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
>> +{
>> +	int ret;
>> +
>> +	if (sme_active()) {
>> +		/*
>> +		 * If SME is active we need to be sure that kexec pages are
>> +		 * not encrypted because when we boot to the new kernel the
>> +		 * pages won't be accessed encrypted (initially).
>> +		 */
>> +		ret = set_memory_decrypted((unsigned long)vaddr, pages);
>> +		if (ret)
>> +			return ret;
>> +
>> +		if (gfp & __GFP_ZERO)
>> +			memset(vaddr, 0, pages * PAGE_SIZE);
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
>> +{
>> +	if (sme_active()) {
>> +		/*
>> +		 * If SME is active we need to reset the pages back to being
>> +		 * an encrypted mapping before freeing them.
>> +		 */
>> +		set_memory_encrypted((unsigned long)vaddr, pages);
>> +	}
>> +}
>> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
>> index 0bb8842..f4e5de6 100644
>> --- a/arch/x86/kernel/process.c
>> +++ b/arch/x86/kernel/process.c
>> @@ -24,6 +24,7 @@
>>  #include <linux/cpuidle.h>
>>  #include <trace/events/power.h>
>>  #include <linux/hw_breakpoint.h>
>> +#include <linux/kexec.h>
>>  #include <asm/cpu.h>
>>  #include <asm/apic.h>
>>  #include <asm/syscalls.h>
>> @@ -355,8 +356,25 @@ bool xen_set_default_idle(void)
>>  	return ret;
>>  }
>>  #endif
>> +
>>  void stop_this_cpu(void *dummy)
>>  {
>> +	bool do_wbinvd_halt = false;
>> +
>> +	if (kexec_in_progress && boot_cpu_has(X86_FEATURE_SME)) {
>> +		/*
>> +		 * If we are performing a kexec and the processor supports
>> +		 * SME then we need to clear out cache information before
>> +		 * halting. With kexec, going from SME inactive to SME active
>> +		 * requires clearing cache entries so that addresses without
>> +		 * the encryption bit set don't corrupt the same physical
>> +		 * address that has the encryption bit set when caches are
>> +		 * flushed. Perform a wbinvd followed by a halt to achieve
>> +		 * this.
>> +		 */
>> +		do_wbinvd_halt = true;
>> +	}
>> +
>>  	local_irq_disable();
>>  	/*
>>  	 * Remove this CPU:
>> @@ -365,8 +383,12 @@ void stop_this_cpu(void *dummy)
>>  	disable_local_APIC();
>>  	mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
>>
>> -	for (;;)
>> -		halt();
>> +	for (;;) {
>> +		if (do_wbinvd_halt)
>> +			native_wbinvd_halt();
>> +		else
>> +			halt();
>> +	}
>>  }
>>
>>  /*
>> diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
>> index 04210a2..2c9fd3e 100644
>> --- a/arch/x86/mm/ident_map.c
>> +++ b/arch/x86/mm/ident_map.c
>> @@ -20,6 +20,7 @@ static void ident_pmd_init(struct x86_mapping_info *info, pmd_t *pmd_page,
>>  static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
>>  			  unsigned long addr, unsigned long end)
>>  {
>> +	unsigned long kernpg_flag = info->kernpg_flag ? : _KERNPG_TABLE;
>>  	unsigned long next;
>>
>>  	for (; addr < end; addr = next) {
>> @@ -39,7 +40,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
>>  		if (!pmd)
>>  			return -ENOMEM;
>>  		ident_pmd_init(info, pmd, addr, next);
>> -		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
>> +		set_pud(pud, __pud(__pa(pmd) | kernpg_flag));
>>  	}
>>
>>  	return 0;
>> @@ -48,6 +49,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
>>  static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
>>  			  unsigned long addr, unsigned long end)
>>  {
>> +	unsigned long kernpg_flag = info->kernpg_flag ? : _KERNPG_TABLE;
>>  	unsigned long next;
>>
>>  	for (; addr < end; addr = next) {
>> @@ -67,7 +69,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
>>  		if (!pud)
>>  			return -ENOMEM;
>>  		ident_pud_init(info, pud, addr, next);
>> -		set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
>> +		set_p4d(p4d, __p4d(__pa(pud) | kernpg_flag));
>>  	}
>>
>>  	return 0;
>> @@ -76,6 +78,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
>>  int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
>>  			      unsigned long pstart, unsigned long pend)
>>  {
>> +	unsigned long kernpg_flag = info->kernpg_flag ? : _KERNPG_TABLE;
>>  	unsigned long addr = pstart + info->offset;
>>  	unsigned long end = pend + info->offset;
>>  	unsigned long next;
>> @@ -104,14 +107,14 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
>>  		if (result)
>>  			return result;
>>  		if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
>> -			set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
>> +			set_pgd(pgd, __pgd(__pa(p4d) | kernpg_flag));
>>  		} else {
>>  			/*
>>  			 * With p4d folded, pgd is equal to p4d.
>>  			 * The pgd entry has to point to the pud page table in this case.
>>  			 */
>>  			pud_t *pud = pud_offset(p4d, 0);
>> -			set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
>> +			set_pgd(pgd, __pgd(__pa(pud) | kernpg_flag));
>>  		}
>>  	}
>>
>> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
>> index d419d0e..1c76e3b 100644
>> --- a/include/linux/kexec.h
>> +++ b/include/linux/kexec.h
>> @@ -383,6 +383,20 @@ static inline void *boot_phys_to_virt(unsigned long entry)
>>  	return phys_to_virt(boot_phys_to_phys(entry));
>>  }
>>
>> +#ifndef arch_kexec_post_alloc_pages
>> +static inline int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages,
>> +					      gfp_t gfp)
>> +{
>> +	return 0;
>> +}
>> +#endif
>> +
>> +#ifndef arch_kexec_pre_free_pages
>> +static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
>> +{
>> +}
>> +#endif
>> +
>>  #else /* !CONFIG_KEXEC_CORE */
>>  struct pt_regs;
>>  struct task_struct;
>> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
>> index bfe62d5..bb5e7e3 100644
>> --- a/kernel/kexec_core.c
>> +++ b/kernel/kexec_core.c
>> @@ -38,6 +38,7 @@
>>  #include <linux/syscore_ops.h>
>>  #include <linux/compiler.h>
>>  #include <linux/hugetlb.h>
>> +#include <linux/mem_encrypt.h>
>>
>>  #include <asm/page.h>
>>  #include <asm/sections.h>
>> @@ -315,6 +316,9 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
>>  		count = 1 << order;
>>  		for (i = 0; i < count; i++)
>>  			SetPageReserved(pages + i);
>> +
>> +		arch_kexec_post_alloc_pages(page_address(pages), count,
>> +					    gfp_mask);
>>  	}
>>
>>  	return pages;
>> @@ -326,6 +330,9 @@ static void kimage_free_pages(struct page *page)
>>
>>  	order = page_private(page);
>>  	count = 1 << order;
>> +
>> +	arch_kexec_pre_free_pages(page_address(page), count);
>> +
>>  	for (i = 0; i < count; i++)
>>  		ClearPageReserved(page + i);
>>  	__free_pages(page, order);
>>
>>
>> _______________________________________________
>> kexec mailing list
>> kexec at lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/kexec
>