On 05/31/2017 at 01:46 AM, Tom Lendacky wrote:
> On 5/25/2017 11:17 PM, Xunlei Pang wrote:
>> On 04/19/2017 at 05:21 AM, Tom Lendacky wrote:
>>> Provide support so that kexec can be used to boot a kernel when SME is
>>> enabled.
>>>
>>> Support is needed to allocate pages for kexec without encryption. This
>>> is needed in order to be able to reboot in the kernel in the same manner
>>> as originally booted.
>>
>> Hi Tom,
>>
>> Looks like kdump will break; I didn't see similar handling for the kdump
>> case, see the kernel functions kimage_alloc_crash_control_pages(),
>> kimage_load_crash_segment(), etc.
>>
>> We need to support kdump with SME: the kdump kernel, initramfs, purgatory,
>> elfcorehdr, etc. are all loaded into the reserved memory (see crashkernel=X)
>> by userspace kexec-tools. I think a straightforward way would be to mark
>> the whole reserved memory range as unencrypted before loading all the
>> kexec segments for kdump; I guess we can handle this easily in
>> arch_kexec_unprotect_crashkres().
>
> Yes, that would work.
>
>>
>> Moreover, now that "elfcorehdr=X" is left decrypted, it needs to be
>> remapped to the encrypted data.
>
> This is an area that I'm not familiar with, so I don't completely
> understand the flow in regards to where/when/how the ELF headers are
> copied and what needs to be done.
>
> Can you elaborate a bit on this?

"elfcorehdr" is generated by userspace kexec-tools
(git://git.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git); it is
actually the ELF CORE header data (an ELF header plus PT_LOAD/PT_NOTE
program headers), see kexec/crashdump-elf.c::FUNC().

For the kdump case, it is placed in some reserved crash memory allocated
by kexec-tools, and the start address of that allocated reserved crash
memory is passed to the kdump kernel via "elfcorehdr=". Please see the
kernel functions setup_elfcorehdr() and vmcore_init() for how it is
parsed by the kdump kernel.
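For the reserved crash memory range, here is a rough sketch of what I had
in mind for arch_kexec_unprotect_crashkres() (untested, just to illustrate
the idea; it assumes the crashkernel range is covered by the kernel direct
mapping, and reuses sme_active()/set_memory_decrypted() from this series):

void arch_kexec_unprotect_crashkres(void)
{
	kexec_mark_crashkres(false);

	/*
	 * Illustrative only: with SME active, clear the encryption
	 * attribute of the whole crashkernel reserved range before
	 * kexec-tools loads the kdump segments, so that the segment
	 * data is written and later read through decrypted mappings.
	 */
	if (sme_active())
		set_memory_decrypted((unsigned long)__va(crashk_res.start),
				     resource_size(&crashk_res) >> PAGE_SHIFT);
}

This would only cover the loading side; the elfcorehdr access from the
kdump kernel would still need the separate handling described above.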
Regards,
Xunlei

>>
>>>
>>> Additionally, when shutting down all of the CPUs we need to be sure to
>>> flush the caches and then halt. This is needed when booting from a state
>>> where SME was not active into a state where SME is active (or vice-versa).
>>> Without these steps, it is possible for cache lines to exist for the same
>>> physical location but tagged both with and without the encryption bit. This
>>> can cause random memory corruption when caches are flushed depending on
>>> which cacheline is written last.
>>>
>>> Signed-off-by: Tom Lendacky <thomas.lendacky@xxxxxxx>
>>> ---
>>>  arch/x86/include/asm/init.h          |    1 +
>>>  arch/x86/include/asm/irqflags.h      |    5 +++++
>>>  arch/x86/include/asm/kexec.h         |    8 ++++++++
>>>  arch/x86/include/asm/pgtable_types.h |    1 +
>>>  arch/x86/kernel/machine_kexec_64.c   |   35 +++++++++++++++++++++++++++++++++-
>>>  arch/x86/kernel/process.c            |   26 +++++++++++++++++++++++--
>>>  arch/x86/mm/ident_map.c              |   11 +++++++----
>>>  include/linux/kexec.h                |   14 ++++++++++++++
>>>  kernel/kexec_core.c                  |    7 +++++++
>>>  9 files changed, 101 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
>>> index 737da62..b2ec511 100644
>>> --- a/arch/x86/include/asm/init.h
>>> +++ b/arch/x86/include/asm/init.h
>>> @@ -6,6 +6,7 @@ struct x86_mapping_info {
>>>  	void *context;			/* context for alloc_pgt_page */
>>>  	unsigned long pmd_flag;		/* page flag for PMD entry */
>>>  	unsigned long offset;		/* ident mapping offset */
>>> +	unsigned long kernpg_flag;	/* kernel pagetable flag override */
>>>  };
>>>
>>>  int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
>>> diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
>>> index ac7692d..38b5920 100644
>>> --- a/arch/x86/include/asm/irqflags.h
>>> +++ b/arch/x86/include/asm/irqflags.h
>>> @@ -58,6 +58,11 @@ static inline __cpuidle void native_halt(void)
>>>  	asm volatile("hlt": : :"memory");
>>>  }
>>>
>>> +static inline __cpuidle void native_wbinvd_halt(void)
>>> +{
>>> +	asm volatile("wbinvd; hlt" : : : "memory");
>>> +}
>>> +
>>>  #endif
>>>
>>>  #ifdef CONFIG_PARAVIRT
>>> diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
>>> index 70ef205..e8183ac 100644
>>> --- a/arch/x86/include/asm/kexec.h
>>> +++ b/arch/x86/include/asm/kexec.h
>>> @@ -207,6 +207,14 @@ struct kexec_entry64_regs {
>>>  	uint64_t r15;
>>>  	uint64_t rip;
>>>  };
>>> +
>>> +extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages,
>>> +				       gfp_t gfp);
>>> +#define arch_kexec_post_alloc_pages arch_kexec_post_alloc_pages
>>> +
>>> +extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages);
>>> +#define arch_kexec_pre_free_pages arch_kexec_pre_free_pages
>>> +
>>>  #endif
>>>
>>>  typedef void crash_vmclear_fn(void);
>>> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
>>> index ce8cb1c..0f326f4 100644
>>> --- a/arch/x86/include/asm/pgtable_types.h
>>> +++ b/arch/x86/include/asm/pgtable_types.h
>>> @@ -213,6 +213,7 @@ enum page_cache_mode {
>>>  #define PAGE_KERNEL		__pgprot(__PAGE_KERNEL | _PAGE_ENC)
>>>  #define PAGE_KERNEL_RO		__pgprot(__PAGE_KERNEL_RO | _PAGE_ENC)
>>>  #define PAGE_KERNEL_EXEC	__pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC)
>>> +#define PAGE_KERNEL_EXEC_NOENC	__pgprot(__PAGE_KERNEL_EXEC)
>>>  #define PAGE_KERNEL_RX		__pgprot(__PAGE_KERNEL_RX | _PAGE_ENC)
>>>  #define PAGE_KERNEL_NOCACHE	__pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC)
>>>  #define PAGE_KERNEL_LARGE	__pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC)
>>> diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
>>> index 085c3b3..11c0ca9 100644
>>> --- a/arch/x86/kernel/machine_kexec_64.c
>>> +++ b/arch/x86/kernel/machine_kexec_64.c
>>> @@ -86,7 +86,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
>>>  		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
>>>  	}
>>>  	pte = pte_offset_kernel(pmd, vaddr);
>>> -	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
>>> +	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC));
>>>  	return 0;
>>>  err:
>>>  	free_transition_pgtable(image);
>>> @@ -114,6 +114,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
>>>  		.alloc_pgt_page	= alloc_pgt_page,
>>>  		.context	= image,
>>>  		.pmd_flag	= __PAGE_KERNEL_LARGE_EXEC,
>>> +		.kernpg_flag	= _KERNPG_TABLE_NOENC,
>>>  	};
>>>  	unsigned long mstart, mend;
>>>  	pgd_t *level4p;
>>> @@ -597,3 +598,35 @@ void arch_kexec_unprotect_crashkres(void)
>>>  {
>>>  	kexec_mark_crashkres(false);
>>>  }
>>> +
>>> +int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
>>> +{
>>> +	int ret;
>>> +
>>> +	if (sme_active()) {
>>> +		/*
>>> +		 * If SME is active we need to be sure that kexec pages are
>>> +		 * not encrypted because when we boot to the new kernel the
>>> +		 * pages won't be accessed encrypted (initially).
>>> +		 */
>>> +		ret = set_memory_decrypted((unsigned long)vaddr, pages);
>>> +		if (ret)
>>> +			return ret;
>>> +
>>> +		if (gfp & __GFP_ZERO)
>>> +			memset(vaddr, 0, pages * PAGE_SIZE);
>>> +	}
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
>>> +{
>>> +	if (sme_active()) {
>>> +		/*
>>> +		 * If SME is active we need to reset the pages back to being
>>> +		 * an encrypted mapping before freeing them.
>>> +		 */
>>> +		set_memory_encrypted((unsigned long)vaddr, pages);
>>> +	}
>>> +}
>>> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
>>> index 0bb8842..f4e5de6 100644
>>> --- a/arch/x86/kernel/process.c
>>> +++ b/arch/x86/kernel/process.c
>>> @@ -24,6 +24,7 @@
>>>  #include <linux/cpuidle.h>
>>>  #include <trace/events/power.h>
>>>  #include <linux/hw_breakpoint.h>
>>> +#include <linux/kexec.h>
>>>  #include <asm/cpu.h>
>>>  #include <asm/apic.h>
>>>  #include <asm/syscalls.h>
>>> @@ -355,8 +356,25 @@ bool xen_set_default_idle(void)
>>>  	return ret;
>>>  }
>>>  #endif
>>> +
>>>  void stop_this_cpu(void *dummy)
>>>  {
>>> +	bool do_wbinvd_halt = false;
>>> +
>>> +	if (kexec_in_progress && boot_cpu_has(X86_FEATURE_SME)) {
>>> +		/*
>>> +		 * If we are performing a kexec and the processor supports
>>> +		 * SME then we need to clear out cache information before
>>> +		 * halting. With kexec, going from SME inactive to SME active
>>> +		 * requires clearing cache entries so that addresses without
>>> +		 * the encryption bit set don't corrupt the same physical
>>> +		 * address that has the encryption bit set when caches are
>>> +		 * flushed. Perform a wbinvd followed by a halt to achieve
>>> +		 * this.
>>> +		 */
>>> +		do_wbinvd_halt = true;
>>> +	}
>>> +
>>>  	local_irq_disable();
>>>  	/*
>>>  	 * Remove this CPU:
>>> @@ -365,8 +383,12 @@ void stop_this_cpu(void *dummy)
>>>  	disable_local_APIC();
>>>  	mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
>>>
>>> -	for (;;)
>>> -		halt();
>>> +	for (;;) {
>>> +		if (do_wbinvd_halt)
>>> +			native_wbinvd_halt();
>>> +		else
>>> +			halt();
>>> +	}
>>>  }
>>>
>>>  /*
>>> diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
>>> index 04210a2..2c9fd3e 100644
>>> --- a/arch/x86/mm/ident_map.c
>>> +++ b/arch/x86/mm/ident_map.c
>>> @@ -20,6 +20,7 @@ static void ident_pmd_init(struct x86_mapping_info *info, pmd_t *pmd_page,
>>>  static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
>>>  			  unsigned long addr, unsigned long end)
>>>  {
>>> +	unsigned long kernpg_flag = info->kernpg_flag ? : _KERNPG_TABLE;
>>>  	unsigned long next;
>>>
>>>  	for (; addr < end; addr = next) {
>>> @@ -39,7 +40,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
>>>  		if (!pmd)
>>>  			return -ENOMEM;
>>>  		ident_pmd_init(info, pmd, addr, next);
>>> -		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
>>> +		set_pud(pud, __pud(__pa(pmd) | kernpg_flag));
>>>  	}
>>>
>>>  	return 0;
>>> @@ -48,6 +49,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
>>>  static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
>>>  			  unsigned long addr, unsigned long end)
>>>  {
>>> +	unsigned long kernpg_flag = info->kernpg_flag ? : _KERNPG_TABLE;
>>>  	unsigned long next;
>>>
>>>  	for (; addr < end; addr = next) {
>>> @@ -67,7 +69,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
>>>  		if (!pud)
>>>  			return -ENOMEM;
>>>  		ident_pud_init(info, pud, addr, next);
>>> -		set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
>>> +		set_p4d(p4d, __p4d(__pa(pud) | kernpg_flag));
>>>  	}
>>>
>>>  	return 0;
>>> @@ -76,6 +78,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
>>>  int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
>>>  			      unsigned long pstart, unsigned long pend)
>>>  {
>>> +	unsigned long kernpg_flag = info->kernpg_flag ? : _KERNPG_TABLE;
>>>  	unsigned long addr = pstart + info->offset;
>>>  	unsigned long end = pend + info->offset;
>>>  	unsigned long next;
>>> @@ -104,14 +107,14 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
>>>  		if (result)
>>>  			return result;
>>>  		if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
>>> -			set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
>>> +			set_pgd(pgd, __pgd(__pa(p4d) | kernpg_flag));
>>>  		} else {
>>>  			/*
>>>  			 * With p4d folded, pgd is equal to p4d.
>>>  			 * The pgd entry has to point to the pud page table in this case.
>>>  			 */
>>>  			pud_t *pud = pud_offset(p4d, 0);
>>> -			set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
>>> +			set_pgd(pgd, __pgd(__pa(pud) | kernpg_flag));
>>>  		}
>>>  	}
>>> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
>>> index d419d0e..1c76e3b 100644
>>> --- a/include/linux/kexec.h
>>> +++ b/include/linux/kexec.h
>>> @@ -383,6 +383,20 @@ static inline void *boot_phys_to_virt(unsigned long entry)
>>>  	return phys_to_virt(boot_phys_to_phys(entry));
>>>  }
>>>
>>> +#ifndef arch_kexec_post_alloc_pages
>>> +static inline int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages,
>>> +					      gfp_t gfp)
>>> +{
>>> +	return 0;
>>> +}
>>> +#endif
>>> +
>>> +#ifndef arch_kexec_pre_free_pages
>>> +static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
>>> +{
>>> +}
>>> +#endif
>>> +
>>>  #else /* !CONFIG_KEXEC_CORE */
>>>  struct pt_regs;
>>>  struct task_struct;
>>> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
>>> index bfe62d5..bb5e7e3 100644
>>> --- a/kernel/kexec_core.c
>>> +++ b/kernel/kexec_core.c
>>> @@ -38,6 +38,7 @@
>>>  #include <linux/syscore_ops.h>
>>>  #include <linux/compiler.h>
>>>  #include <linux/hugetlb.h>
>>> +#include <linux/mem_encrypt.h>
>>>
>>>  #include <asm/page.h>
>>>  #include <asm/sections.h>
>>> @@ -315,6 +316,9 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
>>>  		count = 1 << order;
>>>  		for (i = 0; i < count; i++)
>>>  			SetPageReserved(pages + i);
>>> +
>>> +		arch_kexec_post_alloc_pages(page_address(pages), count,
>>> +					    gfp_mask);
>>>  	}
>>>
>>>  	return pages;
>>> @@ -326,6 +330,9 @@ static void kimage_free_pages(struct page *page)
>>>
>>>  	order = page_private(page);
>>>  	count = 1 << order;
>>> +
>>> +	arch_kexec_pre_free_pages(page_address(page), count);
>>> +
>>>  	for (i = 0; i < count; i++)
>>>  		ClearPageReserved(page + i);
>>>  	__free_pages(page, order);
>>>
>>>
>>> _______________________________________________
>>> kexec mailing list
>>> kexec@xxxxxxxxxxxxxxxxxxx
>>> http://lists.infradead.org/mailman/listinfo/kexec
>>