On Tue, Aug 28, 2018 at 05:12:56PM -0500, Brijesh Singh wrote: > kvmclock defines few static variables which are shared with hypervisor ... with the hypervisor ... > during the kvmclock initialization. > > When SEV is active, memory is encrypted with a guest-specific key, and > if guest OS wants to share the memory region with hypervisor then it must > clear the C-bit before sharing it. Currently, we use > kernel_physical_mapping_init() to split large pages before clearing the > C-bit on shared pages. But the kernel_physical_mapping_init fails when "But it fails when... " > called from the kvmclock initialization (mainly because memblock allocator > was not ready). "... is not ready that early during boot)." > The '__decrypted' can be used to define a shared variable; the variables "Add a __decrypted section attribute which can be used when defining such shared variable. The so-defined variables will be placed..." > will be put in the .data.decryption section. This section is mapped with " ... in the .data..decrypted section." > C=0 early in the boot, we also ensure that the initialized values are "... early during boot, " > updated to match with C=0 (i.e perform an in-place decryption). The > .data..decrypted section is PMD aligned and sized so that we avoid the "... PMD-aligned ... " > need to split the large pages when mapping this section. > > The sme_encrypt_kernel() was used to perform the in-place encryption > of the Linux kernel and initrd when SME is active. The routine has been > enhanced to decrypt the .data..decryption section for both SME and SEV > cases. ".data..decrypted" > > While reusing the sme_populate_pgd() we found that the function does not > update the flags if the pte/pmd entry already exists. The patch updates > the function to take care of it. Change the tone to impartial: "While at it, fix sme_populate_pgd() to update page flags if the PMD/PTE entry already exists." 
And avoid using "This patch" - what this patch does should be visible to the enlightened onlooker. > Signed-off-by: Brijesh Singh <brijesh.singh@xxxxxxx> > Cc: stable@xxxxxxxxxxxxxxx > Cc: Tom Lendacky <thomas.lendacky@xxxxxxx> > Cc: kvm@xxxxxxxxxxxxxxx > Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx> > Cc: Borislav Petkov <bp@xxxxxxx> > Cc: "H. Peter Anvin" <hpa@xxxxxxxxx> > Cc: linux-kernel@xxxxxxxxxxxxxxx > Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx> > Cc: Sean Christopherson <sean.j.christopherson@xxxxxxxxx> > Cc: kvm@xxxxxxxxxxxxxxx > Cc: "Radim Krčmář" <rkrcmar@xxxxxxxxxx> > --- > arch/x86/include/asm/mem_encrypt.h | 6 +++ > arch/x86/kernel/head64.c | 9 ++++ > arch/x86/kernel/vmlinux.lds.S | 17 +++++++ > arch/x86/mm/mem_encrypt_identity.c | 100 +++++++++++++++++++++++++++++-------- > 4 files changed, 112 insertions(+), 20 deletions(-) > > diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h > index c064383..802b2eb 100644 > --- a/arch/x86/include/asm/mem_encrypt.h > +++ b/arch/x86/include/asm/mem_encrypt.h > @@ -52,6 +52,8 @@ void __init mem_encrypt_init(void); > bool sme_active(void); > bool sev_active(void); > > +#define __decrypted __attribute__((__section__(".data..decrypted"))) > + > #else /* !CONFIG_AMD_MEM_ENCRYPT */ > > #define sme_me_mask 0ULL > @@ -77,6 +79,8 @@ early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; > static inline int __init > early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; } > > +#define __decrypted > + > #endif /* CONFIG_AMD_MEM_ENCRYPT */ > > /* > @@ -88,6 +92,8 @@ early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; > #define __sme_pa(x) (__pa(x) | sme_me_mask) > #define __sme_pa_nodebug(x) (__pa_nodebug(x) | sme_me_mask) > > +extern char __start_data_decrypted[], __end_data_decrypted[]; > + > #endif /* __ASSEMBLY__ */ > > #endif /* __X86_MEM_ENCRYPT_H__ */ > diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c 
> index 8047379..3e03129 100644 > --- a/arch/x86/kernel/head64.c > +++ b/arch/x86/kernel/head64.c > @@ -112,6 +112,7 @@ static bool __head check_la57_support(unsigned long physaddr) > unsigned long __head __startup_64(unsigned long physaddr, > struct boot_params *bp) > { > + unsigned long vaddr, vaddr_end; > unsigned long load_delta, *p; > unsigned long pgtable_flags; > pgdval_t *pgd; > @@ -234,6 +235,14 @@ unsigned long __head __startup_64(unsigned long physaddr, > /* Encrypt the kernel and related (if SME is active) */ > sme_encrypt_kernel(bp); > > + /* Clear the memory encryption mask from the decrypted section */ End sentences with a full stop. > + vaddr = (unsigned long)__start_data_decrypted; > + vaddr_end = (unsigned long)__end_data_decrypted; > + for (; vaddr < vaddr_end; vaddr += PMD_SIZE) { > + i = pmd_index(vaddr); > + pmd[i] -= sme_get_me_mask(); > + } This needs to be a no-op on !SME machines. Hint: if (sme_active())... > /* > * Return the SME encryption mask (if SME is active) to be used as a > * modifier for the initial pgdir entry programmed into CR3. > diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S > index 8bde0a4..0ef9320 100644 > --- a/arch/x86/kernel/vmlinux.lds.S > +++ b/arch/x86/kernel/vmlinux.lds.S > @@ -89,6 +89,21 @@ PHDRS { > note PT_NOTE FLAGS(0); /* ___ */ > } > > +/* > + * This section contains data which will be mapped as decrypted. Memory > + * encryption operates on a page basis. But we make this section a pmd "... make this section PMD-aligned ..." Also, avoid the "we" and formulate in passive voice. > + * aligned to avoid spliting the pages while mapping the section early. > + * > + * Note: We use a separate section so that only this section gets > + * decrypted to avoid exposing more than we wish. > + */ > +#define DATA_DECRYPTED_SECTION \ DATA_DECRYPTED is perfectly fine. > + . = ALIGN(PMD_SIZE); \ > + __start_data_decrypted = .; \ > + *(.data..decrypted); \ > + . 
= ALIGN(PMD_SIZE); \ > + __end_data_decrypted = .; \ > + > SECTIONS > { > #ifdef CONFIG_X86_32 > @@ -171,6 +186,8 @@ SECTIONS > /* rarely changed data like cpu maps */ > READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES) > > + DATA_DECRYPTED_SECTION > + > /* End of data section */ > _edata = .; > } :data > diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c > index bf6097e..88c1cce 100644 > --- a/arch/x86/mm/mem_encrypt_identity.c > +++ b/arch/x86/mm/mem_encrypt_identity.c > @@ -51,6 +51,8 @@ > (_PAGE_PAT | _PAGE_PWT)) > > #define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC) > +#define PMD_FLAGS_ENC_WP ((PMD_FLAGS_ENC & ~_PAGE_CACHE_MASK) | \ > + (_PAGE_PAT | _PAGE_PWT)) > > #define PTE_FLAGS (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL) > > @@ -59,6 +61,8 @@ > (_PAGE_PAT | _PAGE_PWT)) > > #define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC) > +#define PTE_FLAGS_ENC_WP ((PTE_FLAGS_ENC & ~_PAGE_CACHE_MASK) | \ > + (_PAGE_PAT | _PAGE_PWT)) > > struct sme_populate_pgd_data { > void *pgtable_area; > @@ -154,9 +158,6 @@ static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) > return; > > pmd = pmd_offset(pud, ppd->vaddr); > - if (pmd_large(*pmd)) > - return; > - > set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags)); > } > > @@ -182,8 +183,7 @@ static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd) > return; > > pte = pte_offset_map(pmd, ppd->vaddr); > - if (pte_none(*pte)) > - set_pte(pte, __pte(ppd->paddr | ppd->pte_flags)); > + set_pte(pte, __pte(ppd->paddr | ppd->pte_flags)); > } > This looks like it belongs in a prepatch fix. 
> static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) > @@ -235,6 +235,11 @@ static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd) > __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC); > } > > +static void __init sme_map_range_encrypted_wp(struct sme_populate_pgd_data *ppd) > +{ > + __sme_map_range(ppd, PMD_FLAGS_ENC_WP, PTE_FLAGS_ENC_WP); > +} > + > static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) > { > __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC); These changes with the _WP flags and helper addition belong in a pre-patch. > @@ -382,7 +387,10 @@ static void __init build_workarea_map(struct boot_params *bp, > ppd->paddr = workarea_start; > ppd->vaddr = workarea_start; > ppd->vaddr_end = workarea_end; > - sme_map_range_decrypted(ppd); > + if (sev_active()) > + sme_map_range_encrypted(ppd); > + else > + sme_map_range_decrypted(ppd); > > /* Flush the TLB - no globals so cr3 is enough */ > native_write_cr3(__native_read_cr3()); > @@ -439,16 +447,27 @@ static void __init build_workarea_map(struct boot_params *bp, > sme_map_range_decrypted_wp(ppd); > } > > - /* Add decrypted workarea mappings to both kernel mappings */ > + /* > + * When SEV is active, kernel is already encrypted hence mapping > + * the initial workarea_start as encrypted. When SME is active, > + * the kernel is not encrypted hence add a decrypted workarea s/ a// > + * mappings to both kernel mappings. 
> + */ > ppd->paddr = workarea_start; > ppd->vaddr = workarea_start; > ppd->vaddr_end = workarea_end; > - sme_map_range_decrypted(ppd); > + if (sev_active()) > + sme_map_range_encrypted(ppd); > + else > + sme_map_range_decrypted(ppd); > > ppd->paddr = workarea_start; > ppd->vaddr = workarea_start + decrypted_base; > ppd->vaddr_end = workarea_end + decrypted_base; > - sme_map_range_decrypted(ppd); > + if (sev_active()) > + sme_map_range_encrypted(ppd); > + else > + sme_map_range_decrypted(ppd); > > wa->kernel_start = kernel_start; > wa->kernel_end = kernel_end; > @@ -491,28 +510,69 @@ static void __init remove_workarea_map(struct sme_workarea_data *wa, > native_write_cr3(__native_read_cr3()); > } > > +static void __init decrypt_data_decrypted_section(struct sme_workarea_data *wa, That function name could use some clarifying change... > + struct sme_populate_pgd_data *ppd) > +{ > + unsigned long decrypted_start, decrypted_end, decrypted_len; > + > + /* Physical addresses of decrypted data section */ > + decrypted_start = __pa_symbol(__start_data_decrypted); > + decrypted_end = ALIGN(__pa_symbol(__end_data_decrypted), PMD_PAGE_SIZE); > + decrypted_len = decrypted_end - decrypted_start; > + > + if (!decrypted_len) > + return; > + > + /* Add decrypted mapping for the section (identity) */ > + ppd->paddr = decrypted_start; > + ppd->vaddr = decrypted_start; > + ppd->vaddr_end = decrypted_end; > + sme_map_range_decrypted(ppd); > + > + /* Add encrypted-wp mapping for the section (non-identity) */ > + ppd->paddr = decrypted_start; > + ppd->vaddr = decrypted_start + wa->decrypted_base; > + ppd->vaddr_end = decrypted_end + wa->decrypted_base; > + sme_map_range_encrypted_wp(ppd); > + > + /* Perform in-place decryption */ > + sme_encrypt_execute(decrypted_start, > + decrypted_start + wa->decrypted_base, > + decrypted_len, wa->workarea_start, > + (unsigned long)ppd->pgd); > + > + ppd->vaddr = decrypted_start + wa->decrypted_base; > + ppd->vaddr_end = decrypted_end + 
wa->decrypted_base; > + sme_clear_pgd(ppd); > +} > + > void __init sme_encrypt_kernel(struct boot_params *bp) > { > struct sme_populate_pgd_data ppd; > struct sme_workarea_data wa; > > - if (!sme_active()) > + if (!mem_encrypt_active()) > return; > > build_workarea_map(bp, &wa, &ppd); > > - /* When SEV is active, encrypt kernel and initrd */ > - sme_encrypt_execute(wa.kernel_start, > - wa.kernel_start + wa.decrypted_base, > - wa.kernel_len, wa.workarea_start, > - (unsigned long)ppd.pgd); > - > - if (wa.initrd_len) > - sme_encrypt_execute(wa.initrd_start, > - wa.initrd_start + wa.decrypted_base, > - wa.initrd_len, wa.workarea_start, > + /* When SME is active, encrypt kernel and initrd */ > + if (sme_active()) { > + sme_encrypt_execute(wa.kernel_start, > + wa.kernel_start + wa.decrypted_base, > + wa.kernel_len, wa.workarea_start, > (unsigned long)ppd.pgd); > > + if (wa.initrd_len) > + sme_encrypt_execute(wa.initrd_start, > + wa.initrd_start + wa.decrypted_base, > + wa.initrd_len, wa.workarea_start, > + (unsigned long)ppd.pgd); > + } > + > + /* Decrypt the contents of .data..decrypted section */ > + decrypt_data_decrypted_section(&wa, &ppd); > + > remove_workarea_map(&wa, &ppd); > } > > -- > 2.7.4 > -- Regards/Gruss, Boris. SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg) --