On Tue, Apr 18, 2017 at 04:21:49PM -0500, Tom Lendacky wrote:
> Add the support to encrypt the kernel in-place. This is done by creating
> new page mappings for the kernel - a decrypted write-protected mapping
> and an encrypted mapping. The kernel is encrypted by copying it through
> a temporary buffer.
> 
> Signed-off-by: Tom Lendacky <thomas.lendacky@xxxxxxx>
> ---
>  arch/x86/include/asm/mem_encrypt.h |    6 +
>  arch/x86/mm/Makefile               |    2 
>  arch/x86/mm/mem_encrypt.c          |  262 ++++++++++++++++++++++++++++++++++++
>  arch/x86/mm/mem_encrypt_boot.S     |  151 +++++++++++++++++++++
>  4 files changed, 421 insertions(+)
>  create mode 100644 arch/x86/mm/mem_encrypt_boot.S
> 
> diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
> index b406df2..8f6f9b4 100644
> --- a/arch/x86/include/asm/mem_encrypt.h
> +++ b/arch/x86/include/asm/mem_encrypt.h
> @@ -31,6 +31,12 @@ static inline u64 sme_dma_mask(void)
>  	return ((u64)sme_me_mask << 1) - 1;
>  }
>  
> +void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr,
> +			 unsigned long decrypted_kernel_vaddr,
> +			 unsigned long kernel_len,
> +			 unsigned long encryption_wa,
> +			 unsigned long encryption_pgd);
> +
>  void __init sme_early_encrypt(resource_size_t paddr,
>  			      unsigned long size);
>  void __init sme_early_decrypt(resource_size_t paddr,
> diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
> index 9e13841..0633142 100644
> --- a/arch/x86/mm/Makefile
> +++ b/arch/x86/mm/Makefile
> @@ -38,3 +38,5 @@ obj-$(CONFIG_NUMA_EMU)		+= numa_emulation.o
>  obj-$(CONFIG_X86_INTEL_MPX)			+= mpx.o
>  obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS)	+= pkeys.o
>  obj-$(CONFIG_RANDOMIZE_MEMORY)			+= kaslr.o
> +
> +obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt_boot.o
> diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
> index 30b07a3..0ff41a4 100644
> --- a/arch/x86/mm/mem_encrypt.c
> +++ b/arch/x86/mm/mem_encrypt.c
> @@ -24,6 +24,7 @@
>  #include <asm/setup.h>
>  #include <asm/bootparam.h>
>  #include <asm/cacheflush.h>
> +#include <asm/sections.h>
>  
>  /*
>   * Since SME related variables are set early in the boot process they must
> @@ -216,8 +217,269 @@ void swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
>  	set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT);
>  }
>  
> +void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start,

static

> +			  unsigned long end)
> +{
> +	unsigned long addr = start;
> +	pgdval_t *pgd_p;
> +
> +	while (addr < end) {
> +		unsigned long pgd_end;
> +
> +		pgd_end = (addr & PGDIR_MASK) + PGDIR_SIZE;
> +		if (pgd_end > end)
> +			pgd_end = end;
> +
> +		pgd_p = (pgdval_t *)pgd_base + pgd_index(addr);
> +		*pgd_p = 0;

Hmm, so this is a contiguous range from [start:end] which translates to
8-byte PGD pointers in the PGD page, so you can simply memset that
range, no? Instead of iterating over each one?

> +
> +		addr = pgd_end;
> +	}
> +}
> +
> +#define PGD_FLAGS	_KERNPG_TABLE_NOENC
> +#define PUD_FLAGS	_KERNPG_TABLE_NOENC
> +#define PMD_FLAGS	(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
> +
> +static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,
> +				     unsigned long vaddr, pmdval_t pmd_val)
> +{
> +	pgdval_t pgd, *pgd_p;
> +	pudval_t pud, *pud_p;
> +	pmdval_t pmd, *pmd_p;

You should use the enclosing type, not the underlying one. I.e.,

	pgd_t *pgd;
	pud_t *pud;
	...

and then the macros native_p*d_val(), p*d_offset() and so on.

I say native_* because we don't want to have any paravirt nastiness
here. I believe your previous version was using the proper interfaces.
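IOW, something like this for the top-level step, maybe - completely
untested, just to illustrate the type usage, reusing the names from
your patch:

	pgd_t *pgd;
	pud_t *pud;

	pgd = pgd_base + pgd_index(vaddr);
	if (native_pgd_val(*pgd)) {
		/* Entry present - mask off the flags to get the PUD page */
		pud = (pud_t *)(native_pgd_val(*pgd) & ~PTE_FLAGS_MASK);
	} else {
		/* Carve a zeroed PUD page out of the pgtable_area workarea */
		pud = pgtable_area;
		memset(pud, 0, sizeof(*pud) * PTRS_PER_PUD);
		pgtable_area += sizeof(*pud) * PTRS_PER_PUD;

		native_set_pgd(pgd, native_make_pgd((pgdval_t)pud + PGD_FLAGS));
	}

and the same pattern for the PUD -> PMD step.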
And the kernel has gotten 5-level pagetables support in the meantime,
so this'll need to start at p4d, AFAICT.

arch/x86/mm/fault.c::dump_pagetable() looks like a good example to
stare at.

> +	pgd_p = (pgdval_t *)pgd_base + pgd_index(vaddr);
> +	pgd = *pgd_p;
> +	if (pgd) {
> +		pud_p = (pudval_t *)(pgd & ~PTE_FLAGS_MASK);
> +	} else {
> +		pud_p = pgtable_area;
> +		memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
> +		pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
> +
> +		*pgd_p = (pgdval_t)pud_p + PGD_FLAGS;
> +	}
> +
> +	pud_p += pud_index(vaddr);
> +	pud = *pud_p;
> +	if (pud) {
> +		if (pud & _PAGE_PSE)
> +			goto out;
> +
> +		pmd_p = (pmdval_t *)(pud & ~PTE_FLAGS_MASK);
> +	} else {
> +		pmd_p = pgtable_area;
> +		memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
> +		pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD;
> +
> +		*pud_p = (pudval_t)pmd_p + PUD_FLAGS;
> +	}
> +
> +	pmd_p += pmd_index(vaddr);
> +	pmd = *pmd_p;
> +	if (!pmd || !(pmd & _PAGE_PSE))
> +		*pmd_p = pmd_val;
> +
> +out:
> +	return pgtable_area;
> +}
> +
> +static unsigned long __init sme_pgtable_calc(unsigned long len)
> +{
> +	unsigned long pud_tables, pmd_tables;
> +	unsigned long total = 0;
> +
> +	/*
> +	 * Perform a relatively simplistic calculation of the pagetable
> +	 * entries that are needed. That mappings will be covered by 2MB
> +	 * PMD entries so we can conservatively calculate the required
> +	 * number of PUD and PMD structures needed to perform the mappings.
> +	 * Incrementing the count for each covers the case where the
> +	 * addresses cross entries.
> +	 */
> +	pud_tables = ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE;
> +	pud_tables++;
> +	pmd_tables = ALIGN(len, PUD_SIZE) / PUD_SIZE;
> +	pmd_tables++;
> +
> +	total += pud_tables * sizeof(pud_t) * PTRS_PER_PUD;
> +	total += pmd_tables * sizeof(pmd_t) * PTRS_PER_PMD;
> +
> +	/*
> +	 * Now calculate the added pagetable structures needed to populate
> +	 * the new pagetables.
> +	 */

Nice commenting, helps following what's going on.

> +	pud_tables = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
> +	pmd_tables = ALIGN(total, PUD_SIZE) / PUD_SIZE;
> +
> +	total += pud_tables * sizeof(pud_t) * PTRS_PER_PUD;
> +	total += pmd_tables * sizeof(pmd_t) * PTRS_PER_PMD;
> +
> +	return total;
> +}
> +
>  void __init sme_encrypt_kernel(void)
>  {
> +	pgd_t *pgd;
> +	void *pgtable_area;
> +	unsigned long kernel_start, kernel_end, kernel_len;
> +	unsigned long workarea_start, workarea_end, workarea_len;
> +	unsigned long execute_start, execute_end, execute_len;
> +	unsigned long pgtable_area_len;
> +	unsigned long decrypted_base;
> +	unsigned long paddr, pmd_flags;

Please sort function local variables declaration in a reverse christmas
tree order:

	<type> longest_variable_name;
	<type> shorter_var_name;
	<type> even_shorter;
	<type> i;

> +
> +	if (!sme_active())
> +		return;

...

> diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S
> new file mode 100644
> index 0000000..fb58f9f
> --- /dev/null
> +++ b/arch/x86/mm/mem_encrypt_boot.S
> @@ -0,0 +1,151 @@
> +/*
> + * AMD Memory Encryption Support
> + *
> + * Copyright (C) 2016 Advanced Micro Devices, Inc.
> + *
> + * Author: Tom Lendacky <thomas.lendacky@xxxxxxx>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/pgtable.h>
> +#include <asm/page.h>
> +#include <asm/processor-flags.h>
> +#include <asm/msr-index.h>
> +
> +	.text
> +	.code64
> +ENTRY(sme_encrypt_execute)
> +
> +	/*
> +	 * Entry parameters:
> +	 *   RDI - virtual address for the encrypted kernel mapping
> +	 *   RSI - virtual address for the decrypted kernel mapping
> +	 *   RDX - length of kernel
> +	 *   RCX - virtual address of the encryption workarea, including:
> +	 *     - stack page (PAGE_SIZE)
> +	 *     - encryption routine page (PAGE_SIZE)
> +	 *     - intermediate copy buffer (PMD_PAGE_SIZE)
> +	 *    R8 - physcial address of the pagetables to use for encryption
> +	 */
> +
> +	push	%rbp
> +	push	%r12
> +
> +	/* Set up a one page stack in the non-encrypted memory area */
> +	movq	%rsp, %rbp		/* Save current stack pointer */
> +	movq	%rcx, %rax		/* Workarea stack page */
> +	movq	%rax, %rsp		/* Set new stack pointer */
> +	addq	$PAGE_SIZE, %rsp	/* Stack grows from the bottom */
> +	addq	$PAGE_SIZE, %rax	/* Workarea encryption routine */
> +
> +	movq	%rdi, %r10		/* Encrypted kernel */
> +	movq	%rsi, %r11		/* Decrypted kernel */
> +	movq	%rdx, %r12		/* Kernel length */
> +
> +	/* Copy encryption routine into the workarea */
> +	movq	%rax, %rdi				/* Workarea encryption routine */
> +	leaq	.Lenc_start(%rip), %rsi			/* Encryption routine */
> +	movq	$(.Lenc_stop - .Lenc_start), %rcx	/* Encryption routine length */
> +	rep	movsb
> +
> +	/* Setup registers for call */
> +	movq	%r10, %rdi		/* Encrypted kernel */
> +	movq	%r11, %rsi		/* Decrypted kernel */
> +	movq	%r8, %rdx		/* Pagetables used for encryption */
> +	movq	%r12, %rcx		/* Kernel length */
> +	movq	%rax, %r8		/* Workarea encryption routine */
> +	addq	$PAGE_SIZE, %r8		/* Workarea intermediate copy buffer */
> +
> +	call	*%rax			/* Call the encryption routine */
> +
> +	movq	%rbp, %rsp		/* Restore original stack pointer */
> +
> +	pop	%r12
> +	pop	%rbp
> +
> +	ret
> +ENDPROC(sme_encrypt_execute)
> +
> +.Lenc_start:
> +ENTRY(sme_enc_routine)

A function called a "routine"?

Why do we need the global symbol? Nothing's referencing it AFAICT.

> +/*
> + * Routine used to encrypt kernel.
> + *   This routine must be run outside of the kernel proper since
> + *   the kernel will be encrypted during the process. So this
> + *   routine is defined here and then copied to an area outside
> + *   of the kernel where it will remain and run decrypted
> + *   during execution.
> + *
> + *   On entry the registers must be:
> + *     RDI - virtual address for the encrypted kernel mapping
> + *     RSI - virtual address for the decrypted kernel mapping
> + *     RDX - address of the pagetables to use for encryption
> + *     RCX - length of kernel
> + *      R8 - intermediate copy buffer
> + *
> + *     RAX - points to this routine
> + *
> + * The kernel will be encrypted by copying from the non-encrypted
> + * kernel space to an intermediate buffer and then copying from the
> + * intermediate buffer back to the encrypted kernel space. The physical
> + * addresses of the two kernel space mappings are the same which
> + * results in the kernel being encrypted "in place".
> + */
> +	/* Enable the new page tables */
> +	mov	%rdx, %cr3
> +
> +	/* Flush any global TLBs */
> +	mov	%cr4, %rdx
> +	andq	$~X86_CR4_PGE, %rdx
> +	mov	%rdx, %cr4
> +	orq	$X86_CR4_PGE, %rdx
> +	mov	%rdx, %cr4
> +
> +	/* Set the PAT register PA5 entry to write-protect */
> +	push	%rcx
> +	movl	$MSR_IA32_CR_PAT, %ecx
> +	rdmsr
> +	push	%rdx			/* Save original PAT value */
> +	andl	$0xffff00ff, %edx	/* Clear PA5 */
> +	orl	$0x00000500, %edx	/* Set PA5 to WP */

Maybe check first whether PA5 is already set correctly and avoid the
WRMSR and the restoring below too?

> +	wrmsr
> +	pop	%rdx			/* RDX contains original PAT value */
> +	pop	%rcx
> +
> +	movq	%rcx, %r9		/* Save kernel length */
> +	movq	%rdi, %r10		/* Save encrypted kernel address */
> +	movq	%rsi, %r11		/* Save decrypted kernel address */
> +
> +	wbinvd				/* Invalidate any cache entries */
> +
> +	/* Copy/encrypt 2MB at a time */
> +1:
> +	movq	%r11, %rsi		/* Source - decrypted kernel */
> +	movq	%r8, %rdi		/* Dest - intermediate copy buffer */
> +	movq	$PMD_PAGE_SIZE, %rcx	/* 2MB length */
> +	rep	movsb

not movsQ?

> +	movq	%r8, %rsi		/* Source - intermediate copy buffer */
> +	movq	%r10, %rdi		/* Dest - encrypted kernel */
> +	movq	$PMD_PAGE_SIZE, %rcx	/* 2MB length */
> +	rep	movsb
> +
> +	addq	$PMD_PAGE_SIZE, %r11
> +	addq	$PMD_PAGE_SIZE, %r10
> +	subq	$PMD_PAGE_SIZE, %r9	/* Kernel length decrement */
> +	jnz	1b			/* Kernel length not zero? */
> +
> +	/* Restore PAT register */
> +	push	%rdx			/* Save original PAT value */
> +	movl	$MSR_IA32_CR_PAT, %ecx
> +	rdmsr
> +	pop	%rdx			/* Restore original PAT value */
> +	wrmsr
> +
> +	ret
> +ENDPROC(sme_enc_routine)
> +.Lenc_stop:

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.