On Thu, Feb 16, 2017 at 09:48:08AM -0600, Tom Lendacky wrote:
> This patch adds the support to encrypt the kernel in-place. This is
> done by creating new page mappings for the kernel - a decrypted
> write-protected mapping and an encrypted mapping. The kernel is encyrpted

s/encyrpted/encrypted/

> by copying the kernel through a temporary buffer.

"... by copying it... "

>
> Signed-off-by: Tom Lendacky <thomas.lendacky@xxxxxxx>
> ---

...

> +ENTRY(sme_encrypt_execute)
> +
> +#ifdef CONFIG_AMD_MEM_ENCRYPT
> +	/*
> +	 * Entry parameters:
> +	 *   RDI - virtual address for the encrypted kernel mapping
> +	 *   RSI - virtual address for the decrypted kernel mapping
> +	 *   RDX - length of kernel
> +	 *   RCX - address of the encryption workarea, including:
> +	 *     - stack page (PAGE_SIZE)
> +	 *     - encryption routine page (PAGE_SIZE)
> +	 *     - intermediate copy buffer (PMD_PAGE_SIZE)
> +	 *   R8 - address of the pagetables to use for encryption
> +	 */
> +
> +	/* Set up a one page stack in the non-encrypted memory area */
> +	movq	%rcx, %rax
> +	addq	$PAGE_SIZE, %rax
> +	movq	%rsp, %rbp

%rbp is callee-saved and you're overwriting it here. You need to push it
first.

> +	movq	%rax, %rsp
> +	push	%rbp
> +
> +	push	%r12
> +	push	%r13

In general, just do all pushes on function entry and the pops on exit,
like the compiler does.

> +	movq	%rdi, %r10
> +	movq	%rsi, %r11
> +	movq	%rdx, %r12
> +	movq	%rcx, %r13
> +
> +	/* Copy encryption routine into the workarea */
> +	movq	%rax, %rdi
> +	leaq	.Lencrypt_start(%rip), %rsi
> +	movq	$(.Lencrypt_stop - .Lencrypt_start), %rcx
> +	rep	movsb
> +
> +	/* Setup registers for call */
> +	movq	%r10, %rdi
> +	movq	%r11, %rsi
> +	movq	%r8, %rdx
> +	movq	%r12, %rcx
> +	movq	%rax, %r8
> +	addq	$PAGE_SIZE, %r8
> +
> +	/* Call the encryption routine */
> +	call	*%rax
> +
> +	pop	%r13
> +	pop	%r12
> +
> +	pop	%rsp			/* Restore original stack pointer */
> +.Lencrypt_exit:

Please put side comments like this here:

ENTRY(sme_encrypt_execute)

#ifdef CONFIG_AMD_MEM_ENCRYPT
	/*
	 * Entry parameters:
	 *   RDI - virtual address for the encrypted kernel mapping
	 *   RSI - virtual address for the decrypted kernel mapping
	 *   RDX - length of kernel
	 *   RCX - address of the encryption workarea
	 *     - stack page (PAGE_SIZE)
	 *     - encryption routine page (PAGE_SIZE)
	 *     - intermediate copy buffer (PMD_PAGE_SIZE)
	 *   R8 - address of the pagetables to use for encryption
	 */

	/* Set up a one page stack in the non-encrypted memory area */
	movq	%rcx, %rax		# %rax = workarea
	addq	$PAGE_SIZE, %rax	# %rax += 4096
	movq	%rsp, %rbp		# stash stack ptr
	movq	%rax, %rsp		# set new stack
	push	%rbp			# needs to happen before the mov %rsp, %rbp

	push	%r12
	push	%r13

	movq	%rdi, %r10		# encrypted kernel
	movq	%rsi, %r11		# decrypted kernel
	movq	%rdx, %r12		# kernel length
	movq	%rcx, %r13		# workarea

	...

and so on.

...

> diff --git a/arch/x86/kernel/mem_encrypt_init.c b/arch/x86/kernel/mem_encrypt_init.c
> index 25af15d..07cbb90 100644
> --- a/arch/x86/kernel/mem_encrypt_init.c
> +++ b/arch/x86/kernel/mem_encrypt_init.c
> @@ -16,9 +16,200 @@
>  #ifdef CONFIG_AMD_MEM_ENCRYPT
>
>  #include <linux/mem_encrypt.h>
> +#include <linux/mm.h>
> +
> +#include <asm/sections.h>
> +
> +extern void sme_encrypt_execute(unsigned long, unsigned long, unsigned long,
> +				void *, pgd_t *);

This belongs into mem_encrypt.h. And I think it already came up: please
use names for those params.
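Something like this, for example (the parameter names are mine, pick
whatever fits):

	/* arch/x86/include/asm/mem_encrypt.h */
	extern void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr,
					unsigned long decrypted_kernel_vaddr,
					unsigned long kernel_len,
					void *workarea, pgd_t *pgd);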
> +
> +#define PGD_FLAGS	_KERNPG_TABLE_NOENC
> +#define PUD_FLAGS	_KERNPG_TABLE_NOENC
> +#define PMD_FLAGS	__PAGE_KERNEL_LARGE_EXEC
> +
> +static void __init *sme_pgtable_entry(pgd_t *pgd, void *next_page,
> +				      void *vaddr, pmdval_t pmd_val)
> +{

sme_populate() or so sounds better.

> +	pud_t *pud;
> +	pmd_t *pmd;
> +
> +	pgd += pgd_index((unsigned long)vaddr);
> +	if (pgd_none(*pgd)) {
> +		pud = next_page;
> +		memset(pud, 0, sizeof(*pud) * PTRS_PER_PUD);
> +		native_set_pgd(pgd,
> +			       native_make_pgd((unsigned long)pud + PGD_FLAGS));

Let it stick out, no need for those "stairs" in the vertical alignment :)

> +		next_page += sizeof(*pud) * PTRS_PER_PUD;
> +	} else {
> +		pud = (pud_t *)(native_pgd_val(*pgd) & ~PTE_FLAGS_MASK);
> +	}
> +
> +	pud += pud_index((unsigned long)vaddr);
> +	if (pud_none(*pud)) {
> +		pmd = next_page;
> +		memset(pmd, 0, sizeof(*pmd) * PTRS_PER_PMD);
> +		native_set_pud(pud,
> +			       native_make_pud((unsigned long)pmd + PUD_FLAGS));
> +		next_page += sizeof(*pmd) * PTRS_PER_PMD;
> +	} else {
> +		pmd = (pmd_t *)(native_pud_val(*pud) & ~PTE_FLAGS_MASK);
> +	}
> +
> +	pmd += pmd_index((unsigned long)vaddr);
> +	if (pmd_none(*pmd) || !pmd_large(*pmd))
> +		native_set_pmd(pmd, native_make_pmd(pmd_val));
> +
> +	return next_page;
> +}
> +
> +static unsigned long __init sme_pgtable_calc(unsigned long start,
> +					     unsigned long end)
> +{
> +	unsigned long addr, total;
> +
> +	total = 0;
> +	addr = start;
> +	while (addr < end) {
> +		unsigned long pgd_end;
> +
> +		pgd_end = (addr & PGDIR_MASK) + PGDIR_SIZE;
> +		if (pgd_end > end)
> +			pgd_end = end;
> +
> +		total += sizeof(pud_t) * PTRS_PER_PUD * 2;
> +
> +		while (addr < pgd_end) {
> +			unsigned long pud_end;
> +
> +			pud_end = (addr & PUD_MASK) + PUD_SIZE;
> +			if (pud_end > end)
> +				pud_end = end;
> +
> +			total += sizeof(pmd_t) * PTRS_PER_PMD * 2;

That "* 2" is?

> +
> +			addr = pud_end;

So

			addr += PUD_SIZE;

?

> +		}
> +
> +		addr = pgd_end;

So

		addr += PGDIR_SIZE;

?

> +	}
> +
> +	total += sizeof(pgd_t) * PTRS_PER_PGD;
> +
> +	return total;
> +}
>
>  void __init sme_encrypt_kernel(void)
>  {
> +	pgd_t *pgd;
> +	void *workarea, *next_page, *vaddr;
> +	unsigned long kern_start, kern_end, kern_len;
> +	unsigned long index, paddr, pmd_flags;
> +	unsigned long exec_size, full_size;
> +
> +	/* If SME is not active then no need to prepare */

That comment is obvious.

> +	if (!sme_active())
> +		return;
> +
> +	/* Set the workarea to be after the kernel */
> +	workarea = (void *)ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);
> +
> +	/*
> +	 * Prepare for encrypting the kernel by building new pagetables with
> +	 * the necessary attributes needed to encrypt the kernel in place.
> +	 *
> +	 * One range of virtual addresses will map the memory occupied
> +	 * by the kernel as encrypted.
> +	 *
> +	 * Another range of virtual addresses will map the memory occupied
> +	 * by the kernel as decrypted and write-protected.
> +	 *
> +	 * The use of write-protect attribute will prevent any of the
> +	 * memory from being cached.
> +	 */
> +
> +	/* Physical address gives us the identity mapped virtual address */
> +	kern_start = __pa_symbol(_text);
> +	kern_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE) - 1;

So

	kern_end = (unsigned long)workarea - 1;

?

Also, you can make that workarea be unsigned long and cast it to void *
only when needed so that you don't need to cast it in here for the
calculations.
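I.e., something like this (untested, only to illustrate):

	unsigned long workarea;

	/* Set the workarea to be after the kernel */
	workarea = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);

	/* Physical address gives us the identity mapped virtual address */
	kern_start = __pa_symbol(_text);
	kern_end = workarea - 1;

and then you cast workarea to void * only at the call sites which
actually need a pointer.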
> +	kern_len = kern_end - kern_start + 1;
> +
> +	/*
> +	 * Calculate required number of workarea bytes needed:
> +	 *   executable encryption area size:
> +	 *     stack page (PAGE_SIZE)
> +	 *     encryption routine page (PAGE_SIZE)
> +	 *     intermediate copy buffer (PMD_PAGE_SIZE)
> +	 *   pagetable structures for workarea (in case not currently mapped)
> +	 *   pagetable structures for the encryption of the kernel
> +	 */
> +	exec_size = (PAGE_SIZE * 2) + PMD_PAGE_SIZE;
> +
> +	full_size = exec_size;
> +	full_size += ALIGN(exec_size, PMD_PAGE_SIZE) / PMD_PAGE_SIZE *
> +		     sizeof(pmd_t) * PTRS_PER_PMD;
> +	full_size += sme_pgtable_calc(kern_start, kern_end + exec_size);
> +
> +	next_page = workarea + exec_size;

So next_page is the next free page after the workarea, correct?

Because of all things, *that* certainly needs a comment. It took me a
while to decipher what's going on here and I'm still not 100% clear.

> +	/* Make sure the current pagetables have entries for the workarea */
> +	pgd = (pgd_t *)native_read_cr3();
> +	paddr = (unsigned long)workarea;
> +	while (paddr < (unsigned long)workarea + full_size) {
> +		vaddr = (void *)paddr;
> +		next_page = sme_pgtable_entry(pgd, next_page, vaddr,
> +					      paddr + PMD_FLAGS);
> +
> +		paddr += PMD_PAGE_SIZE;
> +	}
> +	native_write_cr3(native_read_cr3());

Why not

	native_write_cr3((unsigned long)pgd);

?

Now you can actually acknowledge that the code block in between changed
the hierarchy in pgd and you're reloading it.

> +	/* Calculate a PGD index to be used for the decrypted mapping */
> +	index = (pgd_index(kern_end + full_size) + 1) & (PTRS_PER_PGD - 1);
> +	index <<= PGDIR_SHIFT;

So call it decrypt_mapping_pgd or so. index doesn't say anything. Also,
move it right above where it is being used. This function is very hard
to follow as it is.

> +	/* Set and clear the PGD */

This needs more text: we're building a new temporary pagetable which
will have A, B and C mapped into it and blablabla...

> +	pgd = next_page;
> +	memset(pgd, 0, sizeof(*pgd) * PTRS_PER_PGD);
> +	next_page += sizeof(*pgd) * PTRS_PER_PGD;
> +
> +	/* Add encrypted (identity) mappings for the kernel */
> +	pmd_flags = PMD_FLAGS | _PAGE_ENC;
> +	paddr = kern_start;
> +	while (paddr < kern_end) {
> +		vaddr = (void *)paddr;
> +		next_page = sme_pgtable_entry(pgd, next_page, vaddr,
> +					      paddr + pmd_flags);
> +
> +		paddr += PMD_PAGE_SIZE;
> +	}
> +
> +	/* Add decrypted (non-identity) mappings for the kernel */
> +	pmd_flags = (PMD_FLAGS & ~_PAGE_CACHE_MASK) | (_PAGE_PAT | _PAGE_PWT);
> +	paddr = kern_start;
> +	while (paddr < kern_end) {
> +		vaddr = (void *)(paddr + index);
> +		next_page = sme_pgtable_entry(pgd, next_page, vaddr,
> +					      paddr + pmd_flags);
> +
> +		paddr += PMD_PAGE_SIZE;
> +	}
> +
> +	/* Add the workarea to both mappings */
> +	paddr = kern_end + 1;

	paddr = (unsigned long)workarea;

Now this makes sense when I read the comment above it.

> +	while (paddr < (kern_end + exec_size)) {

... which actually wants that exec_size to be called workarea_size.
Then it'll make more sense.

And then the thing above:

	next_page = workarea + exec_size;

would look like:

	next_page = workarea + workarea_size;

which would make even more sense. And since you have stuff called
_start and _end, you can do:

	next_page = workarea_start + workarea_size;

and now it would make most sense. Eva! :-)
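IOW, pulled together, the whole thing would then read something like
this (completely untested, just to show the naming):

	unsigned long workarea_start, workarea_size;

	/* Set the workarea to be after the kernel */
	workarea_start = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);

	/* stack page + encryption routine page + copy buffer */
	workarea_size = (PAGE_SIZE * 2) + PMD_PAGE_SIZE;

	/* Next free page after the executable workarea */
	next_page = (void *)(workarea_start + workarea_size);

	...

	/* Add the workarea to both mappings */
	paddr = workarea_start;
	while (paddr < workarea_start + workarea_size) {
		...
	}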
> +		vaddr = (void *)paddr;
> +		next_page = sme_pgtable_entry(pgd, next_page, vaddr,
> +					      paddr + PMD_FLAGS);
> +
> +		vaddr = (void *)(paddr + index);
> +		next_page = sme_pgtable_entry(pgd, next_page, vaddr,
> +					      paddr + PMD_FLAGS);
> +
> +		paddr += PMD_PAGE_SIZE;
> +	}
> +
> +	/* Perform the encryption */
> +	sme_encrypt_execute(kern_start, kern_start + index, kern_len,
> +			    workarea, pgd);
> +

Phew, that's one tough patch to review. I'd like to review it again in
your next submission.

--
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.