On Tue, Apr 18, 2017 at 04:21:49PM -0500, Tom Lendacky wrote:
> Add the support to encrypt the kernel in-place. This is done by creating
> new page mappings for the kernel - a decrypted write-protected mapping
> and an encrypted mapping. The kernel is encrypted by copying it through
> a temporary buffer.
> 
> Signed-off-by: Tom Lendacky <thomas.lendacky@xxxxxxx>
> ---
>  arch/x86/include/asm/mem_encrypt.h |    6 +
>  arch/x86/mm/Makefile               |    2 
>  arch/x86/mm/mem_encrypt.c          |  262 ++++++++++++++++++++++++++++++++++++
>  arch/x86/mm/mem_encrypt_boot.S     |  151 +++++++++++++++++++++
>  4 files changed, 421 insertions(+)
>  create mode 100644 arch/x86/mm/mem_encrypt_boot.S
> 
> diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
> index b406df2..8f6f9b4 100644
> --- a/arch/x86/include/asm/mem_encrypt.h
> +++ b/arch/x86/include/asm/mem_encrypt.h
> @@ -31,6 +31,12 @@ static inline u64 sme_dma_mask(void)
>  	return ((u64)sme_me_mask << 1) - 1;
>  }
>  
> +void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr,
> +			 unsigned long decrypted_kernel_vaddr,
> +			 unsigned long kernel_len,
> +			 unsigned long encryption_wa,
> +			 unsigned long encryption_pgd);
> +
>  void __init sme_early_encrypt(resource_size_t paddr,
>  			      unsigned long size);
>  void __init sme_early_decrypt(resource_size_t paddr,
> diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
> index 9e13841..0633142 100644
> --- a/arch/x86/mm/Makefile
> +++ b/arch/x86/mm/Makefile
> @@ -38,3 +38,5 @@ obj-$(CONFIG_NUMA_EMU)		+= numa_emulation.o
>  obj-$(CONFIG_X86_INTEL_MPX)			+= mpx.o
>  obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS)	+= pkeys.o
>  obj-$(CONFIG_RANDOMIZE_MEMORY)			+= kaslr.o
> +
> +obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt_boot.o
> diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
> index 30b07a3..0ff41a4 100644
> --- a/arch/x86/mm/mem_encrypt.c
> +++ b/arch/x86/mm/mem_encrypt.c
> @@ -24,6 +24,7 @@
>  #include <asm/setup.h>
>  #include <asm/bootparam.h>
>  #include <asm/cacheflush.h>
> +#include <asm/sections.h>
>  
>  /*
>   * Since SME related variables are set early in the boot process they must
> @@ -216,8 +217,269 @@ void swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
>  	set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT);
>  }
>  
> +void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start,

static

> +			  unsigned long end)
> +{
> +	unsigned long addr = start;
> +	pgdval_t *pgd_p;
> +
> +	while (addr < end) {
> +		unsigned long pgd_end;
> +
> +		pgd_end = (addr & PGDIR_MASK) + PGDIR_SIZE;
> +		if (pgd_end > end)
> +			pgd_end = end;
> +
> +		pgd_p = (pgdval_t *)pgd_base + pgd_index(addr);
> +		*pgd_p = 0;

Hmm, so this is a contiguous range from [start:end] which translates to
8-byte PGD pointers in the PGD page, so you can simply memset that
range, no? Instead of iterating over each one?

> +
> +		addr = pgd_end;
> +	}
> +}
> +
> +#define PGD_FLAGS	_KERNPG_TABLE_NOENC
> +#define PUD_FLAGS	_KERNPG_TABLE_NOENC
> +#define PMD_FLAGS	(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
> +
> +static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,
> +				     unsigned long vaddr, pmdval_t pmd_val)
> +{
> +	pgdval_t pgd, *pgd_p;
> +	pudval_t pud, *pud_p;
> +	pmdval_t pmd, *pmd_p;

You should use the enclosing type, not the underlying one. I.e.,

	pgd_t *pgd;
	pud_t *pud;
	...

and then the macros native_p*d_val(), p*d_offset() and so on.

I say native_* because we don't want to have any paravirt nastiness
here. I believe your previous version was using the proper interfaces.
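IOW, something like this for the top-level step, maybe - completely
untested, just to illustrate the type usage, reusing the names from
your patch:

	pgd_t *pgd;
	pud_t *pud;

	pgd = pgd_base + pgd_index(vaddr);
	if (native_pgd_val(*pgd)) {
		/* Entry present - mask off the flags to get the PUD page */
		pud = (pud_t *)(native_pgd_val(*pgd) & ~PTE_FLAGS_MASK);
	} else {
		/* Carve a zeroed PUD page out of the pgtable_area workarea */
		pud = pgtable_area;
		memset(pud, 0, sizeof(*pud) * PTRS_PER_PUD);
		pgtable_area += sizeof(*pud) * PTRS_PER_PUD;

		native_set_pgd(pgd, native_make_pgd((pgdval_t)pud + PGD_FLAGS));
	}

and the same pattern for the PUD -> PMD step.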
And the kernel has gotten 5-level pagetables support in the meantime,
so this'll need to start at p4d, AFAICT.

arch/x86/mm/fault.c::dump_pagetable() looks like a good example to
stare at.

> +	pgd_p = (pgdval_t *)pgd_base + pgd_index(vaddr);
> +	pgd = *pgd_p;
> +	if (pgd) {
> +		pud_p = (pudval_t *)(pgd & ~PTE_FLAGS_MASK);
> +	} else {
> +		pud_p = pgtable_area;
> +		memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
> +		pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
> +
> +		*pgd_p = (pgdval_t)pud_p + PGD_FLAGS;
> +	}
> +
> +	pud_p += pud_index(vaddr);
> +	pud = *pud_p;
> +	if (pud) {
> +		if (pud & _PAGE_PSE)
> +			goto out;
> +
> +		pmd_p = (pmdval_t *)(pud & ~PTE_FLAGS_MASK);
> +	} else {
> +		pmd_p = pgtable_area;
> +		memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
> +		pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD;
> +
> +		*pud_p = (pudval_t)pmd_p + PUD_FLAGS;
> +	}
> +
> +	pmd_p += pmd_index(vaddr);
> +	pmd = *pmd_p;
> +	if (!pmd || !(pmd & _PAGE_PSE))
> +		*pmd_p = pmd_val;
> +
> +out:
> +	return pgtable_area;
> +}
> +
> +static unsigned long __init sme_pgtable_calc(unsigned long len)
> +{
> +	unsigned long pud_tables, pmd_tables;
> +	unsigned long total = 0;
> +
> +	/*
> +	 * Perform a relatively simplistic calculation of the pagetable
> +	 * entries that are needed. That mappings will be covered by 2MB
> +	 * PMD entries so we can conservatively calculate the required
> +	 * number of PUD and PMD structures needed to perform the mappings.
> +	 * Incrementing the count for each covers the case where the
> +	 * addresses cross entries.
> +	 */
> +	pud_tables = ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE;
> +	pud_tables++;
> +	pmd_tables = ALIGN(len, PUD_SIZE) / PUD_SIZE;
> +	pmd_tables++;
> +
> +	total += pud_tables * sizeof(pud_t) * PTRS_PER_PUD;
> +	total += pmd_tables * sizeof(pmd_t) * PTRS_PER_PMD;
> +
> +	/*
> +	 * Now calculate the added pagetable structures needed to populate
> +	 * the new pagetables.
> +	 */

Nice commenting, helps following what's going on.

> +	pud_tables = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
> +	pmd_tables = ALIGN(total, PUD_SIZE) / PUD_SIZE;
> +
> +	total += pud_tables * sizeof(pud_t) * PTRS_PER_PUD;
> +	total += pmd_tables * sizeof(pmd_t) * PTRS_PER_PMD;
> +
> +	return total;
> +}
> +
>  void __init sme_encrypt_kernel(void)
>  {
> +	pgd_t *pgd;
> +	void *pgtable_area;
> +	unsigned long kernel_start, kernel_end, kernel_len;
> +	unsigned long workarea_start, workarea_end, workarea_len;
> +	unsigned long execute_start, execute_end, execute_len;
> +	unsigned long pgtable_area_len;
> +	unsigned long decrypted_base;
> +	unsigned long paddr, pmd_flags;

Please sort function local variables declaration in a reverse christmas
tree order:

	<type> longest_variable_name;
	<type> shorter_var_name;
	<type> even_shorter;
	<type> i;

> +
> +	if (!sme_active())
> +		return;

...

> diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S
> new file mode 100644
> index 0000000..fb58f9f
> --- /dev/null
> +++ b/arch/x86/mm/mem_encrypt_boot.S
> @@ -0,0 +1,151 @@
> +/*
> + * AMD Memory Encryption Support
> + *
> + * Copyright (C) 2016 Advanced Micro Devices, Inc.
> + *
> + * Author: Tom Lendacky <thomas.lendacky@xxxxxxx>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/pgtable.h>
> +#include <asm/page.h>
> +#include <asm/processor-flags.h>
> +#include <asm/msr-index.h>
> +
> +	.text
> +	.code64
> +ENTRY(sme_encrypt_execute)
> +
> +	/*
> +	 * Entry parameters:
> +	 *   RDI - virtual address for the encrypted kernel mapping
> +	 *   RSI - virtual address for the decrypted kernel mapping
> +	 *   RDX - length of kernel
> +	 *   RCX - virtual address of the encryption workarea, including:
> +	 *     - stack page (PAGE_SIZE)
> +	 *     - encryption routine page (PAGE_SIZE)
> +	 *     - intermediate copy buffer (PMD_PAGE_SIZE)
> +	 *    R8 - physcial address of the pagetables to use for encryption
> +	 */
> +
> +	push	%rbp
> +	push	%r12
> +
> +	/* Set up a one page stack in the non-encrypted memory area */
> +	movq	%rsp, %rbp		/* Save current stack pointer */
> +	movq	%rcx, %rax		/* Workarea stack page */
> +	movq	%rax, %rsp		/* Set new stack pointer */
> +	addq	$PAGE_SIZE, %rsp	/* Stack grows from the bottom */
> +	addq	$PAGE_SIZE, %rax	/* Workarea encryption routine */
> +
> +	movq	%rdi, %r10		/* Encrypted kernel */
> +	movq	%rsi, %r11		/* Decrypted kernel */
> +	movq	%rdx, %r12		/* Kernel length */
> +
> +	/* Copy encryption routine into the workarea */
> +	movq	%rax, %rdi				/* Workarea encryption routine */
> +	leaq	.Lenc_start(%rip), %rsi			/* Encryption routine */
> +	movq	$(.Lenc_stop - .Lenc_start), %rcx	/* Encryption routine length */
> +	rep	movsb
> +
> +	/* Setup registers for call */
> +	movq	%r10, %rdi		/* Encrypted kernel */
> +	movq	%r11, %rsi		/* Decrypted kernel */
> +	movq	%r8, %rdx		/* Pagetables used for encryption */
> +	movq	%r12, %rcx		/* Kernel length */
> +	movq	%rax, %r8		/* Workarea encryption routine */
> +	addq	$PAGE_SIZE, %r8		/* Workarea intermediate copy buffer */
> +
> +	call	*%rax			/* Call the encryption routine */
> +
> +	movq	%rbp, %rsp		/* Restore original stack pointer */
> +
> +	pop	%r12
> +	pop	%rbp
> +
> +	ret
> +ENDPROC(sme_encrypt_execute)
> +
> +.Lenc_start:
> +ENTRY(sme_enc_routine)

A function called a "routine"?

Why do we need the global symbol? Nothing's referencing it AFAICT.

> +/*
> + * Routine used to encrypt kernel.
> + *   This routine must be run outside of the kernel proper since
> + *   the kernel will be encrypted during the process. So this
> + *   routine is defined here and then copied to an area outside
> + *   of the kernel where it will remain and run decrypted
> + *   during execution.
> + *
> + *   On entry the registers must be:
> + *     RDI - virtual address for the encrypted kernel mapping
> + *     RSI - virtual address for the decrypted kernel mapping
> + *     RDX - address of the pagetables to use for encryption
> + *     RCX - length of kernel
> + *      R8 - intermediate copy buffer
> + *
> + *     RAX - points to this routine
> + *
> + * The kernel will be encrypted by copying from the non-encrypted
> + * kernel space to an intermediate buffer and then copying from the
> + * intermediate buffer back to the encrypted kernel space. The physical
> + * addresses of the two kernel space mappings are the same which
> + * results in the kernel being encrypted "in place".
> + */
> +	/* Enable the new page tables */
> +	mov	%rdx, %cr3
> +
> +	/* Flush any global TLBs */
> +	mov	%cr4, %rdx
> +	andq	$~X86_CR4_PGE, %rdx
> +	mov	%rdx, %cr4
> +	orq	$X86_CR4_PGE, %rdx
> +	mov	%rdx, %cr4
> +
> +	/* Set the PAT register PA5 entry to write-protect */
> +	push	%rcx
> +	movl	$MSR_IA32_CR_PAT, %ecx
> +	rdmsr
> +	push	%rdx			/* Save original PAT value */
> +	andl	$0xffff00ff, %edx	/* Clear PA5 */
> +	orl	$0x00000500, %edx	/* Set PA5 to WP */

Maybe check first whether PA5 is already set correctly and avoid the
WRMSR and the restoring below too?

> +	wrmsr
> +	pop	%rdx			/* RDX contains original PAT value */
> +	pop	%rcx
> +
> +	movq	%rcx, %r9		/* Save kernel length */
> +	movq	%rdi, %r10		/* Save encrypted kernel address */
> +	movq	%rsi, %r11		/* Save decrypted kernel address */
> +
> +	wbinvd				/* Invalidate any cache entries */
> +
> +	/* Copy/encrypt 2MB at a time */
> +1:
> +	movq	%r11, %rsi		/* Source - decrypted kernel */
> +	movq	%r8, %rdi		/* Dest - intermediate copy buffer */
> +	movq	$PMD_PAGE_SIZE, %rcx	/* 2MB length */
> +	rep	movsb

not movsQ?

> +	movq	%r8, %rsi		/* Source - intermediate copy buffer */
> +	movq	%r10, %rdi		/* Dest - encrypted kernel */
> +	movq	$PMD_PAGE_SIZE, %rcx	/* 2MB length */
> +	rep	movsb
> +
> +	addq	$PMD_PAGE_SIZE, %r11
> +	addq	$PMD_PAGE_SIZE, %r10
> +	subq	$PMD_PAGE_SIZE, %r9	/* Kernel length decrement */
> +	jnz	1b			/* Kernel length not zero? */
> +
> +	/* Restore PAT register */
> +	push	%rdx			/* Save original PAT value */
> +	movl	$MSR_IA32_CR_PAT, %ecx
> +	rdmsr
> +	pop	%rdx			/* Restore original PAT value */
> +	wrmsr
> +
> +	ret
> +ENDPROC(sme_enc_routine)
> +.Lenc_stop:

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.