Hi Boris,
On 03/17/2017 05:17 AM, Borislav Petkov wrote:
On Thu, Mar 16, 2017 at 11:25:36PM +0100, Paolo Bonzini wrote:
The kvmclock memory is initially zero so there is no need for the
hypervisor to allocate anything; the point of these patches is just to
access the data in a natural way from Linux source code.
I realize that.
I also don't really like the patch as is (plus it fails modpost), but
IMO reusing __change_page_attr and __split_large_page is the right thing
to do.
Right, so teaching pageattr.c about memblock could theoretically come
around and bite us later when a page allocated with memblock gets freed
with free_page().
And looking at this more, we have all this kernel pagetable preparation
code down the init_mem_mapping() call and the pagetable setup in
arch/x86/mm/init_{32,64}.c
And that code even does some basic page splitting. Oh and it uses
alloc_low_pages() which knows whether to do memblock reservation or the
common __get_free_pages() when slabs are up.
I looked into arch/x86/mm/init_{32,64}.c and, as you pointed out, the files contain
routines to do basic page splitting. I think that is sufficient for our usage.
I should be able to drop the memblock patch from the series and update
Patch 15 [1] to use kernel_physical_mapping_init().
kernel_physical_mapping_init() creates the page table mapping using
default KERNEL_PAGE attributes. I tried to extend the function by passing
a 'bool enc' flag to hint whether to clear or set _PAGE_ENC when splitting the
pages, but the code did not look clean, hence I dropped that idea. Instead,
I took the approach below. I did some runtime tests and it seems to be working okay.
[1] http://marc.info/?l=linux-mm&m=148846773731212&w=2
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 7df5f4c..de16ef4 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -15,6 +15,7 @@
#include <linux/mm.h>
#include <linux/dma-mapping.h>
#include <linux/swiotlb.h>
+#include <linux/mem_encrypt.h>
#include <asm/tlbflush.h>
#include <asm/fixmap.h>
@@ -22,6 +23,8 @@
#include <asm/bootparam.h>
#include <asm/cacheflush.h>
+#include "mm_internal.h"
+
extern pmdval_t early_pmd_flags;
int __init __early_make_pgtable(unsigned long, pmdval_t);
void __init __early_pgtable_flush(void);
@@ -258,6 +261,72 @@ static void sme_free(struct device *dev, size_t size, void *vaddr,
swiotlb_free_coherent(dev, size, vaddr, dma_handle);
}
+/*
+ * Walk the kernel mapping of [paddr, paddr + size) and set or clear the
+ * memory encryption attribute (_PAGE_ENC, the C-bit) on each 4K PTE,
+ * splitting large pages as needed.  Early boot only.
+ *
+ * Returns 0 on success, 1 if any address in the range has no mapping.
+ */
+static int __init early_set_memory_enc_dec(resource_size_t paddr,
+					   unsigned long size, bool enc)
+{
+	pte_t *kpte;
+	unsigned int level;	/* lookup_address() takes unsigned int * */
+	unsigned long vaddr, vaddr_end, vaddr_next;
+
+	vaddr = (unsigned long)__va(paddr);
+	vaddr_next = vaddr;
+	vaddr_end = vaddr + size;
+
+	/*
+	 * The C-bit of the mapping is about to change.  Flush the caches
+	 * first so that any dirty data cached under the old C-bit setting
+	 * reaches memory before the attribute change takes effect.  Any
+	 * caching of vaddr after this function returns will use the new
+	 * C-bit value.
+	 */
+	clflush_cache_range(__va(paddr), size);
+
+	for (; vaddr < vaddr_end; vaddr = vaddr_next) {
+		kpte = lookup_address(vaddr, &level);
+		if (!kpte || pte_none(*kpte))
+			return 1;
+
+		if (level == PG_LEVEL_4K) {
+			pte_t new_pte;
+			unsigned long pfn = pte_pfn(*kpte);
+			pgprot_t new_prot = pte_pgprot(*kpte);
+
+			if (enc)
+				pgprot_val(new_prot) |= _PAGE_ENC;
+			else
+				pgprot_val(new_prot) &= ~_PAGE_ENC;
+
+			new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
+			pr_debug(" pte 0x%016lx -> 0x%016lx\n",
+				 pte_val(*kpte), pte_val(new_pte));
+			set_pte_atomic(kpte, new_pte);
+			vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE;
+			continue;
+		}
+
+		/*
+		 * The virtual address is part of a large page; re-create
+		 * the page table mapping for just the PMD covering it
+		 * using smaller (4K) pages.  Splitting only this PMD
+		 * avoids touching mappings beyond the requested range.
+		 * Do not advance vaddr: the next iteration re-examines
+		 * the same address, now at PG_LEVEL_4K, and applies the
+		 * C-bit change to the newly created PTEs.
+		 */
+		kernel_physical_mapping_init(__pa(vaddr & PMD_MASK),
+					     __pa((vaddr & PMD_MASK) + PMD_SIZE),
+					     0);
+		vaddr_next = vaddr;
+	}
+
+	__flush_tlb_all();
+	return 0;
+}
+
+/*
+ * Clear _PAGE_ENC (the C-bit) in the kernel mapping of the physical
+ * range [paddr, paddr + size), i.e. mark it decrypted.  Early boot only.
+ * Returns 0 on success, non-zero if part of the range is unmapped.
+ */
+int __init early_set_memory_decrypted(resource_size_t paddr, unsigned long size)
+{
+ return early_set_memory_enc_dec(paddr, size, false);
+}
+
+/*
+ * Set _PAGE_ENC (the C-bit) in the kernel mapping of the physical
+ * range [paddr, paddr + size), i.e. mark it encrypted.  Early boot only.
+ * Returns 0 on success, non-zero if part of the range is unmapped.
+ */
+int __init early_set_memory_encrypted(resource_size_t paddr, unsigned long size)
+{
+ return early_set_memory_enc_dec(paddr, size, true);
+}
+
So what would be much cleaner, IMHO, is if one would reuse that code to
change init_mm.pgd mappings early without copying pageattr.c.
init_mem_mapping() gets called before kvm_guest_init() in setup_arch()
so the guest would simply fixup its pagetable right there.