When the existing KVM MMU code creates a shadow page table, it assumes it
has the normal x86 page table format. This is obviously correct for normal
shadow page tables, and also correct for AMD's NPT.

Unfortunately, Intel's EPT page tables differ in subtle ways from ordinary
page tables, so when we create a shadow EPT table (i.e., in nested EPT), we
need to slightly modify the way in which this table is built.

In particular, when mmu.c's link_shadow_page() creates non-leaf page table
entries, it used to enable the "present", "accessed", "writable" and "user"
flags on these entries. While this is correct for ordinary page tables, it
is wrong in EPT tables - where these bits actually have completely different
meanings (compare PT_*_MASK from mmu.h to VMX_EPT_*_MASK from vmx.h). In
particular, leaving the code as-is causes bit 5 of the PTE to be turned on
(supposedly for PT_ACCESSED_MASK), which is a reserved bit in EPT and causes
an "EPT Misconfiguration" failure.

So we must move link_shadow_page()'s list of extra bits to a new mmu context
field, which is set differently for nested EPT.

Signed-off-by: Nadav Har'El <nyh@xxxxxxxxxx>
---
 arch/x86/include/asm/kvm_host.h |    1 +
 arch/x86/kvm/mmu.c              |   16 +++++++++-------
 arch/x86/kvm/paging_tmpl.h      |    6 ++++--
 arch/x86/kvm/vmx.c              |    3 +++
 4 files changed, 17 insertions(+), 9 deletions(-)

--- .before/arch/x86/include/asm/kvm_host.h	2011-11-10 11:33:59.000000000 +0200
+++ .after/arch/x86/include/asm/kvm_host.h	2011-11-10 11:33:59.000000000 +0200
@@ -287,6 +287,7 @@ struct kvm_mmu {
 	bool nx;
 
 	u64 pdptrs[4]; /* pae */
+	u64 link_shadow_page_set_bits;
 };
 
 struct kvm_vcpu_arch {
--- .before/arch/x86/kvm/vmx.c	2011-11-10 11:33:59.000000000 +0200
+++ .after/arch/x86/kvm/vmx.c	2011-11-10 11:33:59.000000000 +0200
@@ -6485,6 +6485,9 @@ static int nested_ept_init_mmu_context(s
 	vcpu->arch.mmu.get_pdptr         = nested_ept_get_pdptr;
 	vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
 	vcpu->arch.mmu.shadow_root_level = get_ept_level();
+	vcpu->arch.mmu.link_shadow_page_set_bits =
+		VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
+		VMX_EPT_EXECUTABLE_MASK;
 
 	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
 
--- .before/arch/x86/kvm/mmu.c	2011-11-10 11:33:59.000000000 +0200
+++ .after/arch/x86/kvm/mmu.c	2011-11-10 11:33:59.000000000 +0200
@@ -1782,14 +1782,9 @@ static void shadow_walk_next(struct kvm_
 	return __shadow_walk_next(iterator, *iterator->sptep);
 }
 
-static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
+static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, u64 set_bits)
 {
-	u64 spte;
-
-	spte = __pa(sp->spt)
-		| PT_PRESENT_MASK | PT_ACCESSED_MASK
-		| PT_WRITABLE_MASK | PT_USER_MASK;
-	mmu_spte_set(sptep, spte);
+	mmu_spte_set(sptep, __pa(sp->spt) | set_bits);
 }
 
 static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
@@ -3366,6 +3361,13 @@ int kvm_init_shadow_mmu(struct kvm_vcpu
 	vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
 	vcpu->arch.mmu.base_role.smep_andnot_wp
 		= smep && !is_write_protection(vcpu);
+	/*
+	 * link_shadow_page() should apply these bits in shadow page tables,
+	 * and in shadow NPT tables (nested NPT). For nested EPT, different
+	 * bits apply.
+	 */
+	vcpu->arch.mmu.link_shadow_page_set_bits = PT_PRESENT_MASK |
+		PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
 
 	return r;
 }
--- .before/arch/x86/kvm/paging_tmpl.h	2011-11-10 11:33:59.000000000 +0200
+++ .after/arch/x86/kvm/paging_tmpl.h	2011-11-10 11:33:59.000000000 +0200
@@ -515,7 +515,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
 			goto out_gpte_changed;
 
 		if (sp)
-			link_shadow_page(it.sptep, sp);
+			link_shadow_page(it.sptep, sp,
+				vcpu->arch.mmu.link_shadow_page_set_bits);
 	}
 
 	for (;
@@ -535,7 +536,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
 
 		sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
 				      true, direct_access, it.sptep);
-		link_shadow_page(it.sptep, sp);
+		link_shadow_page(it.sptep, sp,
+			vcpu->arch.mmu.link_shadow_page_set_bits);
 	}
 
 	clear_sp_write_flooding_count(it.sptep);
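P.S. For anyone comparing the two entry formats, the collision is easy to
see in a tiny user-space sketch (illustrative only, not part of the patch).
The PT_* and VMX_EPT_* values below are copied from mmu.h and vmx.h; the
EPT_NONLEAF_RESERVED mask is my reading of the SDM's non-leaf EPT entry
format (bits 5:3 reserved), so treat that one as an assumption:

#include <stdint.h>
#include <stdio.h>

/* x86 page-table bits, as in arch/x86/kvm/mmu.h */
#define PT_PRESENT_MASK		(1ULL << 0)
#define PT_WRITABLE_MASK	(1ULL << 1)
#define PT_USER_MASK		(1ULL << 2)
#define PT_ACCESSED_MASK	(1ULL << 5)

/* EPT bits, as in arch/x86/include/asm/vmx.h */
#define VMX_EPT_READABLE_MASK	0x1ULL
#define VMX_EPT_WRITABLE_MASK	0x2ULL
#define VMX_EPT_EXECUTABLE_MASK	0x4ULL

/* Assumption: bits 5:3 of a non-leaf EPT entry are reserved. */
#define EPT_NONLEAF_RESERVED	(0x7ULL << 3)

int main(void)
{
	/* what link_shadow_page() used to hard-code for every table */
	uint64_t x86_bits = PT_PRESENT_MASK | PT_ACCESSED_MASK |
			    PT_WRITABLE_MASK | PT_USER_MASK;
	/* what a non-leaf shadow EPT entry actually needs */
	uint64_t ept_bits = VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
			    VMX_EPT_EXECUTABLE_MASK;

	printf("x86 bits: 0x%llx\n", (unsigned long long)x86_bits);
	printf("EPT bits: 0x%llx\n", (unsigned long long)ept_bits);

	/* bit 5 (PT_ACCESSED_MASK) falls inside the EPT reserved range */
	if (x86_bits & EPT_NONLEAF_RESERVED)
		printf("reserved EPT bit set -> EPT misconfiguration\n");
	return 0;
}

On the design choice: stashing the bits in a per-context mmu field keeps
link_shadow_page() format-agnostic, so nested_ept_init_mmu_context() can
override the value without touching the common shadow-walker code.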