[PATCH 4/4] kvm: x86: mmu: Lockless access tracking for Intel CPUs without EPT A bits.

Junaid Shahid <junaids@xxxxxxxxxx> · Wed, 26 Oct 2016 19:19:45 -0700

This change implements lockless access tracking for Intel CPUs without EPT
A bits. This is achieved by marking the PTEs as not-present (but not
completely clearing them) when clear_flush_young() is called after marking
the pages as accessed. When an EPT Violation is generated as a result of
the VM accessing those pages, the PTEs are restored to their original values.

Signed-off-by: Junaid Shahid <junaids@xxxxxxxxxx>
---
 arch/x86/include/asm/vmx.h |  39 ++++++
 arch/x86/kvm/mmu.c         | 314 ++++++++++++++++++++++++++++++++++-----------
 arch/x86/kvm/mmu.h         |   2 +
 arch/x86/kvm/vmx.c         |  20 ++-
 4 files changed, 301 insertions(+), 74 deletions(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 60991fb..3d63098 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -434,6 +434,45 @@ enum vmcs_field {
 #define VMX_EPT_IPAT_BIT    			(1ull << 6)
 #define VMX_EPT_ACCESS_BIT				(1ull << 8)
 #define VMX_EPT_DIRTY_BIT				(1ull << 9)
+#define VMX_EPT_RWX_MASK                        (VMX_EPT_READABLE_MASK |       \
+						 VMX_EPT_WRITABLE_MASK |       \
+						 VMX_EPT_EXECUTABLE_MASK)
+#define VMX_EPT_MT_MASK				(7ull << VMX_EPT_MT_EPTE_SHIFT)
+
+/* The mask to use to trigger an EPT Misconfiguration in order to track MMIO */
+#define VMX_EPT_MISCONFIG_WX_VALUE		(VMX_EPT_WRITABLE_MASK |       \
+						 VMX_EPT_EXECUTABLE_MASK)
+
+/*
+ * The shift to use for saving the original RWX value when marking the PTE as
+ * not-present for tracking purposes.
+ */
+#define VMX_EPT_RWX_SAVE_SHIFT			52
+
+/*
+ * The shift/mask for determining the type of tracking (if any) being used for a
+ * not-present PTE. Currently, only two bits are used, but more can be added.
+ *
+ * NOTE: Bit 63 is an architecturally ignored bit (and hence can be used for our
+ *       purpose) when the EPT PTE is in a misconfigured state. However, it is
+ *       not necessarily an ignored bit otherwise (even in a not-present state).
+ *       Since the existing MMIO code already uses this bit and since KVM
+ *       doesn't use #VEs currently (where this bit comes into play), so we can
+ *       continue to use it for storing the type. But to be on the safe side,
+ *       we should not set it to 1 in those TRACK_TYPEs where the tracking is
+ *       done via EPT Violations instead of EPT Misconfigurations.
+ */
+#define VMX_EPT_TRACK_TYPE_SHIFT		62
+#define VMX_EPT_TRACK_TYPE_MASK			(3ull <<                       \
+						 VMX_EPT_TRACK_TYPE_SHIFT)
+
+/* Sets only bit 62 as the tracking is done by EPT Violations. See note above */
+#define VMX_EPT_TRACK_ACCESS			(1ull <<                       \
+						 VMX_EPT_TRACK_TYPE_SHIFT)
+/* Sets bits 62 and 63. See note above */
+#define VMX_EPT_TRACK_MMIO			(3ull <<                       \
+						 VMX_EPT_TRACK_TYPE_SHIFT)
+
 
 #define VMX_EPT_IDENTITY_PAGETABLE_ADDR		0xfffbc000ul
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a22a8a2..8ea1618 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -37,6 +37,7 @@
 #include <linux/srcu.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
+#include <linux/kern_levels.h>
 
 #include <asm/page.h>
 #include <asm/cmpxchg.h>
@@ -177,6 +178,10 @@ static u64 __read_mostly shadow_accessed_mask;
 static u64 __read_mostly shadow_dirty_mask;
 static u64 __read_mostly shadow_mmio_mask;
 static u64 __read_mostly shadow_present_mask;
+static u64 __read_mostly shadow_acc_track_mask;
+static u64 __read_mostly shadow_acc_track_value;
+static u64 __read_mostly shadow_acc_track_saved_bits_mask;
+static u64 __read_mostly shadow_acc_track_saved_bits_shift;
 
 static void mmu_spte_set(u64 *sptep, u64 spte);
 static void mmu_free_roots(struct kvm_vcpu *vcpu);
@@ -187,6 +192,26 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 
+void kvm_mmu_set_access_track_masks(u64 acc_track_mask, u64 acc_track_value,
+				    u64 saved_bits_mask, u64 saved_bits_shift)
+{
+	shadow_acc_track_mask = acc_track_mask;
+	shadow_acc_track_value = acc_track_value;
+	shadow_acc_track_saved_bits_mask = saved_bits_mask;
+	shadow_acc_track_saved_bits_shift = saved_bits_shift;
+
+	BUG_ON((~acc_track_mask & acc_track_value) != 0);
+	BUG_ON((~acc_track_mask & saved_bits_mask) != 0);
+	BUG_ON(shadow_accessed_mask != 0);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_access_track_masks);
+
+static inline bool is_access_track_spte(u64 spte)
+{
+	return shadow_acc_track_mask != 0 &&
+	       (spte & shadow_acc_track_mask) == shadow_acc_track_value;
+}
+
 /*
  * the low bit of the generation number is always presumed to be zero.
  * This disables mmio caching during memslot updates.  The concept is
@@ -292,9 +317,25 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 	shadow_nx_mask = nx_mask;
 	shadow_x_mask = x_mask;
 	shadow_present_mask = p_mask;
+	BUG_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0);
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
+void kvm_mmu_clear_all_pte_masks(void)
+{
+	shadow_user_mask = 0;
+	shadow_accessed_mask = 0;
+	shadow_dirty_mask = 0;
+	shadow_nx_mask = 0;
+	shadow_x_mask = 0;
+	shadow_mmio_mask = 0;
+	shadow_present_mask = 0;
+	shadow_acc_track_mask = 0;
+	shadow_acc_track_value = 0;
+	shadow_acc_track_saved_bits_mask = 0;
+	shadow_acc_track_saved_bits_shift = 0;
+}
+
 static int is_cpuid_PSE36(void)
 {
 	return 1;
@@ -307,7 +348,8 @@ static int is_nx(struct kvm_vcpu *vcpu)
 
 static int is_shadow_present_pte(u64 pte)
 {
-	return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte);
+	return ((pte & 0xFFFFFFFFull) && !is_mmio_spte(pte)) ||
+	       is_access_track_spte(pte);
 }
 
 static int is_large_pte(u64 pte)
@@ -490,6 +532,9 @@ static bool spte_has_volatile_bits(u64 spte)
 	if (spte_can_locklessly_be_made_writable(spte))
 		return true;
 
+	if (is_access_track_spte(spte))
+		return true;
+
 	if (!shadow_accessed_mask)
 		return false;
 
@@ -533,17 +578,21 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
  * will find a read-only spte, even though the writable spte
  * might be cached on a CPU's TLB, the return value indicates this
  * case.
+ *
+ * Returns true if the TLB needs to be flushed
  */
 static bool mmu_spte_update(u64 *sptep, u64 new_spte)
 {
 	u64 old_spte = *sptep;
-	bool ret = false;
+	bool flush = false;
+	bool writable_cleared;
+	bool acc_track_enabled;
 
 	WARN_ON(!is_shadow_present_pte(new_spte));
 
 	if (!is_shadow_present_pte(old_spte)) {
 		mmu_spte_set(sptep, new_spte);
-		return ret;
+		return flush;
 	}
 
 	if (!spte_has_volatile_bits(old_spte))
@@ -551,24 +600,16 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
 	else
 		old_spte = __update_clear_spte_slow(sptep, new_spte);
 
+	BUG_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
+
 	/*
 	 * For the spte updated out of mmu-lock is safe, since
 	 * we always atomically update it, see the comments in
 	 * spte_has_volatile_bits().
 	 */
 	if (spte_can_locklessly_be_made_writable(old_spte) &&
-	      !is_writable_pte(new_spte))
-		ret = true;
-
-	if (!shadow_accessed_mask) {
-		/*
-		 * We don't set page dirty when dropping non-writable spte.
-		 * So do it now if the new spte is becoming non-writable.
-		 */
-		if (ret)
-			kvm_set_pfn_dirty(spte_to_pfn(old_spte));
-		return ret;
-	}
+	    !is_writable_pte(new_spte))
+		flush = true;
 
 	/*
 	 * Flush TLB when accessed/dirty bits are changed in the page tables,
@@ -576,20 +617,34 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
 	 */
 	if (spte_is_bit_changed(old_spte, new_spte,
                                 shadow_accessed_mask | shadow_dirty_mask))
-		ret = true;
+		flush = true;
 
-	if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
+	writable_cleared = is_writable_pte(old_spte) &&
+			   !is_writable_pte(new_spte);
+	acc_track_enabled = !is_access_track_spte(old_spte) &&
+			    is_access_track_spte(new_spte);
+
+	if (writable_cleared || acc_track_enabled)
+		flush = true;
+
+	if (shadow_accessed_mask ?
+	    spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask) :
+	    acc_track_enabled)
 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
-	if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
+
+	if (shadow_dirty_mask ?
+	    spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask) :
+	    writable_cleared)
 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
 
-	return ret;
+	return flush;
 }
 
 /*
  * Rules for using mmu_spte_clear_track_bits:
  * It sets the sptep from present to nonpresent, and track the
  * state bits, it is used to clear the last level sptep.
+ * Returns non-zero if the PTE was previously valid.
  */
 static int mmu_spte_clear_track_bits(u64 *sptep)
 {
@@ -604,6 +659,13 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
 	if (!is_shadow_present_pte(old_spte))
 		return 0;
 
+	/*
+	 * For access tracking SPTEs, the pfn was already marked accessed/dirty
+	 * when the SPTE was marked for access tracking, so nothing to do here.
+	 */
+	if (is_access_track_spte(old_spte))
+		return 1;
+
 	pfn = spte_to_pfn(old_spte);
 
 	/*
@@ -618,6 +680,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
 	if (old_spte & (shadow_dirty_mask ? shadow_dirty_mask :
 					    PT_WRITABLE_MASK))
 		kvm_set_pfn_dirty(pfn);
+
 	return 1;
 }
 
@@ -636,6 +699,52 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
 	return __get_spte_lockless(sptep);
 }
 
+static u64 mark_spte_for_access_track(u64 spte)
+{
+	if (shadow_acc_track_mask == 0)
+		return spte;
+
+	/*
+	 * Verify that the write-protection that we do below will be fixable
+	 * via the fast page fault path. Currently, that is always the case, at
+	 * least when using EPT (which is when access tracking would be used).
+	 */
+	WARN_ONCE((spte & PT_WRITABLE_MASK) &&
+		  !spte_can_locklessly_be_made_writable(spte),
+		  "Writable SPTE is not locklessly dirty-trackable\n");
+
+	/*
+	 * Any PTE marked for access tracking should also be marked for dirty
+	 * tracking (by being non-writable)
+	 */
+	spte &= ~PT_WRITABLE_MASK;
+
+	spte &= ~(shadow_acc_track_saved_bits_mask <<
+		  shadow_acc_track_saved_bits_shift);
+	spte |= (spte & shadow_acc_track_saved_bits_mask) <<
+		shadow_acc_track_saved_bits_shift;
+	spte &= ~shadow_acc_track_mask;
+	spte |= shadow_acc_track_value;
+
+	return spte;
+}
+
+/* Returns true if the TLB needs to be flushed */
+static bool mmu_spte_enable_access_track(u64 *sptep)
+{
+	u64 spte = mmu_spte_get_lockless(sptep);
+
+	if (is_access_track_spte(spte))
+		return false;
+
+	/* Access tracking should not be enabled if CPU supports A/D bits */
+	BUG_ON(shadow_accessed_mask != 0);
+
+	spte = mark_spte_for_access_track(spte);
+
+	return mmu_spte_update(sptep, spte);
+}
+
 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
 {
 	/*
@@ -1403,6 +1512,25 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 	return kvm_zap_rmapp(kvm, rmap_head);
 }
 
+static int kvm_acc_track_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+			       struct kvm_memory_slot *slot, gfn_t gfn,
+			       int level, unsigned long data)
+{
+	u64 *sptep;
+	struct rmap_iterator iter;
+	int need_tlb_flush = 0;
+
+	for_each_rmap_spte(rmap_head, &iter, sptep) {
+
+		rmap_printk("kvm_acc_track_rmapp: spte %p %llx gfn %llx (%d)\n",
+			    sptep, *sptep, gfn, level);
+
+		need_tlb_flush |= mmu_spte_enable_access_track(sptep);
+	}
+
+	return need_tlb_flush;
+}
+
 static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 			     struct kvm_memory_slot *slot, gfn_t gfn, int level,
 			     unsigned long data)
@@ -1419,8 +1547,9 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 
 restart:
 	for_each_rmap_spte(rmap_head, &iter, sptep) {
+
 		rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
-			     sptep, *sptep, gfn, level);
+			    sptep, *sptep, gfn, level);
 
 		need_flush = 1;
 
@@ -1435,6 +1564,8 @@ restart:
 			new_spte &= ~SPTE_HOST_WRITEABLE;
 			new_spte &= ~shadow_accessed_mask;
 
+			new_spte = mark_spte_for_access_track(new_spte);
+
 			mmu_spte_clear_track_bits(sptep);
 			mmu_spte_set(sptep, new_spte);
 		}
@@ -1615,24 +1746,14 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
-	int young = 0;
-
-	/*
-	 * If there's no access bit in the secondary pte set by the
-	 * hardware it's up to gup-fast/gup to set the access bit in
-	 * the primary pte or in the page structure.
-	 */
-	if (!shadow_accessed_mask)
-		goto out;
 
 	for_each_rmap_spte(rmap_head, &iter, sptep) {
-		if (*sptep & shadow_accessed_mask) {
-			young = 1;
-			break;
-		}
+		if ((*sptep & shadow_accessed_mask) ||
+		    (!shadow_accessed_mask && !is_access_track_spte(*sptep)))
+			return 1;
 	}
-out:
-	return young;
+
+	return 0;
 }
 
 #define RMAP_RECYCLE_THRESHOLD 1000
@@ -1669,7 +1790,9 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
 		 */
 		kvm->mmu_notifier_seq++;
 		return kvm_handle_hva_range(kvm, start, end, 0,
-					    kvm_unmap_rmapp);
+					    shadow_acc_track_mask != 0
+					    ? kvm_acc_track_rmapp
+					    : kvm_unmap_rmapp);
 	}
 
 	return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
@@ -2591,6 +2714,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		spte |= shadow_dirty_mask;
 	}
 
+	if (speculative)
+		spte = mark_spte_for_access_track(spte);
+
 set_pte:
 	if (mmu_spte_update(sptep, spte))
 		kvm_flush_remote_tlbs(vcpu->kvm);
@@ -2644,7 +2770,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
 	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
 	pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
 		 is_large_pte(*sptep)? "2MB" : "4kB",
-		 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
+		 *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn,
 		 *sptep, sptep);
 	if (!was_rmapped && is_large_pte(*sptep))
 		++vcpu->kvm->stat.lpages;
@@ -2877,16 +3003,27 @@ static bool page_fault_can_be_fast(u32 error_code)
 	if (unlikely(error_code & PFERR_RSVD_MASK))
 		return false;
 
-	/*
-	 * #PF can be fast only if the shadow page table is present and it
-	 * is caused by write-protect, that means we just need change the
-	 * W bit of the spte which can be done out of mmu-lock.
-	 */
-	if (!(error_code & PFERR_PRESENT_MASK) ||
-	      !(error_code & PFERR_WRITE_MASK))
+	/* See if the page fault is due to an NX violation */
+	if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
+		      == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
 		return false;
 
-	return true;
+	/*
+	 * #PF can be fast if:
+	 * 1. The shadow page table entry is not present, which could mean that
+	 *    the fault is potentially caused by access tracking (if enabled).
+	 * 2. The shadow page table entry is present and the fault
+	 *    is caused by write-protect, that means we just need change the W
+	 *    bit of the spte which can be done out of mmu-lock.
+	 *
+	 * However, if Access Tracking is disabled, then the first condition
+	 * above cannot be handled by the fast path. So if access tracking is
+	 * disabled, we return true only if the second condition is met.
+	 */
+
+	return shadow_acc_track_mask != 0 ||
+	       ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
+		== (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
 }
 
 /*
@@ -2895,17 +3032,24 @@ static bool page_fault_can_be_fast(u32 error_code)
  */
 static bool
 fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-			u64 *sptep, u64 spte)
+			u64 *sptep, u64 old_spte,
+			bool remove_write_prot, bool remove_acc_track)
 {
 	gfn_t gfn;
+	u64 new_spte = old_spte;
 
 	WARN_ON(!sp->role.direct);
 
-	/*
-	 * The gfn of direct spte is stable since it is calculated
-	 * by sp->gfn.
-	 */
-	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+	if (remove_acc_track) {
+		u64 saved_bits = old_spte & (shadow_acc_track_saved_bits_mask <<
+					     shadow_acc_track_saved_bits_shift);
+
+		new_spte &= ~shadow_acc_track_mask;
+		new_spte |= saved_bits >> shadow_acc_track_saved_bits_shift;
+	}
+
+	if (remove_write_prot)
+		new_spte |= PT_WRITABLE_MASK;
 
 	/*
 	 * Theoretically we could also set dirty bit (and flush TLB) here in
@@ -2919,10 +3063,17 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	 *
 	 * Compare with set_spte where instead shadow_dirty_mask is set.
 	 */
-	if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) != spte)
+	if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
 		return false;
 
-	kvm_vcpu_mark_page_dirty(vcpu, gfn);
+	if (remove_write_prot) {
+		/*
+		 * The gfn of direct spte is stable since it is
+		 * calculated by sp->gfn.
+		 */
+		gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+		kvm_vcpu_mark_page_dirty(vcpu, gfn);
+	}
 
 	return true;
 }
@@ -2937,7 +3088,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
 {
 	struct kvm_shadow_walk_iterator iterator;
 	struct kvm_mmu_page *sp;
-	bool ret = false;
+	bool fault_handled = false;
 	u64 spte = 0ull;
 	uint retry_count = 0;
 
@@ -2953,36 +3104,43 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
 			break;
 
 	do {
-		/*
-		 * If the mapping has been changed, let the vcpu fault on the
-		 * same address again.
-		 */
-		if (!is_shadow_present_pte(spte)) {
-			ret = true;
-			break;
-		}
+		bool remove_write_prot = (error_code & PFERR_WRITE_MASK) &&
+					 !(spte & PT_WRITABLE_MASK);
+		bool remove_acc_track;
+		bool valid_exec_access = (error_code & PFERR_FETCH_MASK) &&
+					 (spte & shadow_x_mask);
 
 		sp = page_header(__pa(iterator.sptep));
 		if (!is_last_spte(spte, sp->role.level))
 			break;
 
 		/*
-		 * Check if it is a spurious fault caused by TLB lazily flushed.
+		 * Check whether the memory access that caused the fault would
+		 * still cause it if it were to be performed right now. If not,
+		 * then this is a spurious fault caused by TLB lazily flushed,
+		 * or some other CPU has already fixed the PTE after the
+		 * current CPU took the fault.
+		 *
+		 * If Write-Only mappings ever become supported, then the
+		 * condition below would need to be changed appropriately.
 		 *
 		 * Need not check the access of upper level table entries since
 		 * they are always ACC_ALL.
 		 */
-		if (is_writable_pte(spte)) {
-			ret = true;
+		if (((spte & PT_PRESENT_MASK) && !remove_write_prot) ||
+		    valid_exec_access) {
+			fault_handled = true;
 			break;
 		}
 
+		remove_acc_track = is_access_track_spte(spte);
+
 		/*
-		 * Currently, to simplify the code, only the spte
-		 * write-protected by dirty-log can be fast fixed.
+		 * Currently, to simplify the code, write-protection can be
+		 * removed in the fast path only if the SPTE was write-protected
+		 * for dirty-logging.
 		 */
-		if (!spte_can_locklessly_be_made_writable(spte))
-			break;
+		remove_write_prot &= spte_can_locklessly_be_made_writable(spte);
 
 		/*
 		 * Do not fix write-permission on the large spte since we only
@@ -2998,13 +3156,20 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
 		if (sp->role.level > PT_PAGE_TABLE_LEVEL)
 			break;
 
+		/* Verify that the fault can be handled in the fast path */
+		if (!remove_acc_track && !remove_write_prot)
+			break;
+
 		/*
 		 * Currently, fast page fault only works for direct mapping
 		 * since the gfn is not stable for indirect shadow page. See
 		 * Documentation/virtual/kvm/locking.txt to get more detail.
 		 */
-		ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte);
-		if (ret)
+		fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
+							iterator.sptep, spte,
+							remove_write_prot,
+							remove_acc_track);
+		if (fault_handled)
 			break;
 
 		if (++retry_count > 4) {
@@ -3018,10 +3183,10 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
 	} while (true);
 
 	trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
-			      spte, ret);
+			      spte, fault_handled);
 	walk_shadow_page_lockless_end(vcpu);
 
-	return ret;
+	return fault_handled;
 }
 
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
@@ -4300,6 +4465,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
 	vcpu->arch.mmu.update_pte(vcpu, sp, spte, new);
 }
 
+/* This is only supposed to be used for non-EPT mappings */
 static bool need_remote_flush(u64 old, u64 new)
 {
 	if (!is_shadow_present_pte(old))
@@ -5067,6 +5233,8 @@ static void mmu_destroy_caches(void)
 
 int kvm_mmu_module_init(void)
 {
+	kvm_mmu_clear_all_pte_masks();
+
 	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
 					    sizeof(struct pte_list_desc),
 					    0, 0, NULL);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index ddc56e9..dfd3056 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -52,6 +52,8 @@ static inline u64 rsvd_bits(int s, int e)
 }
 
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
+void kvm_mmu_set_access_track_masks(u64 acc_track_mask, u64 acc_track_value,
+				    u64 saved_bits_mask, u64 saved_bits_shift);
 
 void
 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 859da8e..9cbfc56 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5019,7 +5019,22 @@ static void ept_set_mmio_spte_mask(void)
 	 * Also, magic bits (0x3ull << 62) is set to quickly identify mmio
 	 * spte.
 	 */
-	kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull);
+	kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE |
+				   VMX_EPT_TRACK_MMIO);
+}
+
+static void ept_set_acc_track_spte_mask(void)
+{
+	/*
+	 * For access track PTEs we use a non-present PTE to trigger an EPT
+	 * Violation. The original RWX value is saved in some unused bits in
+	 * the PTE and restored when the violation is fixed.
+	 */
+	kvm_mmu_set_access_track_masks(VMX_EPT_RWX_MASK |
+				       VMX_EPT_TRACK_TYPE_MASK,
+				       VMX_EPT_TRACK_ACCESS,
+				       VMX_EPT_RWX_MASK,
+				       VMX_EPT_RWX_SAVE_SHIFT);
 }
 
 #define VMX_XSS_EXIT_BITMAP 0
@@ -6549,6 +6564,9 @@ static __init int hardware_setup(void)
 				      0ull : VMX_EPT_READABLE_MASK);
 		ept_set_mmio_spte_mask();
 		kvm_enable_tdp();
+
+		if (!enable_ept_ad_bits)
+			ept_set_acc_track_spte_mask();
 	} else
 		kvm_disable_tdp();
 
-- 
2.8.0.rc3.226.g39d4020

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html