[RFC PATCH 22/28] kvm: mmu: Implement access tracking for the direct MMU

Add functions for handling the accessed state of PTEs that can operate
with the direct MMU. kvm_age_hva() and kvm_test_age_hva() now cover
direct-MMU mappings in addition to the rmap-based shadow MMU paths.
When the hardware provides no accessed bit, aging a PTE saves its
permission bits in software-available high bits and clears the live
permissions, so the next guest access faults and restores them, which
records the access. An illustrative sketch of this encoding follows
the sign-off.

Signed-off-by: Ben Gardon <bgardon@xxxxxxxxxx>
---
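Note for reviewers: below is a small, self-contained sketch (not part
of the patch) of the access-tracking encoding that
save_pte_permissions_for_access_track() builds on when the hardware
sets no accessed bit. The constants and helper names are invented for
illustration; the kernel configures the real masks at init time (via
kvm_mmu_set_mask_ptes()), and the restore step happens in the fault
path rather than through a standalone helper.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

/*
 * Invented example layout, loosely modeled on EPT without A/D bits:
 * the live R/W/X permissions sit in bits 0-2, and three software-
 * available bits at SAVED_SHIFT hold the saved copy while the PTE is
 * access-tracked. These values are illustrative only.
 */
#define PTE_RWX_MASK	0x7ULL			/* live R/W/X permissions */
#define SAVED_SHIFT	54			/* software-available bits */
#define ACC_TRACK_MASK	PTE_RWX_MASK		/* cleared while tracking */
#define ACC_TRACK_VALUE	0ULL			/* tracked PTEs have RWX == 0 */

/*
 * Stash the permissions in the saved bits and clear the live copy, so
 * the next guest access faults; in KVM the fault handler would then
 * restore the PTE and thereby record that the page was accessed.
 */
static u64 mark_for_access_track(u64 pte)
{
	pte |= (pte & PTE_RWX_MASK) << SAVED_SHIFT;
	pte &= ~ACC_TRACK_MASK;
	return pte | ACC_TRACK_VALUE;
}

static int is_access_tracked(u64 pte)
{
	/* A real check also requires the PTE to be shadow-present. */
	return (pte & ACC_TRACK_MASK) == ACC_TRACK_VALUE;
}

/* Undo the marking: pull the saved permissions back into the live bits. */
static u64 restore_from_access_track(u64 pte)
{
	u64 saved = (pte >> SAVED_SHIFT) & PTE_RWX_MASK;

	pte &= ~(PTE_RWX_MASK << SAVED_SHIFT);
	return pte | saved;
}

int main(void)
{
	u64 pte = 0x123000ULL | PTE_RWX_MASK;	/* pfn bits plus R/W/X */
	u64 tracked = mark_for_access_track(pte);

	printf("tracked:  0x%llx (tracked=%d)\n",
	       (unsigned long long)tracked, is_access_tracked(tracked));
	printf("restored: 0x%llx (round-trips: %d)\n",
	       (unsigned long long)restore_from_access_track(tracked),
	       restore_from_access_track(tracked) == pte);
	return 0;
}

In this patch, age_direct_gfn_range() is the producer side of the
scheme: it either clears the hardware accessed bit or performs the
mark step sketched above.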
 arch/x86/kvm/mmu.c  | 153 +++++++++++++++++++++++++++++++++++++++++---
 virt/kvm/kvm_main.c |   7 +-
 2 files changed, 150 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b144c803c36d2..cc81ba5ee46d6 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -779,6 +779,17 @@ static bool spte_has_volatile_bits(u64 spte)
 	return false;
 }
 
+static bool is_accessed_direct_pte(u64 pte, int level)
+{
+	if (!is_last_spte(pte, level))
+		return false;
+
+	if (shadow_accessed_mask)
+		return pte & shadow_accessed_mask;
+
+	return pte & shadow_acc_track_mask;
+}
+
 static bool is_accessed_spte(u64 spte)
 {
 	u64 accessed_mask = spte_shadow_accessed_mask(spte);
@@ -929,6 +940,14 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
 	return __get_spte_lockless(sptep);
 }
 
+static u64 save_pte_permissions_for_access_track(u64 pte)
+{
+	pte |= (pte & shadow_acc_track_saved_bits_mask) <<
+		shadow_acc_track_saved_bits_shift;
+	pte &= ~shadow_acc_track_mask;
+	return pte;
+}
+
 static u64 mark_spte_for_access_track(u64 spte)
 {
 	if (spte_ad_enabled(spte))
@@ -944,16 +963,13 @@ static u64 mark_spte_for_access_track(u64 spte)
 	 */
 	WARN_ONCE((spte & PT_WRITABLE_MASK) &&
 		  !spte_can_locklessly_be_made_writable(spte),
-		  "kvm: Writable SPTE is not locklessly dirty-trackable\n");
+		  "kvm: Writable PTE is not locklessly dirty-trackable\n");
 
 	WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
 			  shadow_acc_track_saved_bits_shift),
 		  "kvm: Access Tracking saved bit locations are not zero\n");
 
-	spte |= (spte & shadow_acc_track_saved_bits_mask) <<
-		shadow_acc_track_saved_bits_shift;
-	spte &= ~shadow_acc_track_mask;
-
+	spte = save_pte_permissions_for_access_track(spte);
 	return spte;
 }
 
@@ -1718,6 +1734,15 @@ static void free_pt_rcu_callback(struct rcu_head *rp)
 	free_page((unsigned long)disconnected_pt);
 }
 
+static void handle_changed_pte_acc_track(u64 old_pte, u64 new_pte, int level)
+{
+	bool pfn_changed = spte_to_pfn(old_pte) != spte_to_pfn(new_pte);
+
+	if (is_accessed_direct_pte(old_pte, level) &&
+	    (!is_accessed_direct_pte(new_pte, level) || pfn_changed))
+		kvm_set_pfn_accessed(spte_to_pfn(old_pte));
+}
+
 /*
  * Takes a snapshot of, and clears, the direct MMU disconnected pt list. Once
  * TLBs have been flushed, this snapshot can be transferred to the direct MMU
@@ -1847,6 +1872,7 @@ static void mark_pte_disconnected(struct kvm *kvm, int as_id, gfn_t gfn,
 
 	handle_changed_pte(kvm, as_id, gfn, old_pte, DISCONNECTED_PTE, level,
 			   vm_teardown, disconnected_pts);
+	handle_changed_pte_acc_track(old_pte, DISCONNECTED_PTE, level);
 }
 
 /**
@@ -2412,8 +2438,8 @@ static bool cmpxchg_pte(u64 *ptep, u64 old_pte, u64 new_pte, int level, u64 gfn)
 	return r == old_pte;
 }
 
-static bool direct_walk_iterator_set_pte(struct direct_walk_iterator *iter,
-					 u64 new_pte)
+static bool direct_walk_iterator_set_pte_raw(struct direct_walk_iterator *iter,
+					 u64 new_pte, bool handle_acc_track)
 {
 	bool r;
 
@@ -2435,6 +2461,10 @@ static bool direct_walk_iterator_set_pte(struct direct_walk_iterator *iter,
 				   iter->old_pte, new_pte, iter->level, false,
 				   &iter->disconnected_pts);
 
+		if (handle_acc_track)
+			handle_changed_pte_acc_track(iter->old_pte, new_pte,
+						     iter->level);
+
 		if (iter->lock_mode & (MMU_WRITE_LOCK | MMU_READ_LOCK))
 			iter->tlbs_dirty++;
 	} else
@@ -2443,6 +2473,18 @@ static bool direct_walk_iterator_set_pte(struct direct_walk_iterator *iter,
 	return r;
 }
 
+static bool direct_walk_iterator_set_pte_no_acc_track(
+		struct direct_walk_iterator *iter, u64 new_pte)
+{
+	return direct_walk_iterator_set_pte_raw(iter, new_pte, false);
+}
+
+static bool direct_walk_iterator_set_pte(struct direct_walk_iterator *iter,
+					 u64 new_pte)
+{
+	return direct_walk_iterator_set_pte_raw(iter, new_pte, true);
+}
+
 static u64 generate_nonleaf_pte(u64 *child_pt, bool ad_disabled)
 {
 	u64 pte;
@@ -2965,14 +3007,107 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 			KVM_PAGES_PER_HPAGE(sp->role.level));
 }
 
+static int age_direct_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
+				 gfn_t start, gfn_t end, unsigned long ignored)
+{
+	struct direct_walk_iterator iter;
+	int young = 0;
+	u64 new_pte = 0;
+
+	direct_walk_iterator_setup_walk(&iter, kvm, slot->as_id, start, end,
+					MMU_WRITE_LOCK);
+	while (direct_walk_iterator_next_present_leaf_pte(&iter)) {
+		/*
+		 * If we have a non-accessed entry we don't need to change the
+		 * pte.
+		 */
+		if (!is_accessed_direct_pte(iter.old_pte, iter.level))
+			continue;
+
+		if (shadow_accessed_mask)
+			new_pte = iter.old_pte & ~shadow_accessed_mask;
+		else {
+			new_pte = save_pte_permissions_for_access_track(
+					iter.old_pte);
+			new_pte |= shadow_acc_track_value;
+		}
+
+		/*
+		 * We've created a new pte with the accessed state cleared.
+		 * Warn if we're about to put in a pte that still looks
+		 * accessed.
+		 */
+		WARN_ON(is_accessed_direct_pte(new_pte, iter.level));
+
+		if (!direct_walk_iterator_set_pte_no_acc_track(&iter, new_pte))
+			continue;
+
+		young = true;
+
+		if (shadow_accessed_mask)
+			trace_kvm_age_page(iter.pte_gfn_start, iter.level, slot,
+					   young);
+	}
+	direct_walk_iterator_end_traversal(&iter);
+
+	return young;
+}
+
 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
 {
-	return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
+	int young = 0;
+
+	if (kvm->arch.direct_mmu_enabled)
+		young |= kvm_handle_direct_hva_range(kvm, start, end, 0,
+						     age_direct_gfn_range);
+
+	if (!kvm->arch.pure_direct_mmu)
+		young |= kvm_handle_hva_range(kvm, start, end, 0,
+					      kvm_age_rmapp);
+	return young;
+}
+
+static int test_age_direct_gfn_range(struct kvm *kvm,
+				     struct kvm_memory_slot *slot,
+				     gfn_t start, gfn_t end,
+				     unsigned long ignored)
+{
+	struct direct_walk_iterator iter;
+	int young = 0;
+
+	direct_walk_iterator_setup_walk(&iter, kvm, slot->as_id, start, end,
+					MMU_WRITE_LOCK);
+	while (direct_walk_iterator_next_present_leaf_pte(&iter)) {
+		if (is_accessed_direct_pte(iter.old_pte, iter.level)) {
+			young = true;
+			break;
+		}
+	}
+	direct_walk_iterator_end_traversal(&iter);
+
+	return young;
 }
 
 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
 {
-	return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
+	int young = 0;
+
+	/*
+	 * If there's no access bit in the secondary pte set by the
+	 * hardware it's up to gup-fast/gup to set the access bit in
+	 * the primary pte or in the page structure.
+	 */
+	if (!shadow_accessed_mask)
+		return young;
+
+	if (kvm->arch.direct_mmu_enabled)
+		young |= kvm_handle_direct_hva_range(kvm, hva, hva + 1, 0,
+						     test_age_direct_gfn_range);
+
+	if (!kvm->arch.pure_direct_mmu)
+		young |= kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
+
+	return young;
 }
 
 #ifdef MMU_DEBUG
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d494044104270..771e159d6bea9 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -439,7 +439,12 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 	write_lock(&kvm->mmu_lock);
 
 	young = kvm_age_hva(kvm, start, end);
-	if (young)
+
+	/*
+	 * If there was an accessed page in the provided range, or there are
+	 * un-flushed paging structure changes, flush the TLBs.
+	 */
+	if (young || kvm->tlbs_dirty)
 		kvm_flush_remote_tlbs(kvm);
 
 	write_unlock(&kvm->mmu_lock);
-- 
2.23.0.444.g18eeb5a265-goog



