From: David Woodhouse <dwmw@xxxxxxxxxxxx>

The existing retry loop in hva_to_pfn_retry() is extremely pessimistic.
If there are any concurrent invalidations running, it's effectively just
a complex busy-wait loop because its local mmu_notifier_retry_cache()
function will always return true.

Since multiple invalidations can be running in parallel, this can result
in a situation where hva_to_pfn_retry() just backs off and keeps
retrying forever, not making any progress.

Solve this by being a bit more selective about when to retry. In
addition to the ->needs_invalidation flag added in a previous commit,
check before calling hva_to_pfn() if there is any pending invalidation
(i.e. between invalidate_range_start() and invalidate_range_end()) which
affects the uHVA of the GPC being updated. This catches the case where
hva_to_pfn() would return a soon-to-be-stale mapping of a range for
which the invalidate_range_start() hook had already been called before
the uHVA was even set in the GPC and the ->needs_invalidation flag set.

An invalidation which *starts* after the lock is dropped in the loop is
fine because it will clear the ->needs_invalidation flag and also
trigger a retry.

This is slightly more complex than it needs to be; moving the
invalidation to occur in the invalidate_range_end() hook will make life
simpler, in a subsequent commit.

Signed-off-by: David Woodhouse <dwmw@xxxxxxxxxxxx>
---
 include/linux/kvm_host.h |  2 ++
 virt/kvm/kvm_main.c      | 20 ++++++++++++++
 virt/kvm/pfncache.c      | 60 +++++++++++++++++++++-------------------
 3 files changed, 54 insertions(+), 28 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 79a6b1a63027..1bfe2e8d52cd 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -770,6 +770,8 @@ struct kvm {
 	/* For management / invalidation of gfn_to_pfn_caches */
 	spinlock_t gpc_lock;
 	struct list_head gpc_list;
+	u64 mmu_gpc_invalidate_range_start;
+	u64 mmu_gpc_invalidate_range_end;
 
 	/*
 	 * created_vcpus is protected by kvm->lock, and is incremented
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 92901656a0d4..84eb1ebb6f47 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -775,6 +775,24 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	 */
 	spin_lock(&kvm->mn_invalidate_lock);
 	kvm->mn_active_invalidate_count++;
+	if (likely(kvm->mn_active_invalidate_count == 1)) {
+		kvm->mmu_gpc_invalidate_range_start = range->start;
+		kvm->mmu_gpc_invalidate_range_end = range->end;
+	} else {
+		/*
+		 * Fully tracking multiple concurrent ranges has diminishing
+		 * returns. Keep things simple and just find the minimal range
+		 * which includes the current and new ranges. As there won't be
+		 * enough information to subtract a range after its invalidate
+		 * completes, any ranges invalidated concurrently will
+		 * accumulate and persist until all outstanding invalidates
+		 * complete.
+		 */
+		kvm->mmu_gpc_invalidate_range_start =
+			min(kvm->mmu_gpc_invalidate_range_start, range->start);
+		kvm->mmu_gpc_invalidate_range_end =
+			max(kvm->mmu_gpc_invalidate_range_end, range->end);
+	}
 	spin_unlock(&kvm->mn_invalidate_lock);
 
 	/*
@@ -1164,6 +1182,8 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 
 	INIT_LIST_HEAD(&kvm->gpc_list);
 	spin_lock_init(&kvm->gpc_lock);
+	kvm->mmu_gpc_invalidate_range_start = KVM_HVA_ERR_BAD;
+	kvm->mmu_gpc_invalidate_range_end = KVM_HVA_ERR_BAD;
 
 	INIT_LIST_HEAD(&kvm->devices);
 	kvm->max_vcpus = KVM_MAX_VCPUS;
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 7007d32d197a..eeb9bf43c04a 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -129,32 +129,17 @@ static void gpc_unmap(kvm_pfn_t pfn, void *khva)
 #endif
 }
 
-static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_seq)
+static bool gpc_invalidations_pending(struct gfn_to_pfn_cache *gpc)
 {
 	/*
-	 * mn_active_invalidate_count acts for all intents and purposes
-	 * like mmu_invalidate_in_progress here; but the latter cannot
-	 * be used here because the invalidation of caches in the
-	 * mmu_notifier event occurs _before_ mmu_invalidate_in_progress
-	 * is elevated.
-	 *
-	 * Note, it does not matter that mn_active_invalidate_count
-	 * is not protected by gpc->lock.  It is guaranteed to
-	 * be elevated before the mmu_notifier acquires gpc->lock, and
-	 * isn't dropped until after mmu_invalidate_seq is updated.
+	 * No need for locking on GPC here because these fields are protected
+	 * by gpc->refresh_lock.
 	 */
-	if (kvm->mn_active_invalidate_count)
-		return true;
+	guard(spinlock)(&gpc->kvm->mn_invalidate_lock);
 
-	/*
-	 * Ensure mn_active_invalidate_count is read before
-	 * mmu_invalidate_seq.  This pairs with the smp_wmb() in
-	 * mmu_notifier_invalidate_range_end() to guarantee either the
-	 * old (non-zero) value of mn_active_invalidate_count or the
-	 * new (incremented) value of mmu_invalidate_seq is observed.
-	 */
-	smp_rmb();
-	return kvm->mmu_invalidate_seq != mmu_seq;
+	return unlikely(gpc->kvm->mn_active_invalidate_count) &&
+	       (gpc->kvm->mmu_gpc_invalidate_range_start <= gpc->uhva) &&
+	       (gpc->kvm->mmu_gpc_invalidate_range_end > gpc->uhva);
 }
 
 static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
@@ -163,7 +148,6 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 	void *old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva);
 	kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT;
 	void *new_khva = NULL;
-	unsigned long mmu_seq;
 
 	lockdep_assert_held(&gpc->refresh_lock);
 
@@ -177,9 +161,6 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 	gpc->valid = false;
 
 	do {
-		mmu_seq = gpc->kvm->mmu_invalidate_seq;
-		smp_rmb();
-
 		/*
 		 * The translation made by hva_to_pfn() below could be made
 		 * invalid as soon as it's mapped. But the uhva is already
@@ -193,6 +174,29 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 
 		write_unlock_irq(&gpc->lock);
 
+		/*
+		 * Invalidation occurs from the invalidate_range_start() hook,
+		 * which could already have happened before __kvm_gpc_refresh()
+		 * (or the previous turn around this loop) took gpc->lock.
+		 * If so, and if the corresponding invalidate_range_end() hook
+		 * hasn't happened yet, hva_to_pfn() could return a mapping
+		 * which is about to be stale and which should not be used. So
+		 * check if there are any currently-running invalidations which
+		 * affect the uHVA of this GPC, and retry if there are. Any
Any + * invalidation which starts after gpc->needs_invalidation is + * set is fine, because it will clear that flag and trigger a + * retry. And any invalidation which *completes* by having its + * invalidate_range_end() hook called immediately prior to this + * check is also fine, because the page tables are guaranteed + * to have been changted already, so hva_to_pfn() won't return + * a stale mapping in that case anyway. + */ + while (gpc_invalidations_pending(gpc)) { + cond_resched(); + write_lock_irq(&gpc->lock); + continue; + } + /* * If the previous iteration "failed" due to an mmu_notifier * event, release the pfn and unmap the kernel virtual address @@ -233,6 +237,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) goto out_error; } + write_lock_irq(&gpc->lock); /* @@ -240,8 +245,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) * attempting to refresh. */ WARN_ON_ONCE(gpc->valid); - } while (!gpc->needs_invalidation || - mmu_notifier_retry_cache(gpc->kvm, mmu_seq)); + } while (!gpc->needs_invalidation); gpc->valid = true; gpc->pfn = new_pfn; -- 2.44.0