On Mon, Jan 30, 2023, David Matlack wrote:
> On Wed, Jan 25, 2023 at 5:49 PM Sean Christopherson <seanjc@xxxxxxxxxx> wrote:
> [...]
> > ---
> >  arch/x86/kvm/mmu/tdp_mmu.c | 92 ++++++++++----------------------------
> >  1 file changed, 24 insertions(+), 68 deletions(-)
> >
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index bba33aea0fb0..2f78ca43a276 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> [...]
> > @@ -1289,8 +1244,7 @@ static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
> >  		new_spte = mark_spte_for_access_track(new_spte);
> >  	}
> >
> > -	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
> > -
> > +	kvm_tdp_mmu_write_spte(iter->sptep, iter->old_spte, new_spte, iter->level);
>
> This can race with fast_page_fault() setting the W-bit and the CPU
> setting the D-bit. i.e. This call to kvm_tdp_mmu_write_spte() could
> clear the W-bit or D-bit.

Ugh, right.

Hrm. Duh, I just didn't go far enough. A straight XCHG is silly. Except for
the access-tracking mess, KVM wants to clear a single bit. Writing the entire
thing and potentially clobbering bits is wasteful and unnecessarily dangerous.
And the access-tracking code already needs special handling.

We can just simplify this all by adding a helper to clear a single bit (and
maybe even use clear_bit() and __clear_bit() if we save the bit number for
the W/A/D bits and not just the mask).

And if it weren't for EPT (different A/D locations), we could even use static
asserts to restrict the usage to the W/A/D bits :-/ Oh well.

E.g. this

static inline void kvm_tdp_mmu_clear_spte_bit(struct tdp_iter *iter, u64 mask)
{
	if (kvm_tdp_mmu_spte_has_volatile_bits(iter->old_spte, iter->level))
		atomic64_and(~mask, (atomic64_t *)rcu_dereference(iter->sptep));
	else
		__kvm_tdp_mmu_write_spte(iter->sptep, iter->old_spte & ~mask);
}

yields another untested patch:

---
 arch/x86/kvm/mmu/tdp_iter.h | 24 +++++++++++++++++++-----
 arch/x86/kvm/mmu/tdp_mmu.c  | 33 +++++++++++++--------------------
 2 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index f0af385c56e0..09c8f2640ccc 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -29,11 +29,10 @@ static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
 	WRITE_ONCE(*rcu_dereference(sptep), new_spte);
 }
 
-static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
-					 u64 new_spte, int level)
+static inline bool kvm_tdp_mmu_spte_has_volatile_bits(u64 old_spte, int level)
 {
 	/*
-	 * Atomically write the SPTE if it is a shadow-present, leaf SPTE with
+	 * Atomically write SPTEs if it is a shadow-present, leaf SPTE with
 	 * volatile bits, i.e. has bits that can be set outside of mmu_lock.
 	 * The Writable bit can be set by KVM's fast page fault handler, and
 	 * Accessed and Dirty bits can be set by the CPU.
@@ -44,8 +43,15 @@ static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
 	 * logic needs to be reassessed if KVM were to use non-leaf Accessed
 	 * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
	 */
-	if (is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) &&
-	    spte_has_volatile_bits(old_spte))
+	return is_shadow_present_pte(old_spte) &&
+	       is_last_spte(old_spte, level) &&
+	       spte_has_volatile_bits(old_spte);
+}
+
+static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
+					 u64 new_spte, int level)
+{
+	if (kvm_tdp_mmu_spte_has_volatile_bits(old_spte, level))
 		return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);
 
 	__kvm_tdp_mmu_write_spte(sptep, new_spte);
@@ -115,4 +121,12 @@ void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
 void tdp_iter_next(struct tdp_iter *iter);
 void tdp_iter_restart(struct tdp_iter *iter);
 
+static inline void kvm_tdp_mmu_clear_spte_bit(struct tdp_iter *iter, u64 mask)
+{
+	if (kvm_tdp_mmu_spte_has_volatile_bits(iter->old_spte, iter->level))
+		atomic64_and(~mask, (atomic64_t *)rcu_dereference(iter->sptep));
+	else
+		__kvm_tdp_mmu_write_spte(iter->sptep, iter->old_spte & ~mask);
+}
+
 #endif /* __KVM_X86_MMU_TDP_ITER_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 2f78ca43a276..32a7209a522d 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1223,28 +1223,26 @@ static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
 			  struct kvm_gfn_range *range)
 {
-	u64 new_spte = 0;
+	u64 new_spte;
 
 	/* If we have a non-accessed entry we don't need to change the pte. */
 	if (!is_accessed_spte(iter->old_spte))
 		return false;
 
-	new_spte = iter->old_spte;
-
-	if (spte_ad_enabled(new_spte)) {
-		new_spte &= ~shadow_accessed_mask;
+	if (spte_ad_enabled(iter->old_spte)) {
+		kvm_tdp_mmu_clear_spte_bit(iter, shadow_accessed_mask);
 	} else {
+		new_spte = mark_spte_for_access_track(iter->old_spte);
+		iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep, iter->old_spte,
+							new_spte, iter->level);
 		/*
 		 * Capture the dirty status of the page, so that it doesn't get
 		 * lost when the SPTE is marked for access tracking.
 		 */
-		if (is_writable_pte(new_spte))
-			kvm_set_pfn_dirty(spte_to_pfn(new_spte));
-
-		new_spte = mark_spte_for_access_track(new_spte);
+		if (is_writable_pte(iter->old_spte))
+			kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte));
 	}
 
-	kvm_tdp_mmu_write_spte(iter->sptep, iter->old_spte, new_spte, iter->level);
 	return true;
 }
 
@@ -1632,7 +1630,6 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
 				  gfn_t gfn, unsigned long mask, bool wrprot)
 {
 	struct tdp_iter iter;
-	u64 new_spte;
 
 	rcu_read_lock();
 
@@ -1648,20 +1645,16 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
 		mask &= ~(1UL << (iter.gfn - gfn));
 
 		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
-			if (is_writable_pte(iter.old_spte))
-				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
-			else
+			if (!is_writable_pte(iter.old_spte))
 				continue;
+
+			kvm_tdp_mmu_clear_spte_bit(&iter, PT_WRITABLE_MASK);
 		} else {
-			if (iter.old_spte & shadow_dirty_mask)
-				new_spte = iter.old_spte & ~shadow_dirty_mask;
-			else
+			if (!(iter.old_spte & shadow_dirty_mask))
 				continue;
 
-			kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte));
+			kvm_tdp_mmu_clear_spte_bit(&iter, shadow_dirty_mask);
 		}
-
-		kvm_tdp_mmu_write_spte(iter.sptep, iter.old_spte, new_spte, iter.level);
 	}
 
 	rcu_read_unlock();

base-commit: 7bb67c88a2d77cc95524912f3b1ca51daf5c0224
--
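
[Editorial note, not part of the thread or the patch above: for readers following
along, here is a minimal standalone userspace sketch of the lost-update race David
points out, i.e. why rewriting the whole SPTE with a plain store can clobber a
Writable/Dirty bit set concurrently, while an atomic AND of a single mask cannot.
The FAKE_SPTE_* names and bit positions are invented for the illustration and do
not match KVM's real SPTE layout.]

/* Illustrative only; build with: cc -std=c11 race.c */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define FAKE_SPTE_WRITABLE (1ull << 1)  /* set concurrently, must survive   */
#define FAKE_SPTE_ACCESSED (1ull << 5)  /* bit the aging path wants cleared */

static _Atomic uint64_t fake_spte = FAKE_SPTE_ACCESSED;

int main(void)
{
	/* Racy pattern: read, modify, then store the whole value back. */
	uint64_t old = atomic_load(&fake_spte);

	/* A concurrent "fast page fault" makes the entry writable... */
	atomic_fetch_or(&fake_spte, FAKE_SPTE_WRITABLE);

	/* ...and the full-width store below silently overwrites it. */
	atomic_store(&fake_spte, old & ~FAKE_SPTE_ACCESSED);
	printf("after racy store: %#llx (Writable bit lost)\n",
	       (unsigned long long)atomic_load(&fake_spte));

	/* Safe pattern: clear only the target bit with one atomic AND. */
	atomic_fetch_or(&fake_spte, FAKE_SPTE_WRITABLE);
	atomic_fetch_and(&fake_spte, ~FAKE_SPTE_ACCESSED);
	printf("after atomic AND: %#llx (Writable bit kept)\n",
	       (unsigned long long)atomic_load(&fake_spte));
	return 0;
}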