On Mon, Jan 30, 2023, David Matlack wrote:
> On Wed, Jan 25, 2023 at 5:49 PM Sean Christopherson <seanjc@xxxxxxxxxx> wrote:
> [...]
> > ---
> >  arch/x86/kvm/mmu/tdp_mmu.c | 92 ++++++++++----------------------------
> >  1 file changed, 24 insertions(+), 68 deletions(-)
> >
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index bba33aea0fb0..2f78ca43a276 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> [...]
> > @@ -1289,8 +1244,7 @@ static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
> >  		new_spte = mark_spte_for_access_track(new_spte);
> >  	}
> >
> > -	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
> > -
> > +	kvm_tdp_mmu_write_spte(iter->sptep, iter->old_spte, new_spte, iter->level);
>
> This can race with fast_page_fault() setting the W-bit and the CPU
> setting the D-bit. i.e. This call to kvm_tdp_mmu_write_spte() could
> clear the W-bit or D-bit.

Ugh, right.

Hrm. Duh, I just didn't go far enough. A straight XCHG is silly. Except for
the access-tracking mess, KVM wants to clear a single bit. Writing the entire
thing and potentially clobbering bits is wasteful and unnecessarily dangerous.
And the access-tracking code already needs special handling.

We can just simplify this all by adding a helper to clear a single bit (and
maybe even use clear_bit() and __clear_bit() if we save the bit number for
the W/A/D bits and not just the mask).

And if it weren't for EPT (different A/D locations), we could even use static
asserts to restrict the usage to the W/A/D bits :-/ Oh well.

E.g. this

static inline void kvm_tdp_mmu_clear_spte_bit(struct tdp_iter *iter, u64 mask)
{
	if (kvm_tdp_mmu_spte_has_volatile_bits(iter->old_spte, iter->level))
		atomic64_and(~mask, (atomic64_t *)rcu_dereference(iter->sptep));
	else
		__kvm_tdp_mmu_write_spte(iter->sptep, iter->old_spte & ~mask);
}

yields another untested patch:

---
 arch/x86/kvm/mmu/tdp_iter.h | 24 +++++++++++++++++++-----
 arch/x86/kvm/mmu/tdp_mmu.c  | 33 +++++++++++++--------------------
 2 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index f0af385c56e0..09c8f2640ccc 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -29,11 +29,10 @@ static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
 	WRITE_ONCE(*rcu_dereference(sptep), new_spte);
 }
 
-static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
-					 u64 new_spte, int level)
+static inline bool kvm_tdp_mmu_spte_has_volatile_bits(u64 old_spte, int level)
 {
 	/*
-	 * Atomically write the SPTE if it is a shadow-present, leaf SPTE with
+	 * Atomically write SPTEs if it is a shadow-present, leaf SPTE with
 	 * volatile bits, i.e. has bits that can be set outside of mmu_lock.
 	 * The Writable bit can be set by KVM's fast page fault handler, and
 	 * Accessed and Dirty bits can be set by the CPU.
@@ -44,8 +43,15 @@ static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
 	 * logic needs to be reassessed if KVM were to use non-leaf Accessed
 	 * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
	 */
-	if (is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) &&
-	    spte_has_volatile_bits(old_spte))
+	return is_shadow_present_pte(old_spte) &&
+	       is_last_spte(old_spte, level) &&
+	       spte_has_volatile_bits(old_spte);
+}
+
+static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
+					 u64 new_spte, int level)
+{
+	if (kvm_tdp_mmu_spte_has_volatile_bits(old_spte, level))
 		return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);
 
 	__kvm_tdp_mmu_write_spte(sptep, new_spte);
@@ -115,4 +121,12 @@ void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
 void tdp_iter_next(struct tdp_iter *iter);
 void tdp_iter_restart(struct tdp_iter *iter);
 
+static inline void kvm_tdp_mmu_clear_spte_bit(struct tdp_iter *iter, u64 mask)
+{
+	if (kvm_tdp_mmu_spte_has_volatile_bits(iter->old_spte, iter->level))
+		atomic64_and(~mask, (atomic64_t *)rcu_dereference(iter->sptep));
+	else
+		__kvm_tdp_mmu_write_spte(iter->sptep, iter->old_spte & ~mask);
+}
+
 #endif /* __KVM_X86_MMU_TDP_ITER_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 2f78ca43a276..32a7209a522d 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1223,28 +1223,26 @@ static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
 			  struct kvm_gfn_range *range)
 {
-	u64 new_spte = 0;
+	u64 new_spte;
 
 	/* If we have a non-accessed entry we don't need to change the pte. */
 	if (!is_accessed_spte(iter->old_spte))
 		return false;
 
-	new_spte = iter->old_spte;
-
-	if (spte_ad_enabled(new_spte)) {
-		new_spte &= ~shadow_accessed_mask;
+	if (spte_ad_enabled(iter->old_spte)) {
+		kvm_tdp_mmu_clear_spte_bit(iter, shadow_accessed_mask);
 	} else {
+		new_spte = mark_spte_for_access_track(iter->old_spte);
+		iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep, iter->old_spte,
+							new_spte, iter->level);
 		/*
 		 * Capture the dirty status of the page, so that it doesn't get
 		 * lost when the SPTE is marked for access tracking.
 		 */
-		if (is_writable_pte(new_spte))
-			kvm_set_pfn_dirty(spte_to_pfn(new_spte));
-
-		new_spte = mark_spte_for_access_track(new_spte);
+		if (is_writable_pte(iter->old_spte))
+			kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte));
 	}
 
-	kvm_tdp_mmu_write_spte(iter->sptep, iter->old_spte, new_spte, iter->level);
 	return true;
 }
 
@@ -1632,7 +1630,6 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
 				  gfn_t gfn, unsigned long mask, bool wrprot)
 {
 	struct tdp_iter iter;
-	u64 new_spte;
 
 	rcu_read_lock();
 
@@ -1648,20 +1645,16 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
 		mask &= ~(1UL << (iter.gfn - gfn));
 
 		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
-			if (is_writable_pte(iter.old_spte))
-				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
-			else
+			if (!is_writable_pte(iter.old_spte))
 				continue;
+
+			kvm_tdp_mmu_clear_spte_bit(&iter, PT_WRITABLE_MASK);
 		} else {
-			if (iter.old_spte & shadow_dirty_mask)
-				new_spte = iter.old_spte & ~shadow_dirty_mask;
-			else
+			if (!(iter.old_spte & shadow_dirty_mask))
 				continue;
 
-			kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte));
+			kvm_tdp_mmu_clear_spte_bit(&iter, shadow_dirty_mask);
 		}
-
-		kvm_tdp_mmu_write_spte(iter.sptep, iter.old_spte, new_spte, iter.level);
 	}
 
 	rcu_read_unlock();

base-commit: 7bb67c88a2d77cc95524912f3b1ca51daf5c0224
--
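
[Editorial note, not part of the thread or the patch above: for readers following
along, here is a minimal standalone userspace sketch of the lost-update race David
points out, i.e. why rewriting the whole SPTE with a plain store can clobber a
Writable/Dirty bit set concurrently, while an atomic AND of a single mask cannot.
The FAKE_SPTE_* names and bit positions are invented for the illustration and do
not match KVM's real SPTE layout.]

/* Illustrative only; build with: cc -std=c11 race.c */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define FAKE_SPTE_WRITABLE (1ull << 1)  /* set concurrently, must survive   */
#define FAKE_SPTE_ACCESSED (1ull << 5)  /* bit the aging path wants cleared */

static _Atomic uint64_t fake_spte = FAKE_SPTE_ACCESSED;

int main(void)
{
	/* Racy pattern: read, modify, then store the whole value back. */
	uint64_t old = atomic_load(&fake_spte);

	/* A concurrent "fast page fault" makes the entry writable... */
	atomic_fetch_or(&fake_spte, FAKE_SPTE_WRITABLE);

	/* ...and the full-width store below silently overwrites it. */
	atomic_store(&fake_spte, old & ~FAKE_SPTE_ACCESSED);
	printf("after racy store: %#llx (Writable bit lost)\n",
	       (unsigned long long)atomic_load(&fake_spte));

	/* Safe pattern: clear only the target bit with one atomic AND. */
	atomic_fetch_or(&fake_spte, FAKE_SPTE_WRITABLE);
	atomic_fetch_and(&fake_spte, ~FAKE_SPTE_ACCESSED);
	printf("after atomic AND: %#llx (Writable bit kept)\n",
	       (unsigned long long)atomic_load(&fake_spte));
	return 0;
}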