On Fri, Aug 9, 2024 at 12:44 PM Sean Christopherson <seanjc@xxxxxxxxxx> wrote: > > Steal another bit from rmap entries (which are word aligned pointers, i.e. > have 2 free bits on 32-bit KVM, and 3 free bits on 64-bit KVM), and use > the bit to implement a *very* rudimentary per-rmap spinlock. The only > anticipated usage of the lock outside of mmu_lock is for aging gfns, and > collisions between aging and other MMU rmap operations are quite rare, > e.g. unless userspace is being silly and aging a tiny range over and over > in a tight loop, time between contention when aging an actively running VM > is O(seconds). In short, a more sophisticated locking scheme shouldn't be > necessary. > > Note, the lock only protects the rmap structure itself, SPTEs that are > pointed at by a locked rmap can still be modified and zapped by another > task (KVM drops/zaps SPTEs before deleting the rmap entries) > > Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx> > --- > arch/x86/kvm/mmu/mmu.c | 80 +++++++++++++++++++++++++++++++++++++----- > 1 file changed, 71 insertions(+), 9 deletions(-) > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > index 8ca7f51c2da3..a683b5fc4026 100644 > --- a/arch/x86/kvm/mmu/mmu.c > +++ b/arch/x86/kvm/mmu/mmu.c > @@ -909,11 +909,73 @@ static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu > * About rmap_head encoding: > * > * If the bit zero of rmap_head->val is clear, then it points to the only spte > - * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct > + * in this rmap chain. Otherwise, (rmap_head->val & ~3) points to a struct > * pte_list_desc containing more mappings. > */ > #define KVM_RMAP_MANY BIT(0) > > +/* > + * rmaps and PTE lists are mostly protected by mmu_lock (the shadow MMU always > + * operates with mmu_lock held for write), but rmaps can be walked without > + * holding mmu_lock so long as the caller can tolerate SPTEs in the rmap chain > + * being zapped/dropped _while the rmap is locked_. > + * > + * Other than the KVM_RMAP_LOCKED flag, modifications to rmap entries must be > + * done while holding mmu_lock for write. This allows a task walking rmaps > + * without holding mmu_lock to concurrently walk the same entries as a task > + * that is holding mmu_lock but _not_ the rmap lock. Neither task will modify > + * the rmaps, thus the walks are stable. > + * > + * As alluded to above, SPTEs in rmaps are _not_ protected by KVM_RMAP_LOCKED, > + * only the rmap chains themselves are protected. E.g. holding an rmap's lock > + * ensures all "struct pte_list_desc" fields are stable. > + */ > +#define KVM_RMAP_LOCKED BIT(1) > + > +static unsigned long kvm_rmap_lock(struct kvm_rmap_head *rmap_head) > +{ > + unsigned long old_val, new_val; > + > + old_val = READ_ONCE(rmap_head->val); > + if (!old_val) > + return 0; I'm having trouble understanding how this bit works. What exactly is stopping the rmap from being populated while we have it "locked"? We aren't holding the MMU lock at all in the lockless case, and given this bit, it is impossible (I think?) for the MMU-write-lock-holding, rmap-modifying side to tell that this rmap is locked. Concretely, my immediate concern is that we will still unconditionally write 0 back at unlock time even if the value has changed. I expect that this works and I'm just not getting it... :) > + > + do { > + /* > + * If the rmap is locked, wait for it to be unlocked before > + * trying acquire the lock, e.g. to bounce the cache line. > + */ > + while (old_val & KVM_RMAP_LOCKED) { > + old_val = READ_ONCE(rmap_head->val); > + cpu_relax(); > + } > + > + /* > + * Recheck for an empty rmap, it may have been purged by the > + * task that held the lock. > + */ > + if (!old_val) > + return 0; > + > + new_val = old_val | KVM_RMAP_LOCKED; > + } while (!try_cmpxchg(&rmap_head->val, &old_val, new_val)); > + > + /* Return the old value, i.e. _without_ the LOCKED bit set. */ > + return old_val; > +} > + > +static void kvm_rmap_unlock(struct kvm_rmap_head *rmap_head, > + unsigned long new_val) > +{ > + WARN_ON_ONCE(new_val & KVM_RMAP_LOCKED); > + WRITE_ONCE(rmap_head->val, new_val); > +} > + > +static unsigned long kvm_rmap_get(struct kvm_rmap_head *rmap_head) > +{ > + return READ_ONCE(rmap_head->val) & ~KVM_RMAP_LOCKED; > +} > + > /* > * Returns the number of pointers in the rmap chain, not counting the new one. > */ > @@ -924,7 +986,7 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, > struct pte_list_desc *desc; > int count = 0; > > - old_val = rmap_head->val; > + old_val = kvm_rmap_lock(rmap_head); > > if (!old_val) { > new_val = (unsigned long)spte; > @@ -956,7 +1018,7 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, > desc->sptes[desc->spte_count++] = spte; > } > > - rmap_head->val = new_val; > + kvm_rmap_unlock(rmap_head, new_val); > > return count; > } > @@ -1004,7 +1066,7 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte, > unsigned long rmap_val; > int i; > > - rmap_val = rmap_head->val; > + rmap_val = kvm_rmap_lock(rmap_head); > if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_val, kvm)) > goto out; > > @@ -1030,7 +1092,7 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte, > } > > out: > - rmap_head->val = rmap_val; > + kvm_rmap_unlock(rmap_head, rmap_val); > } > > static void kvm_zap_one_rmap_spte(struct kvm *kvm, > @@ -1048,7 +1110,7 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm, > unsigned long rmap_val; > int i; > > - rmap_val = rmap_head->val; > + rmap_val = kvm_rmap_lock(rmap_head); > if (!rmap_val) > return false; > > @@ -1067,13 +1129,13 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm, > } > out: > /* rmap_head is meaningless now, remember to reset it */ > - rmap_head->val = 0; > + kvm_rmap_unlock(rmap_head, 0); > return true; > } > > unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) > { > - unsigned long rmap_val = rmap_head->val; > + unsigned long rmap_val = kvm_rmap_get(rmap_head); > struct pte_list_desc *desc; > > if (!rmap_val) > @@ -1139,7 +1201,7 @@ struct rmap_iterator { > static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head, > struct rmap_iterator *iter) > { > - unsigned long rmap_val = rmap_head->val; > + unsigned long rmap_val = kvm_rmap_get(rmap_head); > u64 *sptep; > > if (!rmap_val) > -- > 2.46.0.76.ge559c4bf1a-goog >