Add a lockless version of for_each_rmap_spte(), which is pretty much the
same as the normal version, except that it doesn't BUG() the host if a
non-present SPTE is encountered.  When mmu_lock is held, it should be
impossible for a different task to zap a SPTE, _and_ zapped SPTEs must be
removed from their rmap chain prior to dropping mmu_lock.  Thus, the
normal walker BUG()s if a non-present SPTE is encountered, as something
is wildly broken.

When walking rmaps without holding mmu_lock, the SPTEs pointed at by the
rmap chain can be zapped/dropped, and so a lockless walk can observe a
non-present SPTE if it runs concurrently with a different operation that
is zapping SPTEs.

Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>
---
 arch/x86/kvm/mmu/mmu.c | 86 +++++++++++++++++++++++++++++++-----------
 1 file changed, 63 insertions(+), 23 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index a683b5fc4026..48e8608c2738 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -932,10 +932,16 @@ static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu
  */
 #define KVM_RMAP_LOCKED	BIT(1)
 
-static unsigned long kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
+static unsigned long __kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
 {
 	unsigned long old_val, new_val;
 
+	/*
+	 * Elide the lock if the rmap is empty, as lockless walkers (read-only
+	 * mode) don't need to (and can't) walk an empty rmap, nor can they add
+	 * entries to the rmap.  I.e. the only paths that process empty rmaps
+	 * do so while holding mmu_lock for write, and are mutually exclusive.
+	 */
 	old_val = READ_ONCE(rmap_head->val);
 	if (!old_val)
 		return 0;
@@ -960,17 +966,53 @@ static unsigned long kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
 		new_val = old_val | KVM_RMAP_LOCKED;
 	} while (!try_cmpxchg(&rmap_head->val, &old_val, new_val));
 
-	/* Return the old value, i.e. _without_ the LOCKED bit set. */
+	/*
+	 * Return the old value, i.e. _without_ the LOCKED bit set.  It's
+	 * impossible for the return value to be 0 (see above), i.e. the read-
+	 * only unlock flow can't get a false positive and fail to unlock.
+	 */
 	return old_val;
 }
 
+static unsigned long kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
+{
+	/*
+	 * TODO: Plumb in @kvm and add a lockdep assertion that mmu_lock is
+	 *	 held for write.
+	 */
+	return __kvm_rmap_lock(rmap_head);
+}
+
 static void kvm_rmap_unlock(struct kvm_rmap_head *rmap_head,
 			    unsigned long new_val)
 {
-	WARN_ON_ONCE(new_val & KVM_RMAP_LOCKED);
+	KVM_MMU_WARN_ON(new_val & KVM_RMAP_LOCKED);
 	WRITE_ONCE(rmap_head->val, new_val);
 }
 
+/*
+ * If mmu_lock isn't held, rmaps can only be locked in read-only mode.  The
+ * actual locking is the same, but the caller is disallowed from modifying the
+ * rmap, and so the unlock flow is a nop if the rmap is/was empty.
+ */
+__maybe_unused
+static unsigned long kvm_rmap_lock_readonly(struct kvm_rmap_head *rmap_head)
+{
+	return __kvm_rmap_lock(rmap_head);
+}
+
+__maybe_unused
+static void kvm_rmap_unlock_readonly(struct kvm_rmap_head *rmap_head,
+				     unsigned long old_val)
+{
+	if (!old_val)
+		return;
+
+	KVM_MMU_WARN_ON(old_val != (rmap_head->val & ~KVM_RMAP_LOCKED));
+	WRITE_ONCE(rmap_head->val, old_val);
+}
+
+
 static unsigned long kvm_rmap_get(struct kvm_rmap_head *rmap_head)
 {
 	return READ_ONCE(rmap_head->val) & ~KVM_RMAP_LOCKED;
@@ -1202,23 +1244,18 @@ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
 			   struct rmap_iterator *iter)
 {
 	unsigned long rmap_val = kvm_rmap_get(rmap_head);
-	u64 *sptep;
 
 	if (!rmap_val)
 		return NULL;
 
 	if (!(rmap_val & KVM_RMAP_MANY)) {
 		iter->desc = NULL;
-		sptep = (u64 *)rmap_val;
-		goto out;
+		return (u64 *)rmap_val;
 	}
 
 	iter->desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
 	iter->pos = 0;
-	sptep = iter->desc->sptes[iter->pos];
-out:
-	BUG_ON(!is_shadow_present_pte(*sptep));
-	return sptep;
+	return iter->desc->sptes[iter->pos];
 }
 
 /*
@@ -1228,14 +1265,11 @@ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
  */
 static u64 *rmap_get_next(struct rmap_iterator *iter)
 {
-	u64 *sptep;
-
 	if (iter->desc) {
 		if (iter->pos < PTE_LIST_EXT - 1) {
 			++iter->pos;
-			sptep = iter->desc->sptes[iter->pos];
-			if (sptep)
-				goto out;
+			if (iter->desc->sptes[iter->pos])
+				return iter->desc->sptes[iter->pos];
 		}
 
 		iter->desc = iter->desc->more;
@@ -1243,20 +1277,26 @@ static u64 *rmap_get_next(struct rmap_iterator *iter)
 		if (iter->desc) {
 			iter->pos = 0;
 			/* desc->sptes[0] cannot be NULL */
-			sptep = iter->desc->sptes[iter->pos];
-			goto out;
+			return iter->desc->sptes[iter->pos];
 		}
 	}
 
 	return NULL;
-out:
-	BUG_ON(!is_shadow_present_pte(*sptep));
-	return sptep;
 }
 
-#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)		\
-	for (_spte_ = rmap_get_first(_rmap_head_, _iter_);		\
-	     _spte_; _spte_ = rmap_get_next(_iter_))
+#define __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_)		\
+	for (_sptep_ = rmap_get_first(_rmap_head_, _iter_);		\
+	     _sptep_; _sptep_ = rmap_get_next(_iter_))
+
+#define for_each_rmap_spte(_rmap_head_, _iter_, _sptep_)		\
+	__for_each_rmap_spte(_rmap_head_, _iter_, _sptep_)		\
+		if (!is_shadow_present_pte(*(_sptep_)))			\
+			BUG();						\
+		else
+
+#define for_each_rmap_spte_lockless(_rmap_head_, _iter_, _sptep_, _spte_)	\
+	__for_each_rmap_spte(_rmap_head_, _iter_, _sptep_)			\
+		if (is_shadow_present_pte(_spte_ = mmu_spte_get_lockless(_sptep_)))
 
 static void drop_spte(struct kvm *kvm, u64 *sptep)
 {
-- 
2.46.0.76.ge559c4bf1a-goog
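
As a point of reference for how the new walker is intended to be consumed,
below is a minimal, illustrative sketch (not part of this patch) of a
read-only rmap walk that tolerates concurrently zapped SPTEs.  The function
name kvm_rmap_test_accessed_lockless() and its exact shape are hypothetical;
the sketch assumes it lives in arch/x86/kvm/mmu/mmu.c so that the readonly
lock helpers, for_each_rmap_spte_lockless(), mmu_spte_get_lockless(), and
is_accessed_spte() are all in scope.

/* Hypothetical usage sketch, not part of this patch. */
static bool kvm_rmap_test_accessed_lockless(struct kvm_rmap_head *rmap_head)
{
	struct rmap_iterator iter;
	unsigned long rmap_val;
	bool accessed = false;
	u64 *sptep;
	u64 spte;

	/*
	 * Lock the rmap in read-only mode and snapshot its unlocked value;
	 * a return of 0 means the rmap is empty and was not actually locked.
	 */
	rmap_val = kvm_rmap_lock_readonly(rmap_head);

	/*
	 * Unlike for_each_rmap_spte(), non-present SPTEs are skipped instead
	 * of BUG()ing, as the SPTEs pointed at by the rmap chain can be
	 * zapped/dropped by a concurrent operation.
	 */
	for_each_rmap_spte_lockless(rmap_head, &iter, sptep, spte) {
		if (is_accessed_spte(spte)) {
			accessed = true;
			break;
		}
	}

	kvm_rmap_unlock_readonly(rmap_head, rmap_val);
	return accessed;
}

Note that break/continue behave as expected inside the walker, as the macro
expands to a bare for loop whose body is guarded by an if statement.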