On Tue, Nov 05, 2024, James Houghton wrote:
> +static unsigned long kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
> +{
> +	unsigned long old_val, new_val;
> +
> +	/*
> +	 * Elide the lock if the rmap is empty, as lockless walkers (read-only
> +	 * mode) don't need to (and can't) walk an empty rmap, nor can they add
> +	 * entries to the rmap.  I.e. the only paths that process empty rmaps
> +	 * do so while holding mmu_lock for write, and are mutually exclusive.
> +	 */
> +	old_val = atomic_long_read(&rmap_head->val);
> +	if (!old_val)
> +		return 0;
> +
> +	do {
> +		/*
> +		 * If the rmap is locked, wait for it to be unlocked before
> +		 * trying acquire the lock, e.g. to bounce the cache line.
> +		 */
> +		while (old_val & KVM_RMAP_LOCKED) {
> +			old_val = atomic_long_read(&rmap_head->val);
> +			cpu_relax();
> +		}

As Lai Jiangshan pointed out[1][2], this should PAUSE first, then re-read
the rmap value, and KVM needs to disable preemption while holding the lock,
because this is nothing more than a rudimentary spinlock.

[1] https://lore.kernel.org/all/ZrooozABEWSnwzxh@xxxxxxxxxx
[2] https://lore.kernel.org/all/Zrt5eNArfQA7x1qj@xxxxxxxxxx

I think this?

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 1a0950b77126..9dac1bbb77d4 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -873,6 +873,8 @@ static unsigned long __kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
 {
 	unsigned long old_val, new_val;
 
+	lockdep_assert_preemption_disabled();
+
 	/*
 	 * Elide the lock if the rmap is empty, as lockless walkers (read-only
 	 * mode) don't need to (and can't) walk an empty rmap, nor can they add
@@ -889,8 +891,8 @@ static unsigned long __kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
 		 * trying acquire the lock, e.g. to bounce the cache line.
 		 */
 		while (old_val & KVM_RMAP_LOCKED) {
-			old_val = atomic_long_read(&rmap_head->val);
 			cpu_relax();
+			old_val = atomic_long_read(&rmap_head->val);
 		}
 
 		/*
@@ -931,6 +933,9 @@ static unsigned long kvm_rmap_lock(struct kvm *kvm,
-static void kvm_rmap_unlock(struct kvm_rmap_head *rmap_head,
+static void kvm_rmap_unlock(struct kvm *kvm,
+			    struct kvm_rmap_head *rmap_head,
 			    unsigned long new_val)
 {
+	lockdep_assert_held_write(&kvm->mmu_lock);
+
 	KVM_MMU_WARN_ON(new_val & KVM_RMAP_LOCKED);
 	/*
 	 * Ensure that all accesses to the rmap have completed
@@ -948,12 +953,21 @@ static unsigned long kvm_rmap_get(struct kvm_rmap_head *rmap_head)
 
 /*
  * If mmu_lock isn't held, rmaps can only locked in read-only mode.  The actual
- * locking is the same, but the caller is disallowed from modifying the rmap,
- * and so the unlock flow is a nop if the rmap is/was empty.
+ * locking is the same, but preemption needs to be manually disabled (because
+ * a spinlock isn't already held) and the caller is disallowed from modifying
+ * the rmap, and so the unlock flow is a nop if the rmap is/was empty.  Note,
+ * preemption must be disabled *before* acquiring the bitlock.  If the rmap is
+ * empty, i.e. isn't truly locked, immediately re-enable preemption.
  */
 static unsigned long kvm_rmap_lock_readonly(struct kvm_rmap_head *rmap_head)
 {
-	return __kvm_rmap_lock(rmap_head);
+	unsigned long rmap_val;
+
+	preempt_disable();
+	rmap_val = __kvm_rmap_lock(rmap_head);
+	if (!rmap_val)
+		preempt_enable();
+
+	return rmap_val;
 }
 
 static void kvm_rmap_unlock_readonly(struct kvm_rmap_head *rmap_head,
@@ -964,6 +978,7 @@ static void kvm_rmap_unlock_readonly(struct kvm_rmap_head *rmap_head,
 
 	KVM_MMU_WARN_ON(old_val != kvm_rmap_get(rmap_head));
 	atomic_long_set(&rmap_head->val, old_val);
+	preempt_enable();
 }
 
 /*
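
In case it helps to see the pattern in isolation, below is a rough standalone
sketch of the same bitlock scheme, i.e. a test-and-test-and-set spinlock with
the lock bit folded into the payload word.  It uses C11 atomics and made-up
demo_*() names, it is *not* the KVM code.  The kernel-side requirement that
the busy-wait run with preemption disabled obviously can't be shown in a
userspace toy; that's exactly what the preempt_disable()/preempt_enable()
pair in kvm_rmap_lock_readonly() provides.

	#include <stdatomic.h>
	#include <stdio.h>

	#define DEMO_LOCKED	1ul

	static inline void demo_cpu_relax(void)
	{
	#if defined(__x86_64__) || defined(__i386__)
		__asm__ __volatile__("pause");	/* roughly what cpu_relax() is on x86 */
	#endif
	}

	/* Acquire the bitlock and return the payload that was in the word. */
	static unsigned long demo_lock(_Atomic unsigned long *word)
	{
		unsigned long old_val, new_val;

		old_val = atomic_load_explicit(word, memory_order_relaxed);
		do {
			/*
			 * If the lock bit is set, it was just observed as set,
			 * so PAUSE first and only then re-read, instead of
			 * immediately re-reading a value that is known stale.
			 * This also avoids a pointless extra PAUSE after the
			 * lock is observed to be free.
			 */
			while (old_val & DEMO_LOCKED) {
				demo_cpu_relax();
				old_val = atomic_load_explicit(word, memory_order_relaxed);
			}
			new_val = old_val | DEMO_LOCKED;
		} while (!atomic_compare_exchange_weak_explicit(word, &old_val, new_val,
								memory_order_acquire,
								memory_order_relaxed));
		return old_val;
	}

	/* Release: new_val must not have DEMO_LOCKED set; the release store
	 * publishes everything done while the lock was held. */
	static void demo_unlock(_Atomic unsigned long *word, unsigned long new_val)
	{
		atomic_store_explicit(word, new_val, memory_order_release);
	}

	int main(void)
	{
		_Atomic unsigned long word = 0x1000;	/* pretend payload */
		unsigned long val;

		val = demo_lock(&word);		/* returns 0x1000, word is now 0x1001 */
		demo_unlock(&word, val);	/* word is back to 0x1000 */
		printf("payload = %#lx\n", val);
		return 0;
	}

Note the PAUSE-then-re-read ordering in the inner loop, and that a failed
compare-exchange refreshes old_val with the current value, mirroring how
atomic_long_try_cmpxchg() updates @old on failure in the kernel version.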