On Fri, Jan 10, 2025 at 3:19 PM Sean Christopherson <seanjc@xxxxxxxxxx> wrote:
>
> On Tue, Nov 05, 2024, James Houghton wrote:
> > +static unsigned long kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
> > +{
> > +        unsigned long old_val, new_val;
> > +
> > +        /*
> > +         * Elide the lock if the rmap is empty, as lockless walkers (read-only
> > +         * mode) don't need to (and can't) walk an empty rmap, nor can they add
> > +         * entries to the rmap. I.e. the only paths that process empty rmaps
> > +         * do so while holding mmu_lock for write, and are mutually exclusive.
> > +         */
> > +        old_val = atomic_long_read(&rmap_head->val);
> > +        if (!old_val)
> > +                return 0;
> > +
> > +        do {
> > +                /*
> > +                 * If the rmap is locked, wait for it to be unlocked before
> > +                 * trying acquire the lock, e.g. to bounce the cache line.
> > +                 */
> > +                while (old_val & KVM_RMAP_LOCKED) {
> > +                        old_val = atomic_long_read(&rmap_head->val);
> > +                        cpu_relax();
> > +                }
>
> As Lai Jiangshan pointed out[1][2], this should PAUSE first, then re-read the SPTE,
> and KVM needs to disable preemption while holding the lock, because this is nothing
> more than a rudimentary spinlock.

Ah! Sorry for missing this.

>
> [1] https://lore.kernel.org/all/ZrooozABEWSnwzxh@xxxxxxxxxx
> [2] https://lore.kernel.org/all/Zrt5eNArfQA7x1qj@xxxxxxxxxx
>
> I think this?
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 1a0950b77126..9dac1bbb77d4 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -873,6 +873,8 @@ static unsigned long __kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
>  {
>          unsigned long old_val, new_val;
>
> +        lockdep_assert_preemption_disabled();
> +
>          /*
>           * Elide the lock if the rmap is empty, as lockless walkers (read-only
>           * mode) don't need to (and can't) walk an empty rmap, nor can they add
> @@ -889,8 +891,8 @@ static unsigned long __kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
>                   * trying acquire the lock, e.g. to bounce the cache line.
>                   */
>                  while (old_val & KVM_RMAP_LOCKED) {
> -                        old_val = atomic_long_read(&rmap_head->val);
>                          cpu_relax();
> +                        old_val = atomic_long_read(&rmap_head->val);
>                  }
>
>                  /*
> @@ -931,6 +933,8 @@ static unsigned long kvm_rmap_lock(struct kvm *kvm,
>  static void kvm_rmap_unlock(struct kvm_rmap_head *rmap_head,
>                              unsigned long new_val)
>  {
> +        lockdep_assert_held_write(&kvm->mmu_lock);
> +
>          KVM_MMU_WARN_ON(new_val & KVM_RMAP_LOCKED);
>          /*
>           * Ensure that all accesses to the rmap have completed
> @@ -948,12 +952,21 @@ static unsigned long kvm_rmap_get(struct kvm_rmap_head *rmap_head)
>
>  /*
>   * If mmu_lock isn't held, rmaps can only locked in read-only mode. The actual
> - * locking is the same, but the caller is disallowed from modifying the rmap,
> - * and so the unlock flow is a nop if the rmap is/was empty.
> + * locking is the same, but preemption needs to be manually disabled (because
> + * a spinlock isn't already held) and the caller is disallowed from modifying
> + * the rmap, and so the unlock flow is a nop if the rmap is/was empty. Note,
> + * preemption must be disabled *before* acquiring the bitlock. If the rmap is
> + * empty, i.e. isn't truly locked, immediately re-enable preemption.
>   */
>  static unsigned long kvm_rmap_lock_readonly(struct kvm_rmap_head *rmap_head)
>  {
> -        return __kvm_rmap_lock(rmap_head);
> +        unsigned long rmap_val;
> +        preempt_disable();
> +
> +        rmap_val = __kvm_rmap_lock(rmap_head);
> +        if (!rmap_val)
> +                preempt_enable();
> +        return rmap_val;
>  }
>
>  static void kvm_rmap_unlock_readonly(struct kvm_rmap_head *rmap_head,
> @@ -964,6 +977,7 @@ static void kvm_rmap_unlock_readonly(struct kvm_rmap_head *rmap_head,
>
>          KVM_MMU_WARN_ON(old_val != kvm_rmap_get(rmap_head));
>          atomic_long_set(&rmap_head->val, old_val);
> +        preempt_enable();
>  }
>
>  /*

I don't see anything wrong with these changes. Thanks! Applied.
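
For anyone reading along, below is a rough, stand-alone sketch of the bit-spinlock
pattern being discussed, using C11 atomics in place of the kernel's
atomic_long_read()/atomic_long_try_cmpxchg()/cpu_relax() helpers. All names here
(demo_rmap_head, DEMO_LOCKED, demo_lock(), demo_unlock()) are invented for
illustration and this is not the KVM code; the kernel-only
preempt_disable()/preempt_enable() pairing has no userspace equivalent and is
only noted in comments.

/*
 * Userspace sketch of a bit-spinlock stored in the low bit of an atomic
 * unsigned long, with the "PAUSE first, then re-read" retry ordering.
 */
#include <stdatomic.h>
#include <stdio.h>

#define DEMO_LOCKED     1UL

struct demo_rmap_head {
        atomic_ulong val;
};

static unsigned long demo_lock(struct demo_rmap_head *head)
{
        unsigned long old_val, new_val;

        /* Elide the lock entirely if the value is empty, mirroring the rmap case. */
        old_val = atomic_load(&head->val);
        if (!old_val)
                return 0;

        do {
                /*
                 * While someone else holds the lock bit, spin.  In the kernel,
                 * cpu_relax() (PAUSE) goes at the top of this loop, *before*
                 * the re-read, so the value used for the retry is fresh.
                 * Preemption would also be disabled around the whole critical
                 * section in the kernel.
                 */
                while (old_val & DEMO_LOCKED) {
                        /* cpu_relax() / PAUSE would go here. */
                        old_val = atomic_load(&head->val);
                }

                new_val = old_val | DEMO_LOCKED;

                /* On failure, old_val is refreshed with the current value. */
        } while (!atomic_compare_exchange_weak(&head->val, &old_val, new_val));

        return old_val;
}

static void demo_unlock(struct demo_rmap_head *head, unsigned long new_val)
{
        /* Release ordering: prior accesses complete before the lock bit clears. */
        atomic_store_explicit(&head->val, new_val & ~DEMO_LOCKED,
                              memory_order_release);
}

int main(void)
{
        struct demo_rmap_head head;
        unsigned long val;

        atomic_init(&head.val, 0xf00dUL << 1);  /* arbitrary non-empty payload */

        val = demo_lock(&head);
        printf("locked, payload = %#lx\n", val >> 1);
        demo_unlock(&head, val);

        return 0;
}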