Re: [RFC PATCH v4 13/14] KVM: arm64: Handle guest_memfd()-backed guest page faults

Fuad Tabba <tabba@xxxxxxxxxx> · Thu, 16 Jan 2025 15:16:41 +0000

Hi Patrick,

On Thu, 16 Jan 2025 at 14:48, Patrick Roy <roypat@xxxxxxxxxxxx> wrote:
>
> On Fri, 2024-12-13 at 16:48 +0000, Fuad Tabba wrote:
> > Add arm64 support for resolving guest page faults on
> > guest_memfd() backed memslots. This support is not contingent on
> > pKVM, or other confidential computing support, and works in both
> > VHE and nVHE modes.
> >
> > Without confidential computing, this support is useful forQ
> > testing and debugging. In the future, it might also be useful
> > should a user want to use guest_memfd() for all code, whether
> > it's for a protected guest or not.
> >
> > For now, the fault granule is restricted to PAGE_SIZE.
> >
> > Signed-off-by: Fuad Tabba <tabba@xxxxxxxxxx>
> > ---
> >  arch/arm64/kvm/mmu.c | 111 ++++++++++++++++++++++++++++++++++++++++++-
> >  1 file changed, 109 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> > index 342a9bd3848f..1c4b3871967c 100644
> > --- a/arch/arm64/kvm/mmu.c
> > +++ b/arch/arm64/kvm/mmu.c
> > @@ -1434,6 +1434,107 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
> >         return vma->vm_flags & VM_MTE_ALLOWED;
> >  }
> >
> > +static int guest_memfd_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> > +                            struct kvm_memory_slot *memslot, bool fault_is_perm)
> > +{
> > +       struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
> > +       bool exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
> > +       bool logging_active = memslot_is_logging(memslot);
> > +       struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt;
> > +       enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
> > +       bool write_fault = kvm_is_write_fault(vcpu);
> > +       struct mm_struct *mm = current->mm;
> > +       gfn_t gfn = gpa_to_gfn(fault_ipa);
> > +       struct kvm *kvm = vcpu->kvm;
> > +       struct page *page;
> > +       kvm_pfn_t pfn;
> > +       int ret;
> > +
> > +       /* For now, guest_memfd() only supports PAGE_SIZE granules. */
> > +       if (WARN_ON_ONCE(fault_is_perm &&
> > +                        kvm_vcpu_trap_get_perm_fault_granule(vcpu) != PAGE_SIZE)) {
> > +               return -EFAULT;
> > +       }
> > +
> > +       VM_BUG_ON(write_fault && exec_fault);
> > +
> > +       if (fault_is_perm && !write_fault && !exec_fault) {
> > +               kvm_err("Unexpected L2 read permission error\n");
> > +               return -EFAULT;
> > +       }
> > +
> > +       /*
> > +        * Permission faults just need to update the existing leaf entry,
> > +        * and so normally don't require allocations from the memcache. The
> > +        * only exception to this is when dirty logging is enabled at runtime
> > +        * and a write fault needs to collapse a block entry into a table.
> > +        */
> > +       if (!fault_is_perm || (logging_active && write_fault)) {
> > +               ret = kvm_mmu_topup_memory_cache(memcache,
> > +                                                kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu));
> > +               if (ret)
> > +                       return ret;
> > +       }
> > +
> > +       /*
> > +        * Holds the folio lock until mapped in the guest and its refcount is
> > +        * stable, to avoid races with paths that check if the folio is mapped
> > +        * by the host.
> > +        */
> > +       ret = kvm_gmem_get_pfn_locked(kvm, memslot, gfn, &pfn, &page, NULL);
> > +       if (ret)
> > +               return ret;
> > +
> > +       if (!kvm_slot_gmem_is_guest_mappable(memslot, gfn)) {
> > +               ret = -EAGAIN;
> > +               goto unlock_page;
> > +       }
> > +
> > +       /*
> > +        * Once it's faulted in, a guest_memfd() page will stay in memory.
> > +        * Therefore, count it as locked.
> > +        */
> > +       if (!fault_is_perm) {
> > +               ret = account_locked_vm(mm, 1, true);
> > +               if (ret)
> > +                       goto unlock_page;
> > +       }
> > +
> > +       read_lock(&kvm->mmu_lock);
> > +       if (write_fault)
> > +               prot |= KVM_PGTABLE_PROT_W;
> > +
> > +       if (exec_fault)
> > +               prot |= KVM_PGTABLE_PROT_X;
> > +
> > +       if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC))
> > +               prot |= KVM_PGTABLE_PROT_X;
> > +
> > +       /*
> > +        * Under the premise of getting a FSC_PERM fault, we just need to relax
> > +        * permissions.
> > +        */
> > +       if (fault_is_perm)
> > +               ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
> > +       else
> > +               ret = kvm_pgtable_stage2_map(pgt, fault_ipa, PAGE_SIZE,
> > +                                       __pfn_to_phys(pfn), prot,
> > +                                       memcache,
> > +                                       KVM_PGTABLE_WALK_HANDLE_FAULT |
> > +                                       KVM_PGTABLE_WALK_SHARED);
> > +
> > +       kvm_release_faultin_page(kvm, page, !!ret, write_fault);
> > +       read_unlock(&kvm->mmu_lock);
> > +
> > +       if (ret && !fault_is_perm)
> > +               account_locked_vm(mm, 1, false);
> > +unlock_page:
> > +       unlock_page(page);
> > +       put_page(page);
>
> There's a double-free of `page` here, as kvm_release_faultin_page
> already calls put_page. I fixed it up locally with
>
> +       unlock_page(page);
>         kvm_release_faultin_page(kvm, page, !!ret, write_fault);
>         read_unlock(&kvm->mmu_lock);
>
>         if (ret && !fault_is_perm)
>                 account_locked_vm(mm, 1, false);
> +       goto out;
> +
>  unlock_page:
>         unlock_page(page);
>         put_page(page);
> -
> +out:
>         return ret != -EAGAIN ? ret : 0;
>  }
>
> which I'm admittedly not sure is correct either because now the locks
> don't get released in reverse order of acquisition, but with this I
> was able to boot simple VMs.

Thanks for that. You're right, I broke this code right before sending
out the series while fixing a merge conflict. have prepared a new
patch series (rebased on Linux 6.13-rc7), with this redone to be part
of  user_mem_abort(), as opposed to being in its own function. Makes
the code cleaner more maintainable.

> > +
> > +       return ret != -EAGAIN ? ret : 0;
> > +}
> > +
> >  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >                           struct kvm_s2_trans *nested,
> >                           struct kvm_memory_slot *memslot, unsigned long hva,
> > @@ -1900,8 +2001,14 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
> >                 goto out_unlock;
> >         }
> >
> > -       ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
> > -                            esr_fsc_is_permission_fault(esr));
> > +       if (kvm_slot_can_be_private(memslot)) {
>
> For my setup, I needed
>
> if (kvm_mem_is_private(vcpu->kvm, gfn))
>
> here instead, because I am making use of KVM_GENERIC_MEMORY_ATTRIBUTES,
> and  had a memslot with the `KVM_MEM_GUEST_MEMFD` flag set, but whose
> gfn range wasn't actually set to KVM_MEMORY_ATTRIBUTE_PRIVATE.
>
> If I'm reading patch 12 correctly, your memslots always set only one of
> userspace_addr or guest_memfd, and the stage 2 table setup simply checks
> which one is the case to decide what to fault in, so maybe to support
> both cases, this check should be
>
> if (kvm_mem_is_private(vcpu->kvm, gfn) || (kvm_slot_can_be_private(memslot) && !memslot->userspace_addr)
>
> ?

I've actually missed supporting both cases, and I think your
suggestion is the right way to do it. I'll fix it in the respin.

Cheers,
/fuad

>
> [1]: https://lore.kernel.org/all/20240801090117.3841080-1-tabba@xxxxxxxxxx/
>
> > +               ret = guest_memfd_abort(vcpu, fault_ipa, memslot,
> > +                                       esr_fsc_is_permission_fault(esr));
> > +       } else {
> > +               ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
> > +                                    esr_fsc_is_permission_fault(esr));
> > +       }
> > +
> >         if (ret == 0)
> >                 ret = 1;
> >  out:
> > --
> > 2.47.1.613.gc27f4b7a9f-goog
>
> Best,
> Patrick
>