On Tuesday 11 Feb 2025 at 16:13:27 (+0000), Fuad Tabba wrote:
> Hi Quentin,
>
> On Tue, 11 Feb 2025 at 15:57, Quentin Perret <qperret@xxxxxxxxxx> wrote:
> >
> > Hey Fuad,
> >
> > On Tuesday 11 Feb 2025 at 12:11:24 (+0000), Fuad Tabba wrote:
> > > Add arm64 support for handling guest page faults on guest_memfd
> > > backed memslots.
> > >
> > > For now, the fault granule is restricted to PAGE_SIZE.
> > >
> > > Signed-off-by: Fuad Tabba <tabba@xxxxxxxxxx>
> > > ---
> > >  arch/arm64/kvm/mmu.c     | 84 ++++++++++++++++++++++++++--------------
> > >  include/linux/kvm_host.h |  5 +++
> > >  virt/kvm/kvm_main.c      |  5 ---
> > >  3 files changed, 61 insertions(+), 33 deletions(-)
> > >
> > > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> > > index b6c0acb2311c..305060518766 100644
> > > --- a/arch/arm64/kvm/mmu.c
> > > +++ b/arch/arm64/kvm/mmu.c
> > > @@ -1454,6 +1454,33 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
> > >  	return vma->vm_flags & VM_MTE_ALLOWED;
> > >  }
> > >
> > > +static kvm_pfn_t faultin_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> > > +			     gfn_t gfn, bool write_fault, bool *writable,
> > > +			     struct page **page, bool is_private)
> > > +{
> > > +	kvm_pfn_t pfn;
> > > +	int ret;
> > > +
> > > +	if (!is_private)
> > > +		return __kvm_faultin_pfn(slot, gfn, write_fault ? FOLL_WRITE : 0, writable, page);
> > > +
> > > +	*writable = false;
> > > +
> > > +	if (WARN_ON_ONCE(write_fault && memslot_is_readonly(slot)))
> > > +		return KVM_PFN_ERR_NOSLOT_MASK;
> >
> > I believe this check is superfluous, we should decide to report an MMIO
> > exit to userspace for write faults to RO memslots and not get anywhere
> > near user_mem_abort(). And nit but the error code should probably be
> > KVM_PFN_ERR_RO_FAULT or something instead?
>
> I tried to replicate the behavior of __kvm_faultin_pfn() here (but got
> the wrong error!). I think you're right though that in the arm64 case,
> this check isn't needed. Should I fix the return error and keep the
> warning though?

__kvm_faultin_pfn() will just set *writable to false if it finds an RO
memslot apparently, not return an error. So I'd vote for dropping that
check so we align with that behaviour.

> > > +
> > > +	ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, page, NULL);
> > > +	if (!ret) {
> > > +		*writable = write_fault;
> >
> > In normal KVM, if we're not dirty logging we'll actively map the page as
> > writable if both the memslot and the userspace mappings are writable.
> > With gmem, the latter doesn't make much sense, but essentially the
> > underlying page should really be writable (e.g. no CoW getting in the
> > way and such?). If so, then perhaps make this
> >
> > 	*writable = !memslot_is_readonly(slot);
> >
> > Wdyt?
>
> Ack.
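For reference, a rough sketch of what faultin_pfn() could look like with
both points above folded in: the RO-write warning dropped, and the
writability of gmem-backed faults derived from the memslot flags rather
than from the fault type. Illustrative only, not a tested change; it
reuses the helpers from the quoted patch, and the error code used for
the generic failure path is an assumption since the thread leaves it
open.

static kvm_pfn_t faultin_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
			     gfn_t gfn, bool write_fault, bool *writable,
			     struct page **page, bool is_private)
{
	kvm_pfn_t pfn;
	int ret;

	if (!is_private)
		return __kvm_faultin_pfn(slot, gfn, write_fault ? FOLL_WRITE : 0,
					 writable, page);

	ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, page, NULL);
	if (!ret) {
		/* The underlying gmem page is writable unless the memslot is RO. */
		*writable = !memslot_is_readonly(slot);
		return pfn;
	}

	*writable = false;

	if (ret == -EHWPOISON)
		return KVM_PFN_ERR_HWPOISON;

	/* Generic failure; the exact error code was still open upthread. */
	return KVM_PFN_ERR_FAULT;
}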
>
> > > +		return pfn;
> > > +	}
> > > +
> > > +	if (ret == -EHWPOISON)
> > > +		return KVM_PFN_ERR_HWPOISON;
> > > +
> > > +	return KVM_PFN_ERR_NOSLOT_MASK;
> > > +}
> > > +
> > >  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> > >  			  struct kvm_s2_trans *nested,
> > >  			  struct kvm_memory_slot *memslot, unsigned long hva,
> > > @@ -1461,25 +1488,26 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> > >  {
> > >  	int ret = 0;
> > >  	bool write_fault, writable;
> > > -	bool exec_fault, mte_allowed;
> > > +	bool exec_fault, mte_allowed = false;
> > >  	bool device = false, vfio_allow_any_uc = false;
> > >  	unsigned long mmu_seq;
> > >  	phys_addr_t ipa = fault_ipa;
> > >  	struct kvm *kvm = vcpu->kvm;
> > > -	struct vm_area_struct *vma;
> > > +	struct vm_area_struct *vma = NULL;
> > >  	short vma_shift;
> > >  	void *memcache;
> > > -	gfn_t gfn;
> > > +	gfn_t gfn = ipa >> PAGE_SHIFT;
> > >  	kvm_pfn_t pfn;
> > >  	bool logging_active = memslot_is_logging(memslot);
> > > -	bool force_pte = logging_active || is_protected_kvm_enabled();
> > > -	long vma_pagesize, fault_granule;
> > > +	bool is_private = kvm_mem_is_private(kvm, gfn);
> >
> > Just trying to understand the locking rule for the xarray behind this.
> > Is it kvm->srcu that protects it for reads here? Something else?
>
> I'm not sure I follow. Which xarray are you referring to?

Sorry, yes, that wasn't clear. I meant that kvm_mem_is_private() calls
kvm_get_memory_attributes() which indexes kvm->mem_attr_array. The
comment in struct kvm indicates that this xarray is protected by RCU for
readers, so I was just checking if we were relying on
kvm_handle_guest_abort() to take srcu_read_lock(&kvm->srcu) for us, or
if there was something else more subtle here.

Cheers,
Quentin

>
> > > +	bool force_pte = logging_active || is_private || is_protected_kvm_enabled();
> > > +	long vma_pagesize, fault_granule = PAGE_SIZE;
> > >  	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
> > >  	struct kvm_pgtable *pgt;
> > >  	struct page *page;
> > >  	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
> > >
> > > -	if (fault_is_perm)
> > > +	if (fault_is_perm && !is_private)
> >
> > Nit: not strictly necessary I think.
>
> You're right.
>
> Thanks,
> /fuad
>
> > >  		fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
> > >  	write_fault = kvm_is_write_fault(vcpu);
> > >  	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
> > > @@ -1510,24 +1538,30 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> > >  		return ret;
> > >  	}
> > >
> > > +	mmap_read_lock(current->mm);
> > > +
> > >  	/*
> > >  	 * Let's check if we will get back a huge page backed by hugetlbfs, or
> > >  	 * get block mapping for device MMIO region.
> > >  	 */
> > > -	mmap_read_lock(current->mm);
> > > -	vma = vma_lookup(current->mm, hva);
> > > -	if (unlikely(!vma)) {
> > > -		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
> > > -		mmap_read_unlock(current->mm);
> > > -		return -EFAULT;
> > > -	}
> > > +	if (!is_private) {
> > > +		vma = vma_lookup(current->mm, hva);
> > > +		if (unlikely(!vma)) {
> > > +			kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
> > > +			mmap_read_unlock(current->mm);
> > > +			return -EFAULT;
> > > +		}
> > >
> > > -	/*
> > > -	 * logging_active is guaranteed to never be true for VM_PFNMAP
> > > -	 * memslots.
> > > -	 */
> > > -	if (WARN_ON_ONCE(logging_active && (vma->vm_flags & VM_PFNMAP)))
> > > -		return -EFAULT;
> > > +		/*
> > > +		 * logging_active is guaranteed to never be true for VM_PFNMAP
> > > +		 * memslots.
> > > +		 */
> > > +		if (WARN_ON_ONCE(logging_active && (vma->vm_flags & VM_PFNMAP)))
> > > +			return -EFAULT;
> > > +
> > > +		vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
> > > +		mte_allowed = kvm_vma_mte_allowed(vma);
> > > +	}
> > >
> > >  	if (force_pte)
> > >  		vma_shift = PAGE_SHIFT;
> > > @@ -1597,18 +1631,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> > >  		ipa &= ~(vma_pagesize - 1);
> > >  	}
> > >
> > > -	gfn = ipa >> PAGE_SHIFT;
> > > -	mte_allowed = kvm_vma_mte_allowed(vma);
> > > -
> > > -	vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
> > > -
> > >  	/* Don't use the VMA after the unlock -- it may have vanished */
> > >  	vma = NULL;
> > >
> > >  	/*
> > >  	 * Read mmu_invalidate_seq so that KVM can detect if the results of
> > > -	 * vma_lookup() or __kvm_faultin_pfn() become stale prior to
> > > -	 * acquiring kvm->mmu_lock.
> > > +	 * vma_lookup() or faultin_pfn() become stale prior to acquiring
> > > +	 * kvm->mmu_lock.
> > >  	 *
> > >  	 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
> > >  	 * with the smp_wmb() in kvm_mmu_invalidate_end().
> > > @@ -1616,8 +1645,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> > >  	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
> > >  	mmap_read_unlock(current->mm);
> > >
> > > -	pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
> > > -				&writable, &page);
> > > +	pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_private);
> > >  	if (pfn == KVM_PFN_ERR_HWPOISON) {
> > >  		kvm_send_hwpoison_signal(hva, vma_shift);
> > >  		return 0;
> > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > index 39fd6e35c723..415c6274aede 100644
> > > --- a/include/linux/kvm_host.h
> > > +++ b/include/linux/kvm_host.h
> > > @@ -1882,6 +1882,11 @@ static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
> > >  	return gfn_to_memslot(kvm, gfn)->id;
> > >  }
> > >
> > > +static inline bool memslot_is_readonly(const struct kvm_memory_slot *slot)
> > > +{
> > > +	return slot->flags & KVM_MEM_READONLY;
> > > +}
> > > +
> > >  static inline gfn_t
> > >  hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot)
> > >  {
> > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > index 38f0f402ea46..3e40acb9f5c0 100644
> > > --- a/virt/kvm/kvm_main.c
> > > +++ b/virt/kvm/kvm_main.c
> > > @@ -2624,11 +2624,6 @@ unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
> > >  	return size;
> > >  }
> > >
> > > -static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
> > > -{
> > > -	return slot->flags & KVM_MEM_READONLY;
> > > -}
> > > -
> > >  static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
> > >  				       gfn_t *nr_pages, bool write)
> > >  {
> > > --
> > > 2.48.1.502.g6dc24dfdaf-goog
> > >
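As an aside on the mem_attr_array locking question above, here is a
minimal sketch of the reader-side pattern being asked about, assuming
kvm->mem_attr_array follows the usual SRCU rules for readers. It is
illustrative only: the helper name is made up, and whether the arm64
abort path already provides the SRCU read-side section (e.g. around
kvm_handle_guest_abort()) is exactly the open question in the thread.

static bool example_gfn_is_private(struct kvm *kvm, gfn_t gfn)
{
	bool private;
	int idx;

	/* Readers of kvm->mem_attr_array must sit in an SRCU read-side section. */
	idx = srcu_read_lock(&kvm->srcu);
	/* kvm_mem_is_private() -> kvm_get_memory_attributes() -> xa_load(&kvm->mem_attr_array, gfn) */
	private = kvm_mem_is_private(kvm, gfn);
	srcu_read_unlock(&kvm->srcu, idx);

	return private;
}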