On Tue, 4 Jul 2023 16:50:47 +0900 David Stevens <stevensd@xxxxxxxxxxxx> wrote: > From: David Stevens <stevensd@xxxxxxxxxxxx> > > Introduce __kvm_follow_pfn, which will replace __gfn_to_pfn_memslot. > __kvm_follow_pfn refactors the old API's arguments into a struct and, > where possible, combines the boolean arguments into a single flags > argument. > > Signed-off-by: David Stevens <stevensd@xxxxxxxxxxxx> > --- > include/linux/kvm_host.h | 16 ++++ > virt/kvm/kvm_main.c | 171 ++++++++++++++++++++++----------------- > virt/kvm/kvm_mm.h | 3 +- > virt/kvm/pfncache.c | 8 +- > 4 files changed, 122 insertions(+), 76 deletions(-) > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 9d3ac7720da9..ef2763c2b12e 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -97,6 +97,7 @@ > #define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1) > #define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 2) > #define KVM_PFN_ERR_SIGPENDING (KVM_PFN_ERR_MASK + 3) > +#define KVM_PFN_ERR_NEEDS_IO (KVM_PFN_ERR_MASK + 4) > > /* > * error pfns indicate that the gfn is in slot but faild to > @@ -1156,6 +1157,21 @@ unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn, > void kvm_release_page_clean(struct page *page); > void kvm_release_page_dirty(struct page *page); > > +struct kvm_follow_pfn { > + const struct kvm_memory_slot *slot; > + gfn_t gfn; > + unsigned int flags; > + bool atomic; > + /* Allow a read fault to create a writeable mapping. */ > + bool allow_write_mapping; > + > + /* Outputs of __kvm_follow_pfn */ > + hva_t hva; > + bool writable; > +}; > + > +kvm_pfn_t __kvm_follow_pfn(struct kvm_follow_pfn *foll); > + > kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); > kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, > bool *writable); > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index 371bd783ff2b..b13f22861d2f 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -2486,24 +2486,22 @@ static inline int check_user_page_hwpoison(unsigned long addr) > * true indicates success, otherwise false is returned. It's also the > * only part that runs if we can in atomic context. > */ > -static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, > - bool *writable, kvm_pfn_t *pfn) > +static bool hva_to_pfn_fast(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn) > { > struct page *page[1]; > + bool write_fault = foll->flags & FOLL_WRITE; > > /* > * Fast pin a writable pfn only if it is a write fault request > * or the caller allows to map a writable pfn for a read fault > * request. > */ > - if (!(write_fault || writable)) > + if (!(write_fault || foll->allow_write_mapping)) > return false; > > - if (get_user_page_fast_only(addr, FOLL_WRITE, page)) { > + if (get_user_page_fast_only(foll->hva, FOLL_WRITE, page)) { > *pfn = page_to_pfn(page[0]); > - > - if (writable) > - *writable = true; > + foll->writable = foll->allow_write_mapping; > return true; > } > > @@ -2514,35 +2512,26 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, > * The slow path to get the pfn of the specified host virtual address, > * 1 indicates success, -errno is returned if error is detected. > */ > -static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, > - bool interruptible, bool *writable, kvm_pfn_t *pfn) > +static int hva_to_pfn_slow(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn) > { > - unsigned int flags = FOLL_HWPOISON; > + unsigned int flags = FOLL_HWPOISON | FOLL_GET | foll->flags; > struct page *page; > int npages; > > might_sleep(); > > - if (writable) > - *writable = write_fault; > - > - if (write_fault) > - flags |= FOLL_WRITE; > - if (async) > - flags |= FOLL_NOWAIT; > - if (interruptible) > - flags |= FOLL_INTERRUPTIBLE; > - > - npages = get_user_pages_unlocked(addr, 1, &page, flags); > + npages = get_user_pages_unlocked(foll->hva, 1, &page, flags); > if (npages != 1) > return npages; > > + foll->writable = (foll->flags & FOLL_WRITE) && foll->allow_write_mapping; > + > /* map read fault as writable if possible */ > - if (unlikely(!write_fault) && writable) { > + if (unlikely(!foll->writable) && foll->allow_write_mapping) { > struct page *wpage; > > - if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) { > - *writable = true; > + if (get_user_page_fast_only(foll->hva, FOLL_WRITE, &wpage)) { > + foll->writable = true; > put_page(page); > page = wpage; > } > @@ -2572,23 +2561,23 @@ static int kvm_try_get_pfn(kvm_pfn_t pfn) > return get_page_unless_zero(page); > } > > -static int hva_to_pfn_remapped(struct vm_area_struct *vma, > - unsigned long addr, bool write_fault, > - bool *writable, kvm_pfn_t *p_pfn) > +static int hva_to_pfn_remapped(struct vm_area_struct *vma, struct kvm_follow_pfn *foll, > + kvm_pfn_t *p_pfn) > { > kvm_pfn_t pfn; > pte_t *ptep; > spinlock_t *ptl; > + bool write_fault = foll->flags & FOLL_WRITE; > int r; > > - r = follow_pte(vma->vm_mm, addr, &ptep, &ptl); > + r = follow_pte(vma->vm_mm, foll->hva, &ptep, &ptl); > if (r) { > /* > * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does > * not call the fault handler, so do it here. > */ > bool unlocked = false; > - r = fixup_user_fault(current->mm, addr, > + r = fixup_user_fault(current->mm, foll->hva, > (write_fault ? FAULT_FLAG_WRITE : 0), > &unlocked); > if (unlocked) > @@ -2596,7 +2585,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, > if (r) > return r; > > - r = follow_pte(vma->vm_mm, addr, &ptep, &ptl); > + r = follow_pte(vma->vm_mm, foll->hva, &ptep, &ptl); > if (r) > return r; > } > @@ -2606,8 +2595,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, > goto out; > } > > - if (writable) > - *writable = pte_write(*ptep); > + foll->writable = pte_write(*ptep) && foll->allow_write_mapping; > pfn = pte_pfn(*ptep); > > /* > @@ -2652,24 +2640,22 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, > * 2): @write_fault = false && @writable, @writable will tell the caller > * whether the mapping is writable. > */ > -kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible, > - bool *async, bool write_fault, bool *writable) > +kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *foll) > { > struct vm_area_struct *vma; > kvm_pfn_t pfn; > int npages, r; > > /* we can do it either atomically or asynchronously, not both */ > - BUG_ON(atomic && async); > + BUG_ON(foll->atomic && (foll->flags & FOLL_NOWAIT)); > > - if (hva_to_pfn_fast(addr, write_fault, writable, &pfn)) > + if (hva_to_pfn_fast(foll, &pfn)) > return pfn; > > - if (atomic) > + if (foll->atomic) > return KVM_PFN_ERR_FAULT; > > - npages = hva_to_pfn_slow(addr, async, write_fault, interruptible, > - writable, &pfn); > + npages = hva_to_pfn_slow(foll, &pfn); > if (npages == 1) > return pfn; > if (npages == -EINTR) > @@ -2677,83 +2663,122 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible, > > mmap_read_lock(current->mm); > if (npages == -EHWPOISON || > - (!async && check_user_page_hwpoison(addr))) { > + (!(foll->flags & FOLL_NOWAIT) && check_user_page_hwpoison(foll->hva))) { > pfn = KVM_PFN_ERR_HWPOISON; > goto exit; > } > > retry: > - vma = vma_lookup(current->mm, addr); > + vma = vma_lookup(current->mm, foll->hva); > > if (vma == NULL) > pfn = KVM_PFN_ERR_FAULT; > else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) { > - r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn); > + r = hva_to_pfn_remapped(vma, foll, &pfn); > if (r == -EAGAIN) > goto retry; > if (r < 0) > pfn = KVM_PFN_ERR_FAULT; > } else { > - if (async && vma_is_valid(vma, write_fault)) > - *async = true; > - pfn = KVM_PFN_ERR_FAULT; > + if ((foll->flags & FOLL_NOWAIT) && > + vma_is_valid(vma, foll->flags & FOLL_WRITE)) > + pfn = KVM_PFN_ERR_NEEDS_IO; > + else > + pfn = KVM_PFN_ERR_FAULT; > } > exit: > mmap_read_unlock(current->mm); > return pfn; > } > > -kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, > - bool atomic, bool interruptible, bool *async, > - bool write_fault, bool *writable, hva_t *hva) > +kvm_pfn_t __kvm_follow_pfn(struct kvm_follow_pfn *foll) > { > - unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); > - > - if (hva) > - *hva = addr; > + foll->hva = __gfn_to_hva_many(foll->slot, foll->gfn, NULL, > + foll->flags & FOLL_WRITE); > > - if (addr == KVM_HVA_ERR_RO_BAD) { > - if (writable) > - *writable = false; > + if (foll->hva == KVM_HVA_ERR_RO_BAD) > return KVM_PFN_ERR_RO_FAULT; > - } > Can you explain why updating foll->writable = false (previously *writeable = false) is omitted here? In the caller where the struct kvm_follow_pfn is initialized, e.g. __gfn_to_pfn_memslot()/gfn_to_pfn_prot(), .writable is not initialized. IIUC, they expect __kvm_follow_pfn() to update it and return .writable to upper caller. As the one of the output, it would be better to initalize it either in the caller or update it in __kvm_follow_pfn(). Or __gfn_to_pfn_memslot()/gfn_to_pfn_prot() will return random data in the stack to the caller via bool *writable. It doesn't sound nice. BTW: It seems both "writable" and "writeable" are used in this patch. I am wondering maybe we can correct them. > - if (kvm_is_error_hva(addr)) { > - if (writable) > - *writable = false; > + if (kvm_is_error_hva(foll->hva)) > return KVM_PFN_NOSLOT; > - } > > - /* Do not map writable pfn in the readonly memslot. */ > - if (writable && memslot_is_readonly(slot)) { > - *writable = false; > - writable = NULL; > - } > + if (memslot_is_readonly(foll->slot)) > + foll->allow_write_mapping = false; > + > + return hva_to_pfn(foll); > +} > +EXPORT_SYMBOL_GPL(__kvm_follow_pfn); > > - return hva_to_pfn(addr, atomic, interruptible, async, write_fault, > - writable); > +kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, > + bool atomic, bool interruptible, bool *async, > + bool write_fault, bool *writable, hva_t *hva) > +{ > + kvm_pfn_t pfn; > + struct kvm_follow_pfn foll = { > + .slot = slot, > + .gfn = gfn, > + .flags = 0, > + .atomic = atomic, > + .allow_write_mapping = !!writable, > + }; > + > + if (write_fault) > + foll.flags |= FOLL_WRITE; > + if (async) > + foll.flags |= FOLL_NOWAIT; > + if (interruptible) > + foll.flags |= FOLL_INTERRUPTIBLE; > + > + pfn = __kvm_follow_pfn(&foll); > + if (pfn == KVM_PFN_ERR_NEEDS_IO) { > + *async = true; > + pfn = KVM_PFN_ERR_FAULT; > + } > + if (hva) > + *hva = foll.hva; > + if (writable) > + *writable = foll.writable; > + return pfn; > } > EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); > > kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, > bool *writable) > { > - return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false, > - NULL, write_fault, writable, NULL); > + kvm_pfn_t pfn; > + struct kvm_follow_pfn foll = { > + .slot = gfn_to_memslot(kvm, gfn), > + .gfn = gfn, > + .flags = write_fault ? FOLL_WRITE : 0, > + .allow_write_mapping = !!writable, > + }; > + pfn = __kvm_follow_pfn(&foll); > + if (writable) > + *writable = foll.writable; > + return pfn; > } > EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); > > kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) > { > - return __gfn_to_pfn_memslot(slot, gfn, false, false, NULL, true, > - NULL, NULL); > + struct kvm_follow_pfn foll = { > + .slot = slot, > + .gfn = gfn, > + .flags = FOLL_WRITE, > + }; > + return __kvm_follow_pfn(&foll); > } > EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); > > kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn) > { > - return __gfn_to_pfn_memslot(slot, gfn, true, false, NULL, true, > - NULL, NULL); > + struct kvm_follow_pfn foll = { > + .slot = slot, > + .gfn = gfn, > + .flags = FOLL_WRITE, > + .atomic = true, > + }; > + return __kvm_follow_pfn(&foll); > } > EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); > > diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h > index 180f1a09e6ba..ed896aee5396 100644 > --- a/virt/kvm/kvm_mm.h > +++ b/virt/kvm/kvm_mm.h > @@ -20,8 +20,7 @@ > #define KVM_MMU_UNLOCK(kvm) spin_unlock(&(kvm)->mmu_lock) > #endif /* KVM_HAVE_MMU_RWLOCK */ > > -kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible, > - bool *async, bool write_fault, bool *writable); > +kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *foll); > > #ifdef CONFIG_HAVE_KVM_PFNCACHE > void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, > diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c > index 2d6aba677830..e3fefa753a51 100644 > --- a/virt/kvm/pfncache.c > +++ b/virt/kvm/pfncache.c > @@ -144,6 +144,12 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) > kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT; > void *new_khva = NULL; > unsigned long mmu_seq; > + struct kvm_follow_pfn foll = { > + .slot = gpc->memslot, > + .gfn = gpa_to_gfn(gpc->gpa), > + .flags = FOLL_WRITE, > + .hva = gpc->uhva, > + }; > > lockdep_assert_held(&gpc->refresh_lock); > > @@ -183,7 +189,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) > } > > /* We always request a writeable mapping */ > - new_pfn = hva_to_pfn(gpc->uhva, false, false, NULL, true, NULL); > + new_pfn = hva_to_pfn(&foll); > if (is_error_noslot_pfn(new_pfn)) > goto out_error; >