On Fri, Aug 09, 2024, Peter Xu wrote:
> Introduce a pair of APIs to follow pfn mappings to get entry information.
> It's very similar to what follow_pte() does before, but different in that
> it recognizes huge pfn mappings.

...

> +int follow_pfnmap_start(struct follow_pfnmap_args *args);
> +void follow_pfnmap_end(struct follow_pfnmap_args *args);

I find the start()+end() terminology to be unintuitive.  E.g. I had to look at
the implementation to understand why KVM invokes fixup_user_fault() if
follow_pfnmap_start() failed.

What about follow_pfnmap_and_lock()?  And then maybe follow_pfnmap_unlock()?
Though that second one reads a little weird.

> + * Return: zero on success, -ve otherwise.

"-ve"?

> +int follow_pfnmap_start(struct follow_pfnmap_args *args)
> +{
> +	struct vm_area_struct *vma = args->vma;
> +	unsigned long address = args->address;
> +	struct mm_struct *mm = vma->vm_mm;
> +	spinlock_t *lock;
> +	pgd_t *pgdp;
> +	p4d_t *p4dp, p4d;
> +	pud_t *pudp, pud;
> +	pmd_t *pmdp, pmd;
> +	pte_t *ptep, pte;
> +
> +	pfnmap_lockdep_assert(vma);
> +
> +	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
> +		goto out;
> +
> +	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
> +		goto out;

Why use goto instead of simply returning?  E.g.

	return -EINVAL;

That's relevant because I think the cases where no PxE is found should return
-ENOENT, not -EINVAL.  E.g. if the caller doesn't precheck, then it can bail
immediately on EINVAL, but know that it's worth trying to fault-in the pfn on
ENOENT.

> +retry:
> +	pgdp = pgd_offset(mm, address);
> +	if (pgd_none(*pgdp) || unlikely(pgd_bad(*pgdp)))
> +		goto out;
> +
> +	p4dp = p4d_offset(pgdp, address);
> +	p4d = READ_ONCE(*p4dp);
> +	if (p4d_none(p4d) || unlikely(p4d_bad(p4d)))
> +		goto out;
> +
> +	pudp = pud_offset(p4dp, address);
> +	pud = READ_ONCE(*pudp);
> +	if (pud_none(pud))
> +		goto out;
> +	if (pud_leaf(pud)) {
> +		lock = pud_lock(mm, pudp);
> +		if (!unlikely(pud_leaf(pud))) {
> +			spin_unlock(lock);
> +			goto retry;
> +		}
> +		pfnmap_args_setup(args, lock, NULL, pud_pgprot(pud),
> +				  pud_pfn(pud), PUD_MASK, pud_write(pud),
> +				  pud_special(pud));
> +		return 0;
> +	}
> +
> +	pmdp = pmd_offset(pudp, address);
> +	pmd = pmdp_get_lockless(pmdp);
> +	if (pmd_leaf(pmd)) {
> +		lock = pmd_lock(mm, pmdp);
> +		if (!unlikely(pmd_leaf(pmd))) {
> +			spin_unlock(lock);
> +			goto retry;
> +		}
> +		pfnmap_args_setup(args, lock, NULL, pmd_pgprot(pmd),
> +				  pmd_pfn(pmd), PMD_MASK, pmd_write(pmd),
> +				  pmd_special(pmd));
> +		return 0;
> +	}
> +
> +	ptep = pte_offset_map_lock(mm, pmdp, address, &lock);
> +	if (!ptep)
> +		goto out;
> +	pte = ptep_get(ptep);
> +	if (!pte_present(pte))
> +		goto unlock;
> +	pfnmap_args_setup(args, lock, ptep, pte_pgprot(pte),
> +			  pte_pfn(pte), PAGE_MASK, pte_write(pte),
> +			  pte_special(pte));
> +	return 0;
> +unlock:
> +	pte_unmap_unlock(ptep, lock);
> +out:
> +	return -EINVAL;
> +}
> +EXPORT_SYMBOL_GPL(follow_pfnmap_start);
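
To make the ENOENT angle concrete, here's roughly the caller-side pattern I
have in mind if the EINVAL/ENOENT split were adopted.  Completely untested
sketch, not a real diff against the KVM patch; "args", "mm", "addr" and the
fault flags are just placeholders:

	r = follow_pfnmap_start(&args);
	if (r == -ENOENT) {
		/*
		 * The VMA is a pfnmap, but nothing is mapped at this address;
		 * try to fault the pfn in and then redo the walk.
		 */
		r = fixup_user_fault(mm, addr, FAULT_FLAG_WRITE, NULL);
		if (!r)
			r = follow_pfnmap_start(&args);
	}
	if (r)
		return r;	/* -EINVAL: bad VMA/address, retrying can't help */

	/* Success: args.pfn etc. are valid until follow_pfnmap_end(&args). */

That way the caller doesn't have to duplicate the VM_IO/VM_PFNMAP checks just
to figure out whether faulting the page in can possibly help.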