On 10/21/22 16:36, James Houghton wrote:
> This enables high-granularity mapping support in GUP.
> 
> One important change here is that, before, we never needed to grab the
> VMA lock, but now, to prevent someone from collapsing the page tables
> out from under us, we grab it for reading when doing high-granularity PT
> walks.

Once again, I think Peter's series will already take the vma lock here.

> In case it is confusing, pfn_offset is the offset (in PAGE_SIZE units)
> that vaddr points to within the subpage that hpte points to.
> 
> Signed-off-by: James Houghton <jthoughton@xxxxxxxxxx>
> ---
>  mm/hugetlb.c | 76 ++++++++++++++++++++++++++++++++++++----------------
>  1 file changed, 53 insertions(+), 23 deletions(-)
> 
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 2d096cef53cd..d76ab32fb6d3 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -6382,11 +6382,9 @@ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
>          }
>  }
>  
> -static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
> +static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t pteval,
>                                                 bool *unshare)
>  {
> -        pte_t pteval = huge_ptep_get(pte);
> -
>          *unshare = false;
>          if (is_swap_pte(pteval))
>                  return true;
> @@ -6478,12 +6476,20 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>          struct hstate *h = hstate_vma(vma);
>          int err = -EFAULT, refs;
>  
> +        /*
> +         * Grab the VMA lock for reading now so no one can collapse the page
> +         * table from under us.
> +         */
> +        hugetlb_vma_lock_read(vma);
> +
>          while (vaddr < vma->vm_end && remainder) {
> -                pte_t *pte;
> +                pte_t *ptep, pte;

Thanks, that really would be better as ptep in the existing code.

>                  spinlock_t *ptl = NULL;
>                  bool unshare = false;
>                  int absent;
> -                struct page *page;
> +                unsigned long pages_per_hpte;
> +                struct page *page, *subpage;
> +                struct hugetlb_pte hpte;
>  
>                  /*
>                   * If we have a pending SIGKILL, don't keep faulting pages and
> @@ -6499,13 +6505,22 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>                   * each hugepage.  We have to make sure we get the
>                   * first, for the page indexing below to work.
>                   *
> -                 * Note that page table lock is not held when pte is null.
> +                 * Note that page table lock is not held when ptep is null.
>                   */
> -                pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
> -                                      huge_page_size(h));
> -                if (pte)
> -                        ptl = huge_pte_lock(h, mm, pte);
> -                absent = !pte || huge_pte_none(huge_ptep_get(pte));
> +                ptep = huge_pte_offset(mm, vaddr & huge_page_mask(h),
> +                                       huge_page_size(h));
> +                if (ptep) {
> +                        hugetlb_pte_populate(&hpte, ptep, huge_page_shift(h),
> +                                        hpage_size_to_level(huge_page_size(h)));
> +                        hugetlb_hgm_walk(mm, vma, &hpte, vaddr,
> +                                        PAGE_SIZE,
> +                                        /*stop_at_none=*/true);
> +                        ptl = hugetlb_pte_lock(mm, &hpte);
> +                        ptep = hpte.ptep;
> +                        pte = huge_ptep_get(ptep);
> +                }
> +
> +                absent = !ptep || huge_pte_none(pte);

In Peter's series, huge_pte_offset calls are replaced with hugetlb_walk
that takes a vma pointer.  It might make sense now to consolidate the
hugetlb page table walkers.  I know that was discussed at some time.
Just thinking we could possibly fold much of the above into hugetlb_walk.
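Roughly what I have in mind, as a completely untested sketch.  The helper
name hugetlb_full_walk() below is made up for illustration; hugetlb_walk()
is the routine from Peter's series, and hugetlb_pte_populate(),
hugetlb_hgm_walk() and hpage_size_to_level() are the helpers introduced
earlier in this series:

/*
 * Hypothetical consolidation of the lookup above: find the hugetlb PTE
 * for @addr and then walk down to the smallest mapping level that covers
 * it, so callers only ever deal with a hugetlb_pte.  The caller is
 * assumed to hold the vma lock for read.
 */
static bool hugetlb_full_walk(struct hugetlb_pte *hpte,
                              struct vm_area_struct *vma, unsigned long addr)
{
        struct hstate *h = hstate_vma(vma);
        pte_t *ptep = hugetlb_walk(vma, addr & huge_page_mask(h),
                                   huge_page_size(h));

        if (!ptep)
                return false;

        /* Start from the hstate-level PTE ... */
        hugetlb_pte_populate(hpte, ptep, huge_page_shift(h),
                             hpage_size_to_level(huge_page_size(h)));
        /* ... and walk down to the smallest mapping covering @addr. */
        hugetlb_hgm_walk(vma->vm_mm, vma, hpte, addr, PAGE_SIZE,
                         /*stop_at_none=*/true);
        return true;
}

With something like that, the block above becomes a single
hugetlb_full_walk() call followed by hugetlb_pte_lock().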
> 
>                  /*
>                   * When coredumping, it suits get_dump_page if we just return
> @@ -6516,12 +6531,19 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>                   */
>                  if (absent && (flags & FOLL_DUMP) &&
>                      !hugetlbfs_pagecache_present(h, vma, vaddr)) {
> -                        if (pte)
> +                        if (ptep)
>                                  spin_unlock(ptl);
>                          remainder = 0;
>                          break;
>                  }
>  
> +                if (!absent && pte_present(pte) &&
> +                                !hugetlb_pte_present_leaf(&hpte, pte)) {
> +                        /* We raced with someone splitting the PTE, so retry. */

I do not think I have got to the splitting code yet, but I am assuming
we do not hold vma lock for write when splitting.  We would of course
hold page table lock.

> +                        spin_unlock(ptl);
> +                        continue;
> +                }
> +
>                  /*
>                   * We need call hugetlb_fault for both hugepages under migration
>                   * (in which case hugetlb_fault waits for the migration,) and
> @@ -6537,7 +6559,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>                          vm_fault_t ret;
>                          unsigned int fault_flags = 0;
>  
> -                        if (pte)
> +                        /* Drop the lock before entering hugetlb_fault. */
> +                        hugetlb_vma_unlock_read(vma);
> +
> +                        if (ptep)
>                                  spin_unlock(ptl);
>                          if (flags & FOLL_WRITE)
>                                  fault_flags |= FAULT_FLAG_WRITE;
> @@ -6560,7 +6585,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>                          if (ret & VM_FAULT_ERROR) {
>                                  err = vm_fault_to_errno(ret, flags);
>                                  remainder = 0;
> -                                break;
> +                                goto out;
>                          }
>                          if (ret & VM_FAULT_RETRY) {
>                                  if (locked &&
> @@ -6578,11 +6603,14 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>                                   */
>                                  return i;
>                          }
> +                        hugetlb_vma_lock_read(vma);
>                          continue;
>                  }
>  
> -                pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
> -                page = pte_page(huge_ptep_get(pte));
> +                pfn_offset = (vaddr & ~hugetlb_pte_mask(&hpte)) >> PAGE_SHIFT;
> +                subpage = pte_page(pte);
> +                pages_per_hpte = hugetlb_pte_size(&hpte) / PAGE_SIZE;
> +                page = compound_head(subpage);
>  
>                  VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
>                                 !PageAnonExclusive(page), page);
> @@ -6592,21 +6620,21 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>                   * and skip the same_page loop below.
>                   */
>                  if (!pages && !vmas && !pfn_offset &&
> -                    (vaddr + huge_page_size(h) < vma->vm_end) &&
> -                    (remainder >= pages_per_huge_page(h))) {
> -                        vaddr += huge_page_size(h);
> -                        remainder -= pages_per_huge_page(h);
> -                        i += pages_per_huge_page(h);
> +                    (vaddr + pages_per_hpte < vma->vm_end) &&
> +                    (remainder >= pages_per_hpte)) {
> +                        vaddr += pages_per_hpte;
> +                        remainder -= pages_per_hpte;
> +                        i += pages_per_hpte;
>                          spin_unlock(ptl);
>                          continue;
>                  }
>  
>                  /* vaddr may not be aligned to PAGE_SIZE */
> -                refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
> +                refs = min3(pages_per_hpte - pfn_offset, remainder,
>                          (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
>  
>                  if (pages || vmas)
> -                        record_subpages_vmas(nth_page(page, pfn_offset),
> +                        record_subpages_vmas(nth_page(subpage, pfn_offset),
>                                               vma, refs,
>                                               likely(pages) ? pages + i : NULL,
>                                               vmas ? vmas + i : NULL);

Not your fault, but all the above was difficult to follow before HGM. :(

Did not notice any issues.
-- 
Mike Kravetz

> @@ -6637,6 +6665,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  
>                  spin_unlock(ptl);
>          }
> +        hugetlb_vma_unlock_read(vma);
> +out:
>          *nr_pages = remainder;
>          /*
>           * setting position is actually required only if remainder is
> -- 
> 2.38.0.135.g90850a2211-goog
> 
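One more note, mostly to convince myself the vma lock pairing works out
with the new goto.  Heavily trimmed (the fault condition is abbreviated
as "must_fault" and the FOLL_DUMP and VM_FAULT_RETRY bookkeeping is
dropped), the structure after this patch is roughly

        hugetlb_vma_lock_read(vma);
        while (vaddr < vma->vm_end && remainder) {
                /* ... walk, hugetlb_pte_lock(), ... */
                if (absent || must_fault) {
                        /* dropped because hugetlb_fault() may sleep */
                        hugetlb_vma_unlock_read(vma);
                        spin_unlock(ptl);
                        /* ... hugetlb_fault() ... */
                        if (ret & VM_FAULT_ERROR)
                                goto out;       /* vma lock already dropped */
                        if (ret & VM_FAULT_RETRY)
                                return i;       /* likewise */
                        hugetlb_vma_lock_read(vma);
                        continue;
                }
                /* ... record pages, advance vaddr/remainder ... */
                spin_unlock(ptl);
        }
        hugetlb_vma_unlock_read(vma);
out:
        *nr_pages = remainder;

so every exit taken after the fault-path unlock has to skip the final
hugetlb_vma_unlock_read(), which is exactly what the break -> goto out
change provides.  That part looks correct to me.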