From: Peter Xu <peterx@xxxxxxxxxx> Hugepd is only used in PowerPC so far on 4K page size kernels where hash mmu is used. follow_page_mask() used to leverage hugetlb APIs to access hugepd entries. Teach follow_page_mask() itself on hugepd. With previous refactors on fast-gup gup_huge_pd(), most of the code can be easily leveraged. There's something not needed for follow page, for example, gup_hugepte() tries to detect pgtable entry change which will never happen with slow gup (which has the pgtable lock held), but that's not a problem to check. Since follow_page() always only fetch one page, set the end to "address + PAGE_SIZE" should suffice. We will still do the pgtable walk once for each hugetlb page by setting ctx->page_mask properly. One thing worth mentioning is that some level of pgtable's _bad() helper will report is_hugepd() entries as TRUE on Power8 hash MMUs. I think it at least applies to PUD on Power8 with 4K pgsize. It means feeding a hugepd entry to pud_bad() will report a false positive. Let's leave that for now because it can be arch-specific where I am a bit declined to touch. In this patch it's not a problem as long as hugepd is detected before any bad pgtable entries. Signed-off-by: Peter Xu <peterx@xxxxxxxxxx> --- mm/gup.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 66 insertions(+), 7 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 00cdf4cb0cd4..43a2e0a203cd 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -30,6 +30,11 @@ struct follow_page_context { unsigned int page_mask; }; +static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd, + unsigned long addr, unsigned int pdshift, + unsigned int flags, + struct follow_page_context *ctx); + static inline void sanity_check_pinned_pages(struct page **pages, unsigned long npages) { @@ -871,6 +876,9 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, return no_page_table(vma, flags, address); if (!pmd_present(pmdval)) return no_page_table(vma, flags, address); + if (unlikely(is_hugepd(__hugepd(pmd_val(pmdval))))) + return follow_hugepd(vma, __hugepd(pmd_val(pmdval)), + address, PMD_SHIFT, flags, ctx); if (pmd_devmap(pmdval)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap); @@ -921,6 +929,9 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, pud = READ_ONCE(*pudp); if (!pud_present(pud)) return no_page_table(vma, flags, address); + if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) + return follow_hugepd(vma, __hugepd(pud_val(pud)), + address, PUD_SHIFT, flags, ctx); if (pud_leaf(pud)) { ptl = pud_lock(mm, pudp); page = follow_huge_pud(vma, address, pudp, flags, ctx); @@ -944,10 +955,13 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, p4dp = p4d_offset(pgdp, address); p4d = READ_ONCE(*p4dp); - if (!p4d_present(p4d)) - return no_page_table(vma, flags, address); BUILD_BUG_ON(p4d_leaf(p4d)); - if (unlikely(p4d_bad(p4d))) + + if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) + return follow_hugepd(vma, __hugepd(p4d_val(p4d)), + address, P4D_SHIFT, flags, ctx); + + if (!p4d_present(p4d) || p4d_bad(p4d)) return no_page_table(vma, flags, address); return follow_pud_mask(vma, address, p4dp, flags, ctx); @@ -981,7 +995,7 @@ static struct page *follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct follow_page_context *ctx) { - pgd_t *pgd; + pgd_t *pgd, pgdval; struct mm_struct *mm = vma->vm_mm; ctx->page_mask = 0; @@ -996,11 +1010,17 @@ static struct page *follow_page_mask(struct vm_area_struct *vma, &ctx->page_mask); pgd = pgd_offset(mm, address); + pgdval = *pgd; - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - return no_page_table(vma, flags, address); + if (unlikely(is_hugepd(__hugepd(pgd_val(pgdval))))) + page = follow_hugepd(vma, __hugepd(pgd_val(pgdval)), + address, PGDIR_SHIFT, flags, ctx); + else if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + page = no_page_table(vma, flags, address); + else + page = follow_p4d_mask(vma, address, pgd, flags, ctx); - return follow_p4d_mask(vma, address, pgd, flags, ctx); + return page; } struct page *follow_page(struct vm_area_struct *vma, unsigned long address, @@ -3037,6 +3057,37 @@ static int gup_huge_pd(hugepd_t hugepd, unsigned long addr, return 1; } + +static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd, + unsigned long addr, unsigned int pdshift, + unsigned int flags, + struct follow_page_context *ctx) +{ + struct page *page; + struct hstate *h; + spinlock_t *ptl; + int nr = 0, ret; + pte_t *ptep; + + /* Only hugetlb supports hugepd */ + if (WARN_ON_ONCE(!is_vm_hugetlb_page(vma))) + return ERR_PTR(-EFAULT); + + h = hstate_vma(vma); + ptep = hugepte_offset(hugepd, addr, pdshift); + ptl = huge_pte_lock(h, vma->vm_mm, ptep); + ret = gup_huge_pd(hugepd, addr, pdshift, addr + PAGE_SIZE, + flags, &page, &nr); + spin_unlock(ptl); + + if (ret) { + WARN_ON_ONCE(nr != 1); + ctx->page_mask = (1U << huge_page_order(h)) - 1; + return page; + } + + return NULL; +} #else static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned int pdshift, unsigned long end, unsigned int flags, @@ -3044,6 +3095,14 @@ static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr, { return 0; } + +static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd, + unsigned long addr, unsigned int pdshift, + unsigned int flags, + struct follow_page_context *ctx) +{ + return NULL; +} #endif /* CONFIG_ARCH_HAS_HUGEPD */ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, -- 2.44.0