From: Zi Yan <ziy@xxxxxxxxxx>

Add follow_page support for PUD THPs.

follow_pud_mask() now handles PUD-mapped THPs directly: with FOLL_SPLIT
the PUD THP (or the huge zero page) is split and the lookup falls back
to the PTE path; otherwise the page is returned by the new
follow_trans_huge_pud(), which mirrors follow_trans_huge_pmd() at the
PUD level.

Signed-off-by: Zi Yan <ziy@xxxxxxxxxx>
---
 include/linux/huge_mm.h | 11 +++++++
 mm/gup.c                | 59 ++++++++++++++++++++++++++++++++++++++++++++
 mm/huge_memory.c        | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 142 insertions(+), 1 deletion(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 589e5af5a1c2..c7bc40c4a5e2 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -20,6 +20,10 @@ extern int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 extern void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud);
 extern int do_huge_pud_anonymous_page(struct vm_fault *vmf);
 extern vm_fault_t do_huge_pud_wp_page(struct vm_fault *vmf, pud_t orig_pud);
+extern struct page *follow_trans_huge_pud(struct vm_area_struct *vma,
+					  unsigned long addr,
+					  pud_t *pud,
+					  unsigned int flags);
 #else
 static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
 {
@@ -32,6 +36,13 @@ extern vm_fault_t do_huge_pud_wp_page(struct vm_fault *vmf, pud_t orig_pud)
 {
 	return VM_FAULT_FALLBACK;
 }
+static inline struct page *follow_trans_huge_pud(struct vm_area_struct *vma,
+						 unsigned long addr,
+						 pud_t *pud,
+						 unsigned int flags)
+{
+	return NULL;
+}
 #endif
 
 extern vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd);
diff --git a/mm/gup.c b/mm/gup.c
index b21cc220f036..972cca69f228 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -696,10 +696,69 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
 		if (page)
 			return page;
 	}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+	if (likely(!pud_trans_huge(*pud))) {
+		if (unlikely(pud_bad(*pud)))
+			return no_page_table(vma, flags);
+		return follow_pmd_mask(vma, address, pud, flags, ctx);
+	}
+
+	ptl = pud_lock(mm, pud);
+
+	if (unlikely(!pud_trans_huge(*pud))) {
+		spin_unlock(ptl);
+		if (unlikely(pud_bad(*pud)))
+			return no_page_table(vma, flags);
+		return follow_pmd_mask(vma, address, pud, flags, ctx);
+	}
+
+	if (flags & FOLL_SPLIT) {
+		int ret;
+		pmd_t *pmd = NULL;
+
+		page = pud_page(*pud);
+		if (is_huge_zero_page(page)) {
+
+			spin_unlock(ptl);
+			ret = 0;
+			split_huge_pud(vma, pud, address);
+			pmd = pmd_offset(pud, address);
+			split_huge_pmd(vma, pmd, address);
+			if (pmd_trans_unstable(pmd))
+				ret = -EBUSY;
+		} else {
+			get_page(page);
+			spin_unlock(ptl);
+			lock_page(page);
+			ret = split_huge_pud_page(page);
+			if (!ret)
+				ret = split_huge_page(page);
+			else {
+				unlock_page(page);
+				put_page(page);
+				goto out;
+			}
+			unlock_page(page);
+			put_page(page);
+			if (pud_none(*pud))
+				return no_page_table(vma, flags);
+			pmd = pmd_offset(pud, address);
+		}
+out:
+		return ret ? ERR_PTR(ret) :
+			follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
+	}
+	page = follow_trans_huge_pud(vma, address, pud, flags);
+	spin_unlock(ptl);
+	ctx->page_mask = HPAGE_PUD_NR - 1;
+	return page;
+#else
 	if (unlikely(pud_bad(*pud)))
 		return no_page_table(vma, flags);
 
 	return follow_pmd_mask(vma, address, pud, flags, ctx);
+#endif
 }
 
 static struct page *follow_p4d_mask(struct vm_area_struct *vma,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9aa19aa643cd..61ae7a0ded84 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1258,6 +1258,77 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
 	return page;
 }
 
+/*
+ * FOLL_FORCE can write to even unwritable pud's, but only
+ * after we've gone through a COW cycle and they are dirty.
+ */
+static inline bool can_follow_write_pud(pud_t pud, unsigned int flags)
+{
+	return pud_write(pud) ||
+		((flags & FOLL_FORCE) && (flags & FOLL_COW) && pud_dirty(pud));
+}
+
+struct page *follow_trans_huge_pud(struct vm_area_struct *vma,
+				   unsigned long addr,
+				   pud_t *pud,
+				   unsigned int flags)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct page *page = NULL;
+
+	assert_spin_locked(pud_lockptr(mm, pud));
+
+	if (flags & FOLL_WRITE && !can_follow_write_pud(*pud, flags))
+		goto out;
+
+	/* Avoid dumping huge zero page */
+	if ((flags & FOLL_DUMP) && is_huge_zero_pud(*pud))
+		return ERR_PTR(-EFAULT);
+
+	/* Full NUMA hinting faults to serialise migration in fault paths */
+	/* && pud_protnone(*pud) */
+	if ((flags & FOLL_NUMA))
+		goto out;
+
+	page = pud_page(*pud);
+	VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
+	if (flags & FOLL_TOUCH)
+		touch_pud(vma, addr, pud, flags);
+	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+		/*
+		 * We don't mlock() pte-mapped THPs. This way we can avoid
+		 * leaking mlocked pages into non-VM_LOCKED VMAs.
+		 *
+		 * For anon THP:
+		 *
+		 * We do the same thing as PMD-level THP.
+		 *
+		 * For file THP:
+		 *
+		 * No support yet.
+		 *
+		 */
+
+		if (PageAnon(page) && compound_mapcount(page) != 1)
+			goto skip_mlock;
+		if (PagePUDDoubleMap(page) || !page->mapping)
+			goto skip_mlock;
+		if (!trylock_page(page))
+			goto skip_mlock;
+		lru_add_drain();
+		if (page->mapping && !PagePUDDoubleMap(page))
+			mlock_vma_page(page);
+		unlock_page(page);
+	}
+skip_mlock:
+	page += (addr & ~HPAGE_PUD_MASK) >> PAGE_SHIFT;
+	VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
+	if (flags & FOLL_GET)
+		get_page(page);
+
+out:
+	return page;
+}
 int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
 		  struct vm_area_struct *vma)
@@ -1462,7 +1533,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 		goto out;
 
 	page = pmd_page(*pmd);
-	VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
+	VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page) && !PMDPageInPUD(page), page);
 
 	if (!try_grab_page(page, flags))
 		return ERR_PTR(-ENOMEM);
-- 
2.28.0
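
Not part of the patch: for readers who want to sanity-check the two less obvious pieces above -- the FOLL_FORCE/FOLL_COW write check in can_follow_write_pud() and the sub-page arithmetic "page += (addr & ~HPAGE_PUD_MASK) >> PAGE_SHIFT;" -- here is a small userspace sketch. It is only a model: the FOLL_* bit values, the 1GB PUD size (x86_64 with 4KB pages), and the fake_pud type are illustrative assumptions, not the kernel's definitions.

/*
 * Userspace sketch, for illustration only -- not kernel code.
 * Assumptions (not taken from the kernel tree): 4KB pages (PAGE_SHIFT 12),
 * a PUD covering 1GB, and made-up FOLL_* bit values.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define HPAGE_PUD_SHIFT	30	/* 1GB PUD, as on x86_64 with 4KB pages */
#define HPAGE_PUD_SIZE	(1UL << HPAGE_PUD_SHIFT)
#define HPAGE_PUD_MASK	(~(HPAGE_PUD_SIZE - 1))
#define HPAGE_PUD_NR	(1UL << (HPAGE_PUD_SHIFT - PAGE_SHIFT))

/* Illustrative flag bits; the real FOLL_* values live in include/linux/mm.h. */
#define FOLL_WRITE	0x01
#define FOLL_FORCE	0x02
#define FOLL_COW	0x04

/* Stand-in for a pud_t: only the two bits the write check looks at. */
struct fake_pud {
	bool write;
	bool dirty;
};

/*
 * Same shape as can_follow_write_pud() in the patch: FOLL_FORCE may write
 * through a read-only PUD mapping only after a COW cycle has marked it dirty.
 */
static bool can_follow_write_pud(struct fake_pud pud, unsigned int flags)
{
	return pud.write ||
	       ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pud.dirty);
}

/*
 * Mirrors "page += (addr & ~HPAGE_PUD_MASK) >> PAGE_SHIFT;" from
 * follow_trans_huge_pud(): index of the 4KB subpage backing @addr
 * inside the 1GB compound page.
 */
static unsigned long subpage_index(unsigned long addr)
{
	return (addr & ~HPAGE_PUD_MASK) >> PAGE_SHIFT;
}

int main(void)
{
	struct fake_pud ro_clean = { .write = false, .dirty = false };
	struct fake_pud ro_dirty = { .write = false, .dirty = true  };
	unsigned long base = 3UL << HPAGE_PUD_SHIFT;	/* a 1GB-aligned VA */

	/* Read-only PUD: FOLL_FORCE alone is not enough, FOLL_COW + dirty is. */
	assert(!can_follow_write_pud(ro_clean, FOLL_WRITE | FOLL_FORCE));
	assert(!can_follow_write_pud(ro_dirty, FOLL_WRITE | FOLL_FORCE));
	assert(can_follow_write_pud(ro_dirty,
				    FOLL_WRITE | FOLL_FORCE | FOLL_COW));

	printf("subpages per PUD THP: %lu\n", HPAGE_PUD_NR);	/* 262144 */
	printf("subpage index of base + 5 pages: %lu\n",
	       subpage_index(base + 5 * (1UL << PAGE_SHIFT)));	/* 5 */
	return 0;
}

The third assert is the interesting case: it is the dirty bit left behind by a prior COW fault, not FOLL_FORCE by itself, that lets GUP write through a read-only PUD mapping, which is what the comment above can_follow_write_pud() describes.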