The patch titled Subject: mm/gup: replace get_user_pages_longterm() with FOLL_LONGTERM has been added to the -mm tree. Its filename is mm-gup-replace-get_user_pages_longterm-with-foll_longterm.patch This patch should soon appear at http://ozlabs.org/~akpm/mmots/broken-out/mm-gup-replace-get_user_pages_longterm-with-foll_longterm.patch and later at http://ozlabs.org/~akpm/mmotm/broken-out/mm-gup-replace-get_user_pages_longterm-with-foll_longterm.patch Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/process/submit-checklist.rst when testing your code *** The -mm tree is included into linux-next and is updated there every 3-4 working days ------------------------------------------------------ From: Ira Weiny <ira.weiny@xxxxxxxxx> Subject: mm/gup: replace get_user_pages_longterm() with FOLL_LONGTERM Pach series "Add FOLL_LONGTERM to GUP fast and use it". HFI1, qib, and mthca, use get_user_pages_fast() due to its performance advantages. These pages can be held for a significant time. But get_user_pages_fast() does not protect against mapping FS DAX pages. Introduce FOLL_LONGTERM and use this flag in get_user_pages_fast() which retains the performance while also adding the FS DAX checks. XDP has also shown interest in using this functionality.[1] In addition we change get_user_pages() to use the new FOLL_LONGTERM flag and remove the specialized get_user_pages_longterm call. [1] https://lkml.org/lkml/2019/2/11/1789 This patch (of 7): Rather than have a separate get_user_pages_longterm() call, introduce FOLL_LONGTERM and change the longterm callers to use it. This patch does not change any functionality. FOLL_LONGTERM can only be supported with get_user_pages() as it requires vmas to determine if DAX is in use. Link: http://lkml.kernel.org/r/20190317183438.2057-2-ira.weiny@xxxxxxxxx Signed-off-by: Ira Weiny <ira.weiny@xxxxxxxxx> Cc: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxx> Cc: Michal Hocko <mhocko@xxxxxxxxxx> Cc: John Hubbard <jhubbard@xxxxxxxxxx> Cc: "Kirill A. Shutemov" <kirill.shutemov@xxxxxxxxxxxxxxx> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx> Cc: Jason Gunthorpe <jgg@xxxxxxxx> Cc: Benjamin Herrenschmidt <benh@xxxxxxxxxxxxxxxxxxx> Cc: Paul Mackerras <paulus@xxxxxxxxx> Cc: "David S. Miller" <davem@xxxxxxxxxxxxx> Cc: Martin Schwidefsky <schwidefsky@xxxxxxxxxx> Cc: Heiko Carstens <heiko.carstens@xxxxxxxxxx> Cc: Rich Felker <dalias@xxxxxxxx> Cc: Yoshinori Sato <ysato@xxxxxxxxxxxxxxxxxxxx> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx> Cc: Ingo Molnar <mingo@xxxxxxxxxx> Cc: Borislav Petkov <bp@xxxxxxxxx> Cc: Ralf Baechle <ralf@xxxxxxxxxxxxxx> Cc: James Hogan <jhogan@xxxxxxxxxx> Cc: Dan Williams <dan.j.williams@xxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- arch/powerpc/mm/mmu_context_iommu.c | 3 drivers/infiniband/core/umem.c | 5 drivers/infiniband/hw/qib/qib_user_pages.c | 8 drivers/infiniband/hw/usnic/usnic_uiom.c | 9 drivers/media/v4l2-core/videobuf-dma-sg.c | 6 drivers/vfio/vfio_iommu_type1.c | 3 fs/io_uring.c | 5 include/linux/mm.h | 14 - mm/gup.c | 173 +++++++++++-------- mm/gup_benchmark.c | 5 net/xdp/xdp_umem.c | 4 11 files changed, 130 insertions(+), 105 deletions(-) --- a/arch/powerpc/mm/mmu_context_iommu.c~mm-gup-replace-get_user_pages_longterm-with-foll_longterm +++ a/arch/powerpc/mm/mmu_context_iommu.c @@ -148,7 +148,8 @@ static long mm_iommu_do_alloc(struct mm_ } down_read(&mm->mmap_sem); - ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL); + ret = get_user_pages(ua, entries, FOLL_WRITE | FOLL_LONGTERM, + mem->hpages, NULL); up_read(&mm->mmap_sem); if (ret != entries) { /* free the reference taken */ --- a/drivers/infiniband/core/umem.c~mm-gup-replace-get_user_pages_longterm-with-foll_longterm +++ a/drivers/infiniband/core/umem.c @@ -189,10 +189,11 @@ struct ib_umem *ib_umem_get(struct ib_ud while (npages) { down_read(&mm->mmap_sem); - ret = get_user_pages_longterm(cur_base, + ret = get_user_pages(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof (struct page *)), - gup_flags, page_list, vma_list); + gup_flags | FOLL_LONGTERM, + page_list, vma_list); if (ret < 0) { up_read(&mm->mmap_sem); goto umem_release; --- a/drivers/infiniband/hw/qib/qib_user_pages.c~mm-gup-replace-get_user_pages_longterm-with-foll_longterm +++ a/drivers/infiniband/hw/qib/qib_user_pages.c @@ -114,10 +114,10 @@ int qib_get_user_pages(unsigned long sta down_read(¤t->mm->mmap_sem); for (got = 0; got < num_pages; got += ret) { - ret = get_user_pages_longterm(start_page + got * PAGE_SIZE, - num_pages - got, - FOLL_WRITE | FOLL_FORCE, - p + got, NULL); + ret = get_user_pages(start_page + got * PAGE_SIZE, + num_pages - got, + FOLL_LONGTERM | FOLL_WRITE | FOLL_FORCE, + p + got, NULL); if (ret < 0) { up_read(¤t->mm->mmap_sem); goto bail_release; --- a/drivers/infiniband/hw/usnic/usnic_uiom.c~mm-gup-replace-get_user_pages_longterm-with-foll_longterm +++ a/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -143,10 +143,11 @@ static int usnic_uiom_get_pages(unsigned ret = 0; while (npages) { - ret = get_user_pages_longterm(cur_base, - min_t(unsigned long, npages, - PAGE_SIZE / sizeof(struct page *)), - gup_flags, page_list, NULL); + ret = get_user_pages(cur_base, + min_t(unsigned long, npages, + PAGE_SIZE / sizeof(struct page *)), + gup_flags | FOLL_LONGTERM, + page_list, NULL); if (ret < 0) goto out; --- a/drivers/media/v4l2-core/videobuf-dma-sg.c~mm-gup-replace-get_user_pages_longterm-with-foll_longterm +++ a/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -186,12 +186,12 @@ static int videobuf_dma_init_user_locked dprintk(1, "init user [0x%lx+0x%lx => %d pages]\n", data, size, dma->nr_pages); - err = get_user_pages_longterm(data & PAGE_MASK, dma->nr_pages, - flags, dma->pages, NULL); + err = get_user_pages(data & PAGE_MASK, dma->nr_pages, + flags | FOLL_LONGTERM, dma->pages, NULL); if (err != dma->nr_pages) { dma->nr_pages = (err >= 0) ? err : 0; - dprintk(1, "get_user_pages_longterm: err=%d [%d]\n", err, + dprintk(1, "get_user_pages: err=%d [%d]\n", err, dma->nr_pages); return err < 0 ? err : -EINVAL; } --- a/drivers/vfio/vfio_iommu_type1.c~mm-gup-replace-get_user_pages_longterm-with-foll_longterm +++ a/drivers/vfio/vfio_iommu_type1.c @@ -351,7 +351,8 @@ static int vaddr_get_pfn(struct mm_struc down_read(&mm->mmap_sem); if (mm == current->mm) { - ret = get_user_pages_longterm(vaddr, 1, flags, page, vmas); + ret = get_user_pages(vaddr, 1, flags | FOLL_LONGTERM, page, + vmas); } else { ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page, vmas, NULL); --- a/fs/io_uring.c~mm-gup-replace-get_user_pages_longterm-with-foll_longterm +++ a/fs/io_uring.c @@ -2452,8 +2452,9 @@ static int io_sqe_buffer_register(struct ret = 0; down_read(¤t->mm->mmap_sem); - pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE, - pages, vmas); + pret = get_user_pages(ubuf, nr_pages, + FOLL_WRITE | FOLL_LONGTERM, + pages, vmas); if (pret == nr_pages) { /* don't support file backed memory */ for (j = 0; j < nr_pages; j++) { --- a/include/linux/mm.h~mm-gup-replace-get_user_pages_longterm-with-foll_longterm +++ a/include/linux/mm.h @@ -1492,19 +1492,6 @@ long get_user_pages_locked(unsigned long long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); -#if defined(CONFIG_FS_DAX) || defined(CONFIG_CMA) -long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas); -#else -static inline long get_user_pages_longterm(unsigned long start, - unsigned long nr_pages, unsigned int gup_flags, - struct page **pages, struct vm_area_struct **vmas) -{ - return get_user_pages(start, nr_pages, gup_flags, pages, vmas); -} -#endif /* CONFIG_FS_DAX */ - int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); @@ -2570,6 +2557,7 @@ struct page *follow_page(struct vm_area_ #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ #define FOLL_COW 0x4000 /* internal GUP flag */ #define FOLL_ANON 0x8000 /* don't do file mappings */ +#define FOLL_LONGTERM 0x10000 /* mapping is intended for a long term pin */ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) { --- a/mm/gup_benchmark.c~mm-gup-replace-get_user_pages_longterm-with-foll_longterm +++ a/mm/gup_benchmark.c @@ -54,8 +54,9 @@ static int __gup_benchmark_ioctl(unsigne pages + i); break; case GUP_LONGTERM_BENCHMARK: - nr = get_user_pages_longterm(addr, nr, gup->flags & 1, - pages + i, NULL); + nr = get_user_pages(addr, nr, + (gup->flags & 1) | FOLL_LONGTERM, + pages + i, NULL); break; case GUP_BENCHMARK: nr = get_user_pages(addr, nr, gup->flags & 1, pages + i, --- a/mm/gup.c~mm-gup-replace-get_user_pages_longterm-with-foll_longterm +++ a/mm/gup.c @@ -1112,26 +1112,7 @@ long get_user_pages_remote(struct task_s } EXPORT_SYMBOL(get_user_pages_remote); -/* - * This is the same as get_user_pages_remote(), just with a - * less-flexible calling convention where we assume that the task - * and mm being operated on are the current task's and don't allow - * passing of a locked parameter. We also obviously don't pass - * FOLL_REMOTE in here. - */ -long get_user_pages(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) -{ - return __get_user_pages_locked(current, current->mm, start, nr_pages, - pages, vmas, NULL, - gup_flags | FOLL_TOUCH); -} -EXPORT_SYMBOL(get_user_pages); - #if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA) - -#ifdef CONFIG_FS_DAX static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) { long i; @@ -1150,12 +1131,6 @@ static bool check_dax_vmas(struct vm_are } return false; } -#else -static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) -{ - return false; -} -#endif #ifdef CONFIG_CMA static struct page *new_non_cma_page(struct page *page, unsigned long private) @@ -1209,10 +1184,13 @@ static struct page *new_non_cma_page(str return __alloc_pages_node(nid, gfp_mask, 0); } -static long check_and_migrate_cma_pages(unsigned long start, long nr_pages, - unsigned int gup_flags, +static long check_and_migrate_cma_pages(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, struct page **pages, - struct vm_area_struct **vmas) + struct vm_area_struct **vmas, + unsigned int gup_flags) { long i; bool drain_allow = true; @@ -1268,10 +1246,14 @@ check_again: putback_movable_pages(&cma_page_list); } /* - * We did migrate all the pages, Try to get the page references again - * migrating any new CMA pages which we failed to isolate earlier. + * We did migrate all the pages, Try to get the page references + * again migrating any new CMA pages which we failed to isolate + * earlier. */ - nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas); + nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages, + pages, vmas, NULL, + gup_flags); + if ((nr_pages > 0) && migrate_allow) { drain_allow = true; goto check_again; @@ -1281,66 +1263,115 @@ check_again: return nr_pages; } #else -static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages, - unsigned int gup_flags, - struct page **pages, - struct vm_area_struct **vmas) +static long check_and_migrate_cma_pages(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int gup_flags) { return nr_pages; } #endif /* - * This is the same as get_user_pages() in that it assumes we are - * operating on the current task's mm, but it goes further to validate - * that the vmas associated with the address range are suitable for - * longterm elevated page reference counts. For example, filesystem-dax - * mappings are subject to the lifetime enforced by the filesystem and - * we need guarantees that longterm users like RDMA and V4L2 only - * establish mappings that have a kernel enforced revocation mechanism. + * __gup_longterm_locked() is a wrapper for __get_uer_pages_locked which + * allows us to process the FOLL_LONGTERM flag if present. + * + * FOLL_LONGTERM Checks for either DAX VMAs or PPC CMA regions and either fails + * the pin or attempts to migrate the page as appropriate. + * + * In the filesystem-dax case mappings are subject to the lifetime enforced by + * the filesystem and we need guarantees that longterm users like RDMA and V4L2 + * only establish mappings that have a kernel enforced revocation mechanism. + * + * In the CMA case pages can't be pinned in a CMA region as this would + * unnecessarily fragment that region. So CMA attempts to migrate the page + * before pinning. * * "longterm" == userspace controlled elevated page count lifetime. * Contrast this to iov_iter_get_pages() usages which are transient. */ -long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas_arg) +static __always_inline long __gup_longterm_locked(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int gup_flags) { - struct vm_area_struct **vmas = vmas_arg; - unsigned long flags; + struct vm_area_struct **vmas_tmp = vmas; + unsigned long flags = 0; long rc, i; - if (!pages) - return -EINVAL; + if (gup_flags & FOLL_LONGTERM) { + if (!pages) + return -EINVAL; + + if (!vmas_tmp) { + vmas_tmp = kcalloc(nr_pages, + sizeof(struct vm_area_struct *), + GFP_KERNEL); + if (!vmas_tmp) + return -ENOMEM; + } + flags = memalloc_nocma_save(); + } - if (!vmas) { - vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *), - GFP_KERNEL); - if (!vmas) - return -ENOMEM; - } - - flags = memalloc_nocma_save(); - rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas); - memalloc_nocma_restore(flags); - if (rc < 0) - goto out; + rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages, + vmas_tmp, NULL, gup_flags); - if (check_dax_vmas(vmas, rc)) { - for (i = 0; i < rc; i++) - put_page(pages[i]); - rc = -EOPNOTSUPP; - goto out; + if (gup_flags & FOLL_LONGTERM) { + memalloc_nocma_restore(flags); + if (rc < 0) + goto out; + + if (check_dax_vmas(vmas_tmp, rc)) { + for (i = 0; i < rc; i++) + put_page(pages[i]); + rc = -EOPNOTSUPP; + goto out; + } + + rc = check_and_migrate_cma_pages(tsk, mm, start, rc, pages, + vmas_tmp, gup_flags); } - rc = check_and_migrate_cma_pages(start, rc, gup_flags, pages, vmas); out: - if (vmas != vmas_arg) - kfree(vmas); + if (vmas_tmp != vmas) + kfree(vmas_tmp); return rc; } -EXPORT_SYMBOL(get_user_pages_longterm); -#endif /* CONFIG_FS_DAX */ +#else /* !CONFIG_FS_DAX && !CONFIG_CMA */ +static __always_inline long __gup_longterm_locked(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int flags) +{ + return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, + NULL, flags); +} +#endif /* CONFIG_FS_DAX || CONFIG_CMA */ + +/* + * This is the same as get_user_pages_remote(), just with a + * less-flexible calling convention where we assume that the task + * and mm being operated on are the current task's and don't allow + * passing of a locked parameter. We also obviously don't pass + * FOLL_REMOTE in here. + */ +long get_user_pages(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas) +{ + return __gup_longterm_locked(current, current->mm, start, nr_pages, + pages, vmas, gup_flags | FOLL_TOUCH); +} +EXPORT_SYMBOL(get_user_pages); /** * populate_vma_page_range() - populate a range of pages in the vma. --- a/net/xdp/xdp_umem.c~mm-gup-replace-get_user_pages_longterm-with-foll_longterm +++ a/net/xdp/xdp_umem.c @@ -267,8 +267,8 @@ static int xdp_umem_pin_pages(struct xdp return -ENOMEM; down_read(¤t->mm->mmap_sem); - npgs = get_user_pages_longterm(umem->address, umem->npgs, - gup_flags, &umem->pgs[0], NULL); + npgs = get_user_pages(umem->address, umem->npgs, + gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL); up_read(¤t->mm->mmap_sem); if (npgs != umem->npgs) { _ Patches currently in -mm which might be from ira.weiny@xxxxxxxxx are mm-gup-replace-get_user_pages_longterm-with-foll_longterm.patch mm-gup-change-write-parameter-to-flags-in-fast-walk.patch mm-gup-change-gup-fast-to-use-flags-rather-than-a-write-bool.patch mm-gup-add-foll_longterm-capability-to-gup-fast.patch ib-hfi1-use-the-new-foll_longterm-flag-to-get_user_pages_fast.patch ib-qib-use-the-new-foll_longterm-flag-to-get_user_pages_fast.patch ib-mthca-use-the-new-foll_longterm-flag-to-get_user_pages_fast.patch