Oh! That (s/__get_user_pages()/__get_user_pages_locked()/) seems to fix it. Attached a patch (gup-mlockall.txt, against v5.3.8) - although, I've got no clue how patches work here. gup-debug-mlockall.txt is the debug printk() I've added plus the code change. dmesg-out.txt.gz is the dmesg output of the latter for test.c On Wed, 2019-11-06 at 12:26 +0100, Robert Stupp wrote: > Maybe a native and wrong idea, but would it work to call > __get_user_pages_locked() instead of __get_user_pages() in > populate_vma_page_range() ? > > On Wed, 2019-11-06 at 11:25 +0100, Robert Stupp wrote: > > Here's one more dmesg output with more information captured in > > __get_user_pages() as well. It basically confirms that > > handle_mm_fault() returns VM_FAULT_RETRY. > > > > I'm not sure where and what to change ("fix with a FOLL_TRIED > > somewhere") to make it work. My (uneducated) impression is, that > > only > > __get_user_pages() needs to be changed - but I might be wrong. > >
commit d2a1ce44cd81b8ccf426a6c40a3a0d9b2b51e076 Author: Robert Stupp <snazy@xxxxxxxx> Date: Wed Nov 6 12:58:12 2019 +0100 make mlockall(MCL_CURRENT) aware of new VM_FAULT_RETRY conditions mlockall(MCL_CURRENT) can run into an busy/endless loop in __mm_populate(). This can happen since 6b4c9f4469819a0c1a38a0a4541337e0f9bf6c11 (introduced in Linux 5.1), "filemap: drop the mmap_sem for all blocking operations" / new VM_FAULT_RETRY conditions. The fix here is to replace the call to __get_user_pages() with a call to __get_user_pages_locked() in populate_vma_page_range(). This change seems to fix the behavior for mlockall(MCL_CURRENT) but does not change the behavior for other usages of populate_vma_page_range() in find_extend_vma() and mprotect_fixup(). (Text by Johannes Weiner hannes-at-cmpxchg.org): The only way this can occur is if populate_vma_page_range() returns 0 and we don't advance the iteration position (if it returned an error, we wouldn't reset nend and move on to the next vma as ignore_errors is 1 for mlockall.) populate_vma_page_range() returns 0 when the first page is not found and faultin_page() returns -EBUSY (if it were processing pages, or if the error from faultin_page() would be a different one, we would return the number of pages processed or -error). faultin_page() returns -EBUSY when VM_FAULT_RETRY is set, i.e. we dropped the mmap_sem in order to initiate IO and require a retry. That is consistent with the bisect result (new VM_FAULT_RETRY conditions). At this point, regular page fault would retry with FAULT_FLAG_TRIED to indicate that the mmap_sem cannot be dropped a second time. diff --git a/mm/gup.c b/mm/gup.c index 98f13ab37bac..e319ffb625b1 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1227,8 +1227,8 @@ long populate_vma_page_range(struct vm_area_struct *vma, * We made sure addr is within a VMA, so the following will * not result in a stack expansion that recurses back here. */ - return __get_user_pages(current, mm, start, nr_pages, gup_flags, - NULL, NULL, nonblocking); + return __get_user_pages_locked(current, mm, start, nr_pages, + NULL, NULL, nonblocking, gup_flags); } /*
commit 8c2e4c28bde0b10ba488d32d43d644612b2df737 Author: Robert Stupp <snazy@xxxxxxxx> Date: Wed Nov 6 12:29:53 2019 +0100 Replace __get_user_pages() with __get_user_pages_locked() in populate_vma_page_range() diff --git a/mm/gup.c b/mm/gup.c index 5c9825745bb2..c0b61b3130a8 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1065,14 +1065,15 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, } EXPORT_SYMBOL_GPL(fixup_user_fault); -static __always_inline long __get_user_pages_locked(struct task_struct *tsk, +static __always_inline long __get_user_pages_locked_x(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, struct vm_area_struct **vmas, int *locked, - unsigned int flags) + unsigned int flags, + int ign) { long ret, pages_done; bool lock_dropped; @@ -1090,8 +1091,8 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, pages_done = 0; lock_dropped = false; for (;;) { - ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages, - vmas, locked); + ret = __get_user_pages_x(tsk, mm, start, nr_pages, flags, pages, + vmas, locked, ign); if (!locked) /* VM_FAULT_RETRY couldn't trigger, bypass */ return ret; @@ -1133,8 +1134,8 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, *locked = 1; lock_dropped = true; down_read(&mm->mmap_sem); - ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED, - pages, NULL, NULL); + ret = __get_user_pages_x(tsk, mm, start, 1, flags | FOLL_TRIED, + pages, NULL, NULL, ign); if (ret != 1) { BUG_ON(ret > 1); if (!pages_done) @@ -1159,6 +1160,17 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, } return pages_done; } +static __always_inline long __get_user_pages_locked(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + int *locked, + unsigned int flags) +{ + return __get_user_pages_locked_x(tsk, mm, start, nr_pages, pages, vmas, locked, flags, 1); +} /* * get_user_pages_remote() - pin user pages in memory @@ -1291,8 +1303,10 @@ long populate_vma_page_range_x(struct vm_area_struct *vma, * We made sure addr is within a VMA, so the following will * not result in a stack expansion that recurses back here. */ - return __get_user_pages_x(current, mm, start, nr_pages, gup_flags, - NULL, NULL, nonblocking, ign); + return __get_user_pages_locked_x(current, mm, start, nr_pages, + NULL, NULL, nonblocking, gup_flags, ign); +// return __get_user_pages_x(current, mm, start, nr_pages, gup_flags, +// NULL, NULL, nonblocking, ign); } long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *nonblocking) commit 8b0bed7848a32c74512e8278c94de16eea390274 Author: Robert Stupp <snazy@xxxxxxxx> Date: Wed Nov 6 09:50:32 2019 +0100 More printk diff --git a/mm/gup.c b/mm/gup.c index 3bc25fd44433..5c9825745bb2 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -627,8 +627,8 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released. * If it is, *@nonblocking will be set to 0 and -EBUSY returned. */ -static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, - unsigned long address, unsigned int *flags, int *nonblocking) +static int faultin_page_x(struct task_struct *tsk, struct vm_area_struct *vma, + unsigned long address, unsigned int *flags, int *nonblocking, int ign) { unsigned int fault_flags = 0; vm_fault_t ret; @@ -650,8 +650,14 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, } ret = handle_mm_fault(vma, address, fault_flags); + if (!ign) { + printk(KERN_WARNING "faultin_page handle_mm_fault --> ret = %u\n", ret); + } if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, *flags); + if (!ign) { + printk(KERN_WARNING "faultin_page handle_mm_fault --> err = %d\n", err); + } if (err) return err; @@ -666,6 +672,9 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, } if (ret & VM_FAULT_RETRY) { + if (!ign) { + printk(KERN_WARNING "faultin_page-->EBUSY VM_FAULT_RETRY non-blocking?%d FAULT_FLAG_RETRY_NOWAIT?%d\n", nonblocking?1:0, (fault_flags & FAULT_FLAG_RETRY_NOWAIT)?1:0); + } if (nonblocking && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) *nonblocking = 0; return -EBUSY; @@ -682,8 +691,16 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, */ if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) *flags |= FOLL_COW; + if (!ign) { + printk(KERN_WARNING "faultin_page-->0\n"); + } return 0; } +static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, + unsigned long address, unsigned int *flags, int *nonblocking) +{ + return faultin_page_x(tsk, vma, address, flags, nonblocking, 1); +} static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) { @@ -788,15 +805,18 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * instead of __get_user_pages. __get_user_pages should be used only if * you need some special @gup_flags. */ -static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, +static long __get_user_pages_x(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *nonblocking) + struct vm_area_struct **vmas, int *nonblocking, int ign) { long ret = 0, i = 0; struct vm_area_struct *vma = NULL; struct follow_page_context ctx = { NULL }; + if (!ign) + printk(KERN_WARNING "__get_user_pages start=%lx nr_pages=%lu gup_flags=%x ctx.page_mask=%u\n", start, nr_pages, gup_flags, ctx.page_mask); + if (!nr_pages) return 0; @@ -817,11 +837,25 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, /* first iteration or cross vma bound */ if (!vma || start >= vma->vm_end) { + if (!ign) { + if (!vma) + printk(KERN_WARNING "__get_user_pages @1 vma==NULL\n"); + else + printk(KERN_WARNING "__get_user_pages @1 start=%lx vma->vm_start=%lx vma->vm_end=%lx vma->vm_flags=%lx\n", start, vma->vm_start, vma->vm_end, vma->vm_flags); + } vma = find_extend_vma(mm, start); + if (!ign) { + if (!vma) + printk(KERN_WARNING "__get_user_pages @2 vma==NULL\n"); + else + printk(KERN_WARNING "__get_user_pages @2 start=%lx vma->vm_start=%lx vma->vm_end=%lx vma->vm_flags=%lx\n", start, vma->vm_start, vma->vm_end, vma->vm_flags); + } if (!vma && in_gate_area(mm, start)) { ret = get_gate_page(mm, start & PAGE_MASK, gup_flags, &vma, pages ? &pages[i] : NULL); + if (!ign) + printk(KERN_WARNING "__get_user_pages @3 get_gate_page --> %ld\n", ret); if (ret) goto out; ctx.page_mask = 0; @@ -829,6 +863,8 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } if (!vma || check_vma_flags(vma, gup_flags)) { + if (!ign) + printk(KERN_WARNING "__get_user_pages @4 EFAULT\n"); ret = -EFAULT; goto out; } @@ -836,6 +872,8 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &nr_pages, i, gup_flags, nonblocking); + if (!ign) + printk(KERN_WARNING "__get_user_pages @5 follow_hugetlb_page --> %ld\n", i); continue; } } @@ -845,15 +883,21 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, * potentially allocating memory. */ if (fatal_signal_pending(current)) { + if (!ign) + printk(KERN_WARNING "__get_user_pages @6 ERESTARTSYS\n"); ret = -ERESTARTSYS; goto out; } cond_resched(); page = follow_page_mask(vma, start, foll_flags, &ctx); + if (!ign) + printk(KERN_WARNING "__get_user_pages @7 follow_page_mask --> %d ctx.page_mask=%u\n", page ? 1 : 0, ctx.page_mask); if (!page) { - ret = faultin_page(tsk, vma, start, &foll_flags, - nonblocking); + ret = faultin_page_x(tsk, vma, start, &foll_flags, + nonblocking, ign); + if (!ign) + printk(KERN_WARNING "__get_user_pages @8 faultin_page --> %ld\n", ret); switch (ret) { case 0: goto retry; @@ -869,6 +913,8 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } BUG(); } else if (PTR_ERR(page) == -EEXIST) { + if (!ign) + printk(KERN_WARNING "__get_user_pages @8 EEXIST\n"); /* * Proper page table entry exists, but no corresponding * struct page. @@ -876,6 +922,8 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, goto next_page; } else if (IS_ERR(page)) { ret = PTR_ERR(page); + if (!ign) + printk(KERN_WARNING "__get_user_pages @8 IS_ERR -> ret=%ld\n", ret); goto out; } if (pages) { @@ -890,17 +938,31 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, ctx.page_mask = 0; } page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); + if (!ign) + printk(KERN_WARNING "__get_user_pages @9 page_increm=%u ctx.page_mask=%u\n", page_increm, ctx.page_mask); if (page_increm > nr_pages) page_increm = nr_pages; i += page_increm; start += page_increm * PAGE_SIZE; nr_pages -= page_increm; + if (!ign) + printk(KERN_WARNING "__get_user_pages @10 i=%ld start=%lx nr_pages=%ld\n", i, start, nr_pages); } while (nr_pages); out: if (ctx.pgmap) put_dev_pagemap(ctx.pgmap); + if (!ign) { + printk(KERN_WARNING "__get_user_pages LEAVE i=%ld ret=%ld\n", i, ret); + } return i ? i : ret; } +static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, int *nonblocking) +{ + return __get_user_pages_x(tsk, mm, start, nr_pages, gup_flags, pages, vmas, nonblocking, 1); +} static bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags) @@ -1193,8 +1255,9 @@ EXPORT_SYMBOL(get_user_pages_remote); * If @nonblocking is non-NULL, it must held for read only and may be * released. If it's released, *@nonblocking will be set to 0. */ -long populate_vma_page_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, int *nonblocking) +long populate_vma_page_range_x(struct vm_area_struct *vma, + unsigned long start, unsigned long end, int *nonblocking, + int ign) { struct mm_struct *mm = vma->vm_mm; unsigned long nr_pages = (end - start) / PAGE_SIZE; @@ -1228,8 +1291,13 @@ long populate_vma_page_range(struct vm_area_struct *vma, * We made sure addr is within a VMA, so the following will * not result in a stack expansion that recurses back here. */ - return __get_user_pages(current, mm, start, nr_pages, gup_flags, - NULL, NULL, nonblocking); + return __get_user_pages_x(current, mm, start, nr_pages, gup_flags, + NULL, NULL, nonblocking, ign); +} +long populate_vma_page_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, int *nonblocking) +{ + return populate_vma_page_range_x(vma, start, end, nonblocking, 1); } /* @@ -1292,7 +1360,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) * double checks the vma flags, so that it won't mlock pages * if the vma was already munlocked. */ - ret = populate_vma_page_range(vma, nstart, nend, &locked); + ret = populate_vma_page_range_x(vma, nstart, nend, &locked, ign); if (ret < 0) { if (!ign) printk(KERN_WARNING "_mm_populate %lx %lx %lx %d LOOP-2 %ld\n", start, len, end, ignore_errors, ret); commit 74da9b1c9bbaf66320d973ff7c8dc63962b6dc7d Author: Robert Stupp <snazy@xxxxxxxx> Date: Tue Nov 5 17:18:49 2019 +0100 Add debug things diff --git a/mm/gup.c b/mm/gup.c index 98f13ab37bac..3bc25fd44433 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -3,6 +3,7 @@ #include <linux/errno.h> #include <linux/err.h> #include <linux/spinlock.h> +#include <linux/printk.h> #include <linux/mm.h> #include <linux/memremap.h> @@ -1241,14 +1242,23 @@ long populate_vma_page_range(struct vm_area_struct *vma, int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) { struct mm_struct *mm = current->mm; - unsigned long end, nstart, nend; + unsigned long end, nstart, nend = 0L; struct vm_area_struct *vma = NULL; int locked = 0; long ret = 0; + unsigned long nstart_prev = 0L - 1L, nend_prev = 0L - 1L; + int ign; end = start + len; + printk(KERN_WARNING "_mm_populate %lx %lx %lx %d ENTER\n", start, len, end, ignore_errors); + for (nstart = start; nstart < end; nstart = nend) { + ign = nstart == nstart_prev && nend == nend_prev; + nstart_prev = nstart; + nend_prev = nend; + if (!ign) + printk(KERN_WARNING "_mm_populate %lx %lx %lx %d LOOP %lx %d %ld\n", start, len, end, ignore_errors, nstart, locked, ret); /* * We want to fault in pages for [nstart; end) address range. * Find first corresponding VMA. @@ -1259,6 +1269,8 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) vma = find_vma(mm, nstart); } else if (nstart >= vma->vm_end) vma = vma->vm_next; + if (!ign && vma) + printk(KERN_WARNING "_mm_populate %lx %lx %lx %d vma->vm_start=%lx vma->vm_end=%lx vma->vm_flags=%lx\n", start, len, end, ignore_errors, vma->vm_start, vma->vm_end, vma->vm_flags); if (!vma || vma->vm_start >= end) break; /* @@ -1266,8 +1278,13 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) * range with the first VMA. Also, skip undesirable VMA types. */ nend = min(end, vma->vm_end); - if (vma->vm_flags & (VM_IO | VM_PFNMAP)) - continue; + if (!ign) + printk(KERN_WARNING "_mm_populate %lx %lx %lx %d nend=%lx %lx %lx\n", start, len, end, ignore_errors, nend, end, vma->vm_end); + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) { + if (!ign) + printk(KERN_WARNING "_mm_populate %lx %lx %lx %d LOOP-1 %lx\n", start, len, end, ignore_errors, vma->vm_flags); + continue; + } if (nstart < vma->vm_start) nstart = vma->vm_start; /* @@ -1277,6 +1294,8 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) */ ret = populate_vma_page_range(vma, nstart, nend, &locked); if (ret < 0) { + if (!ign) + printk(KERN_WARNING "_mm_populate %lx %lx %lx %d LOOP-2 %ld\n", start, len, end, ignore_errors, ret); if (ignore_errors) { ret = 0; continue; /* continue at next VMA */ @@ -1284,8 +1303,11 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) break; } nend = nstart + ret * PAGE_SIZE; + if (!ign) + printk(KERN_WARNING "_mm_populate %lx %lx %lx %d LOOP-3 ret=%ld nend=%lx\n", start, len, end, ignore_errors, ret, nend); ret = 0; } + printk(KERN_WARNING "_mm_populate END %lu %lu %d\n", start, len, locked); if (locked) up_read(&mm->mmap_sem); return ret; /* 0 or negative error code */
Attachment:
dmesg-out.txt.gz
Description: application/gzip
#include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <sys/mman.h> int main(int argc, char** argv) { if (argc > 1) { printf("Sleeping... %s\n", argv[1]); sleep(atoi(argv[1])); } printf("Before mlockall(MCL_CURRENT)\n"); mlockall(MCL_CURRENT); printf("After mlockall(MCL_CURRENT)\n"); }