On Tue, 2019-11-05 at 13:22 -0500, Johannes Weiner wrote:
> Judging from Robert's stack captures, the task is not hung but
> busy-looping in __mm_populate(). AFAICS, the only way this can occur
> is if populate_vma_page_range() returns 0 and we don't advance the
> iteration position (if it returned an error, we wouldn't reset nend
> and move on to the next vma as ignore_errors is 1 for mlockall.)
>
> populate_vma_page_range() returns 0 when the first page is not found
> and faultin_page() returns -EBUSY (if it were processing pages, or if
> the error from faultin_page() would be a different one, we would
> return the number of pages processed or -error).
>
> faultin_page() returns -EBUSY when VM_FAULT_RETRY is set, i.e. we
> dropped the mmap_sem in order to initiate IO and require a retry. That
> is consistent with the bisect result (new VM_FAULT_RETRY conditions).
>
> At this point, regular page fault would retry with FAULT_FLAG_TRIED to
> indicate that the mmap_sem cannot be dropped a second time. But this
> mlock path doesn't set that flag and we can loop repeatedly. That is
> something we probably need to fix with a FOLL_TRIED somewhere.
>
> What I don't quite understand yet is why the fault path doesn't make
> progress eventually. We must drop the mmap_sem without changing the
> state in any way. How can we keep looping on the same page?

I've played around a bit by adding some `printk` messages (see attached patch) and found exactly what you describe: it's busy-looping in __mm_populate(), because populate_vma_page_range() returns 0. However, there's a slightly interesting thing in there.
Before it loops forever, it processes

  nstart=5574d92e1000 locked=1 vma->vm_start=7f5e4bfec000 vma->vm_end=7f5e4c011000 vma->vm_flags=8002071

for which populate_vma_page_range() returns 1, then it processes this over and over again:

  nstart=7f5e4bfed000 locked=0 vma->vm_start=7f5e4bfec000 (same as before) vma->vm_end=7f5e4c011000 vma->vm_flags=8002071

These are the additional dmesg messages with timestamp 105.x. At timestamp 106.x, I hit ctrl-c (ret=-512). The dmesg output with the patch applied (on top of the v5.3.8 git tag) is attached.
diff --git a/mm/gup.c b/mm/gup.c index 98f13ab37bac..3bc25fd44433 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -3,6 +3,7 @@ #include <linux/errno.h> #include <linux/err.h> #include <linux/spinlock.h> +#include <linux/printk.h> #include <linux/mm.h> #include <linux/memremap.h> @@ -1241,14 +1242,23 @@ long populate_vma_page_range(struct vm_area_struct *vma, int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) { struct mm_struct *mm = current->mm; - unsigned long end, nstart, nend; + unsigned long end, nstart, nend = 0L; struct vm_area_struct *vma = NULL; int locked = 0; long ret = 0; + unsigned long nstart_prev = 0L - 1L, nend_prev = 0L - 1L; + int ign; end = start + len; + printk(KERN_WARNING "_mm_populate %lx %lx %lx %d ENTER\n", start, len, end, ignore_errors); + for (nstart = start; nstart < end; nstart = nend) { + ign = nstart == nstart_prev && nend == nend_prev; + nstart_prev = nstart; + nend_prev = nend; + if (!ign) + printk(KERN_WARNING "_mm_populate %lx %lx %lx %d LOOP %lx %d %ld\n", start, len, end, ignore_errors, nstart, locked, ret); /* * We want to fault in pages for [nstart; end) address range. * Find first corresponding VMA. @@ -1259,6 +1269,8 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) vma = find_vma(mm, nstart); } else if (nstart >= vma->vm_end) vma = vma->vm_next; + if (!ign && vma) + printk(KERN_WARNING "_mm_populate %lx %lx %lx %d vma->vm_start=%lx vma->vm_end=%lx vma->vm_flags=%lx\n", start, len, end, ignore_errors, vma->vm_start, vma->vm_end, vma->vm_flags); if (!vma || vma->vm_start >= end) break; /* @@ -1266,8 +1278,13 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) * range with the first VMA. Also, skip undesirable VMA types. 
*/ nend = min(end, vma->vm_end); - if (vma->vm_flags & (VM_IO | VM_PFNMAP)) - continue; + if (!ign) + printk(KERN_WARNING "_mm_populate %lx %lx %lx %d nend=%lx %lx %lx\n", start, len, end, ignore_errors, nend, end, vma->vm_end); + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) { + if (!ign) + printk(KERN_WARNING "_mm_populate %lx %lx %lx %d LOOP-1 %lx\n", start, len, end, ignore_errors, vma->vm_flags); + continue; + } if (nstart < vma->vm_start) nstart = vma->vm_start; /* @@ -1277,6 +1294,8 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) */ ret = populate_vma_page_range(vma, nstart, nend, &locked); if (ret < 0) { + if (!ign) + printk(KERN_WARNING "_mm_populate %lx %lx %lx %d LOOP-2 %ld\n", start, len, end, ignore_errors, ret); if (ignore_errors) { ret = 0; continue; /* continue at next VMA */ @@ -1284,8 +1303,11 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) break; } nend = nstart + ret * PAGE_SIZE; + if (!ign) + printk(KERN_WARNING "_mm_populate %lx %lx %lx %d LOOP-3 ret=%ld nend=%lx\n", start, len, end, ignore_errors, ret, nend); ret = 0; } + printk(KERN_WARNING "_mm_populate END %lu %lu %d\n", start, len, locked); if (locked) up_read(&mm->mmap_sem); return ret; /* 0 or negative error code */
Attachment:
dmesg-out.txt.gz
Description: application/gzip