commit 18b098af2890 ("vma_merge: set vma iterator to correct position.") added a vma_prev(vmi) call to vma_merge() at a point where it's still possible to bail out. My understanding is that this moves the VMA iterator back by one VMA. If you patch some extra logging into the kernel and inject a fake out-of-memory error at the vma_iter_prealloc() call in vma_split() (a real out-of-memory error there is very unlikely to happen in practice, I think - my understanding is that the kernel will basically kill every process on the system except for init before it starts failing GFP_KERNEL allocations that fit within a single slab, unless the allocation uses GFP_ACCOUNT or stuff like that, which the maple tree doesn't): ``` diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 7cecd49e078b..a7be4d6a5db6 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1454,9 +1454,16 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, prev = vma; ret = 0; + if (strcmp(current->comm, "FAILME") == 0) + pr_warn("%s: begin vma iteration\n", __func__); for_each_vma_range(vmi, vma, end) { cond_resched(); + if (strcmp(current->comm, "FAILME") == 0) { + pr_warn("%s: prev=%px, vma=%px (%016lx-%016lx)\n", + __func__, prev, vma, vma->vm_start, vma->vm_end); + } + BUG_ON(!vma_can_userfault(vma, vm_flags)); BUG_ON(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); @@ -1481,6 +1488,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vma_policy(vma), ((struct vm_userfaultfd_ctx){ ctx }), anon_vma_name(vma)); + if (strcmp(current->comm, "FAILME") == 0) + pr_warn("%s: vma_merge returned %px\n", __func__, prev); if (prev) { /* vma_merge() invalidated the mas */ vma = prev; diff --git a/mm/mmap.c b/mm/mmap.c index 3937479d0e07..fd435c40c43d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -990,7 +990,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, if (err) return NULL; - if (vma_iter_prealloc(vmi)) + if (strcmp(current->comm, "FAILME")==0 || vma_iter_prealloc(vmi)) return NULL; init_multi_vma_prep(&vp, vma, adjust, remove, remove2); ``` and then you run this reproducer: ``` #define _GNU_SOURCE #include <err.h> #include <stdlib.h> #include <unistd.h> #include <sys/syscall.h> #include <sys/prctl.h> #include <sys/mman.h> #include <sys/ioctl.h> #include <linux/userfaultfd.h> #ifndef UFFD_USER_MODE_ONLY #define UFFD_USER_MODE_ONLY 1 #endif #define SYSCHK(x) ({ \ typeof(x) __res = (x); \ if (__res == (typeof(x))-1) \ err(1, "SYSCHK(" #x ")"); \ __res; \ }) int main(void) { int uffd = SYSCHK(syscall(__NR_userfaultfd, UFFD_USER_MODE_ONLY)); struct uffdio_api api = { .api = UFFD_API, .features = 0 }; SYSCHK(ioctl(uffd, UFFDIO_API, &api)); /* create vma1 */ SYSCHK(mmap((void*)0x100000UL, 0x1000, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED_NOREPLACE, -1, 0)); /* set uffd on vma1 */ struct uffdio_register reg1 = { .range = { .start = 0x100000, .len = 0x1000 }, .mode = UFFDIO_REGISTER_MODE_MISSING }; SYSCHK(ioctl(uffd, UFFDIO_REGISTER, ®1)); /* create vma2 */ SYSCHK(mmap((void*)0x101000UL, 0x1000, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED_NOREPLACE, -1, 0)); /* tries to merge vma2 into vma1, with injected allocation failure causing merge failure */ SYSCHK(prctl(PR_SET_NAME, "FAILME")); struct uffdio_register reg2 = { .range = { .start = 0x101000, .len = 0x1000 }, .mode = UFFDIO_REGISTER_MODE_MISSING }; SYSCHK(ioctl(uffd, UFFDIO_REGISTER, ®2)); SYSCHK(prctl(PR_SET_NAME, "normal")); } ``` then you'll get this fun log output, showing that the same VMA (ffff88810c0b5e00) was visited by two iterations of the VMA iteration loop, and on the second iteration, prev==vma: [ 326.765586] userfaultfd_register: begin vma iteration [ 326.766985] userfaultfd_register: prev=ffff88810c0b5ef0, vma=ffff88810c0b5e00 (0000000000101000-0000000000102000) [ 326.768786] userfaultfd_register: vma_merge returned 0000000000000000 [ 326.769898] userfaultfd_register: prev=ffff88810c0b5e00, vma=ffff88810c0b5e00 (0000000000101000-0000000000102000) I don't know if this can lead to anything bad but it seems pretty clearly unintended?