On Wed, Aug 07, 2024 at 11:45:56AM GMT, Pengfei Xu wrote: > Hi Lorenzo Stoakes, > > Greetings! > > I used syzkaller and found > KASAN: slab-use-after-free Read in userfaultfd_set_ctx in next-20240805. > > Bisected the first bad commit: > 4651ba8201cf userfaultfd: move core VMA manipulation logic to mm/userfaultfd.c Hi, Thanks for this, I will investigate as a priority and work up a fix. As it's simply a refactor I suspect this should be relatively straightforward. > > All detailed info: https://github.com/xupengfe/syzkaller_logs/tree/main/240806_122723_userfaultfd_set_ctx > Syzkaller repro code: https://github.com/xupengfe/syzkaller_logs/blob/main/240806_122723_userfaultfd_set_ctx/repro.c > Syzkaller repro syscall steps: https://github.com/xupengfe/syzkaller_logs/blob/main/240806_122723_userfaultfd_set_ctx/repro.prog > Syzkaller analysis report: https://github.com/xupengfe/syzkaller_logs/blob/main/240806_122723_userfaultfd_set_ctx/repro.report > Kconfig(make olddefconfig): https://github.com/xupengfe/syzkaller_logs/blob/main/240806_122723_userfaultfd_set_ctx/kconfig_origin > Bisect info: https://github.com/xupengfe/syzkaller_logs/blob/main/240806_122723_userfaultfd_set_ctx/bisect_info.log > Dmesg: https://github.com/xupengfe/syzkaller_logs/blob/main/240806_122723_userfaultfd_set_ctx/d6dbc9f56c3a70e915625b6f1887882c23dc5c91_dmesg.log > bzImage: https://github.com/xupengfe/syzkaller_logs/raw/main/240806_122723_userfaultfd_set_ctx/bzImage_d6dbc9f56c3a70e915625b6f1887882c23dc5c91.tar.gz > > " > [ 29.675551] ================================================================== > [ 29.676133] BUG: KASAN: slab-use-after-free in userfaultfd_set_ctx+0x31c/0x360 > [ 29.676716] Read of size 8 at addr ffff888027c5f100 by task repro/1498 > [ 29.677218] > [ 29.677358] CPU: 0 UID: 0 PID: 1498 Comm: repro Not tainted 6.11.0-rc2-next-20240805-d6dbc9f56c3a #1 > [ 29.678053] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 
04/01/2014 > [ 29.678910] Call Trace: > [ 29.679117] <TASK> > [ 29.679296] dump_stack_lvl+0xea/0x150 > [ 29.679622] print_report+0xce/0x610 > [ 29.679924] ? userfaultfd_set_ctx+0x31c/0x360 > [ 29.680289] ? kasan_complete_mode_report_info+0x80/0x200 > [ 29.680716] ? userfaultfd_set_ctx+0x31c/0x360 > [ 29.681077] kasan_report+0xcc/0x110 > [ 29.681372] ? userfaultfd_set_ctx+0x31c/0x360 > [ 29.681729] __asan_report_load8_noabort+0x18/0x20 > [ 29.682118] userfaultfd_set_ctx+0x31c/0x360 > [ 29.682465] userfaultfd_clear_vma+0x104/0x190 > [ 29.682826] userfaultfd_release_all+0x294/0x4a0 > [ 29.683201] ? __pfx_userfaultfd_release_all+0x10/0x10 > [ 29.683615] ? __this_cpu_preempt_check+0x21/0x30 > [ 29.684003] ? __pfx_userfaultfd_release+0x10/0x10 > [ 29.684389] userfaultfd_release+0x112/0x1e0 > [ 29.684735] ? __pfx_userfaultfd_release+0x10/0x10 > [ 29.685114] ? evm_file_release+0x193/0x1f0 > [ 29.685454] __fput+0x426/0xbc0 > [ 29.685719] ? __sanitizer_cov_trace_const_cmp2+0x1c/0x30 > [ 29.686153] __fput_sync+0x58/0x70 > [ 29.686435] __x64_sys_close+0x93/0x120 > [ 29.686744] x64_sys_call+0x189a/0x20d0 > [ 29.687066] do_syscall_64+0x6d/0x140 > [ 29.687371] entry_SYSCALL_64_after_hwframe+0x76/0x7e > [ 29.687779] RIP: 0033:0x7f3b67b3f247 > [ 29.688078] Code: ff e8 cd e3 01 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 41 c3 48 83 ec 18 89 7c 24 0c e8 c3 c9 f5 ff > [ 29.689519] RSP: 002b:00007ffd1f0ac7d8 EFLAGS: 00000246 ORIG_RAX: 0000000000000003 > [ 29.690140] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f3b67b3f247 > [ 29.690717] RDX: 0000000020000100 RSI: 000000008010aa01 RDI: 0000000000000003 > [ 29.691270] RBP: 00007ffd1f0ac7f0 R08: 00007ffd1f0ac7f0 R09: 00007ffd1f0ac7f0 > [ 29.691822] R10: 00007ffd1f0ac7f0 R11: 0000000000000246 R12: 00007ffd1f0ac968 > [ 29.692375] R13: 0000000000401bf9 R14: 0000000000403e08 R15: 00007f3b67c72000 > [ 29.692939] </TASK> > [ 29.693124] > [ 
29.693258] Allocated by task 1498: > [ 29.693545] kasan_save_stack+0x2c/0x60 > [ 29.693875] kasan_save_track+0x18/0x40 > [ 29.694205] kasan_save_alloc_info+0x3c/0x50 > [ 29.694576] __kasan_slab_alloc+0x62/0x80 > [ 29.694921] kmem_cache_alloc_noprof+0x114/0x370 > [ 29.695319] vm_area_dup+0x2a/0x1b0 > [ 29.695630] __split_vma+0x188/0x1020 > [ 29.695952] vma_modify+0x1fc/0x390 > [ 29.696250] userfaultfd_clear_vma+0xd4/0x190 > [ 29.696609] userfaultfd_ioctl+0x3c0b/0x4560 > [ 29.696964] __x64_sys_ioctl+0x1b9/0x230 > [ 29.697295] x64_sys_call+0x1209/0x20d0 > [ 29.697620] do_syscall_64+0x6d/0x140 > [ 29.697937] entry_SYSCALL_64_after_hwframe+0x76/0x7e > [ 29.698364] > [ 29.698508] Freed by task 1505: > [ 29.698779] kasan_save_stack+0x2c/0x60 > [ 29.699110] kasan_save_track+0x18/0x40 > [ 29.699441] kasan_save_free_info+0x3f/0x60 > [ 29.699797] __kasan_slab_free+0x47/0x60 > [ 29.700137] kmem_cache_free+0x2f2/0x4b0 > [ 29.700471] vm_area_free_rcu_cb+0x7f/0xa0 > [ 29.700819] rcu_core+0x877/0x18f0 > [ 29.701123] rcu_core_si+0x12/0x20 > [ 29.701421] handle_softirqs+0x1c7/0x870 > [ 29.701760] __irq_exit_rcu+0xa9/0x120 > [ 29.702082] irq_exit_rcu+0x12/0x30 > [ 29.702386] sysvec_apic_timer_interrupt+0xa5/0xc0 > [ 29.702802] asm_sysvec_apic_timer_interrupt+0x1f/0x30 > [ 29.703237] > [ 29.703377] Last potentially related work creation: > [ 29.703782] kasan_save_stack+0x2c/0x60 > [ 29.704114] __kasan_record_aux_stack+0x93/0xb0 > [ 29.704503] kasan_record_aux_stack_noalloc+0xf/0x20 > [ 29.704923] __call_rcu_common.constprop.0+0x72/0x6b0 > [ 29.705349] call_rcu+0x12/0x20 > [ 29.705625] vm_area_free+0x26/0x30 > [ 29.705928] vma_complete+0x57e/0xf60 > [ 29.706245] vma_merge+0x166b/0x3540 > [ 29.706555] vma_modify+0x9f/0x390 > [ 29.706853] userfaultfd_clear_vma+0xd4/0x190 > [ 29.707227] userfaultfd_release_all+0x294/0x4a0 > [ 29.707621] userfaultfd_release+0x112/0x1e0 > [ 29.707991] __fput+0x426/0xbc0 > [ 29.708267] __fput_sync+0x58/0x70 > [ 29.708563] __x64_sys_close+0x93/0x120 > [ 
29.708891] x64_sys_call+0x189a/0x20d0 > [ 29.709220] do_syscall_64+0x6d/0x140 > [ 29.709538] entry_SYSCALL_64_after_hwframe+0x76/0x7e > [ 29.709965] > [ 29.710105] The buggy address belongs to the object at ffff888027c5f0f0 > [ 29.710105] which belongs to the cache vm_area_struct of size 176 > [ 29.711130] The buggy address is located 16 bytes inside of > [ 29.711130] freed 176-byte region [ffff888027c5f0f0, ffff888027c5f1a0) > [ 29.712104] > [ 29.712245] The buggy address belongs to the physical page: > [ 29.712703] page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x27c5f > [ 29.713349] memcg:ffff8880198eaa01 > [ 29.713639] flags: 0xfffffc0000000(node=0|zone=1|lastcpupid=0x1fffff) > [ 29.714173] page_type: 0xfdffffff(slab) > [ 29.714507] raw: 000fffffc0000000 ffff88800d319dc0 dead000000000122 0000000000000000 > [ 29.715137] raw: 0000000000000000 0000000000110011 00000001fdffffff ffff8880198eaa01 > [ 29.715765] page dumped because: kasan: bad access detected > [ 29.716220] > [ 29.716360] Memory state around the buggy address: > [ 29.716756] ffff888027c5f000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 > [ 29.717349] ffff888027c5f080: 00 00 00 00 00 00 fc fc fc fc fc fc fc fc fa fb > [ 29.717940] >ffff888027c5f100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb > [ 29.718521] ^ > [ 29.718796] ffff888027c5f180: fb fb fb fb fc fc fc fc fc fc fc fc 00 00 00 00 > [ 29.719388] ffff888027c5f200: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 > [ 29.720025] ================================================================== > [ 29.720671] Disabling lock debugging due to kernel taint > " > > Thanks! > > --- > > If you don't need the following environment to reproduce the problem or if you > already have one reproduced environment, please ignore the following information. 
> > How to reproduce: > git clone https://gitlab.com/xupengfe/repro_vm_env.git > cd repro_vm_env > tar -xvf repro_vm_env.tar.gz > cd repro_vm_env; ./start3.sh // it needs qemu-system-x86_64 and I used v7.1.0 > // start3.sh will load bzImage_2241ab53cbb5cdb08a6b2d4688feb13971058f65 v6.2-rc5 kernel > // You can change the bzImage_xxx as you want > // You may need to remove the line "-drive if=pflash,format=raw,readonly=on,file=./OVMF_CODE.fd \" for a different qemu version > You can use the command below to log in; there is no password for root. > ssh -p 10023 root@localhost > > After logging in to the vm (virtual machine) successfully, you can transfer the reproducer > binary to the vm as shown below, and reproduce the problem in the vm: > gcc -pthread -o repro repro.c > scp -P 10023 repro root@localhost:/root/ > > Get the bzImage for the target kernel: > Please use the target kconfig and copy it to kernel_src/.config > make olddefconfig > make -jx bzImage // x should be equal to or less than the number of CPUs your PC has > > Set the bzImage file in the above start3.sh to load the target kernel in the vm. > > > Tips: > If you already have qemu-system-x86_64, please ignore the info below. > If you want to install qemu v7.1.0: > git clone https://github.com/qemu/qemu.git > cd qemu > git checkout -f v7.1.0 > mkdir build > cd build > yum install -y ninja-build.x86_64 > yum -y install libslirp-devel.x86_64 > ../configure --target-list=x86_64-softmmu --enable-kvm --enable-vnc --enable-gtk --enable-sdl --enable-usb-redir --enable-slirp > make > make install > > Best Regards, > Thanks! > > > On 2024-07-29 at 12:50:35 +0100, Lorenzo Stoakes wrote: > > This patch forms part of a patch series intending to separate out VMA logic > > and render it testable from userspace, which requires that core > > manipulation functions be exposed in an mm/-internal header file. > > > > In order to do this, we must abstract APIs we wish to test, in this > > instance functions which ultimately invoke vma_modify(). 
> > > > This patch therefore moves all logic which ultimately invokes vma_modify() > > to mm/userfaultfd.c, trying to transfer code at a functional granularity > > where possible. > > > > Reviewed-by: Vlastimil Babka <vbabka@xxxxxxx> > > Reviewed-by: Liam R. Howlett <Liam.Howlett@xxxxxxxxxx> > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx> > > --- > > fs/userfaultfd.c | 160 +++----------------------------- > > include/linux/userfaultfd_k.h | 19 ++++ > > mm/userfaultfd.c | 168 ++++++++++++++++++++++++++++++++++ > > 3 files changed, 198 insertions(+), 149 deletions(-) > > > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c > > index 27a3e9285fbf..b3ed7207df7e 100644 > > --- a/fs/userfaultfd.c > > +++ b/fs/userfaultfd.c > > @@ -104,21 +104,6 @@ bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) > > return ctx->features & UFFD_FEATURE_WP_UNPOPULATED; > > } > > > > -static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, > > - vm_flags_t flags) > > -{ > > - const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP; > > - > > - vm_flags_reset(vma, flags); > > - /* > > - * For shared mappings, we want to enable writenotify while > > - * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply > > - * recalculate vma->vm_page_prot whenever userfaultfd-wp changes. 
> > - */ > > - if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed) > > - vma_set_page_prot(vma); > > -} > > - > > static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, > > int wake_flags, void *key) > > { > > @@ -615,22 +600,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, > > spin_unlock_irq(&ctx->event_wqh.lock); > > > > if (release_new_ctx) { > > - struct vm_area_struct *vma; > > - struct mm_struct *mm = release_new_ctx->mm; > > - VMA_ITERATOR(vmi, mm, 0); > > - > > - /* the various vma->vm_userfaultfd_ctx still points to it */ > > - mmap_write_lock(mm); > > - for_each_vma(vmi, vma) { > > - if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { > > - vma_start_write(vma); > > - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; > > - userfaultfd_set_vm_flags(vma, > > - vma->vm_flags & ~__VM_UFFD_FLAGS); > > - } > > - } > > - mmap_write_unlock(mm); > > - > > + userfaultfd_release_new(release_new_ctx); > > userfaultfd_ctx_put(release_new_ctx); > > } > > > > @@ -662,9 +632,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) > > return 0; > > > > if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) { > > - vma_start_write(vma); > > - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; > > - userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); > > + userfaultfd_reset_ctx(vma); > > return 0; > > } > > > > @@ -749,9 +717,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, > > up_write(&ctx->map_changing_lock); > > } else { > > /* Drop uffd context if remap feature not enabled */ > > - vma_start_write(vma); > > - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; > > - userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); > > + userfaultfd_reset_ctx(vma); > > } > > } > > > > @@ -870,53 +836,13 @@ static int userfaultfd_release(struct inode *inode, struct file *file) > > { > > struct userfaultfd_ctx *ctx = file->private_data; > > struct mm_struct *mm = ctx->mm; > > - struct vm_area_struct 
*vma, *prev; > > /* len == 0 means wake all */ > > struct userfaultfd_wake_range range = { .len = 0, }; > > - unsigned long new_flags; > > - VMA_ITERATOR(vmi, mm, 0); > > > > WRITE_ONCE(ctx->released, true); > > > > - if (!mmget_not_zero(mm)) > > - goto wakeup; > > - > > - /* > > - * Flush page faults out of all CPUs. NOTE: all page faults > > - * must be retried without returning VM_FAULT_SIGBUS if > > - * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx > > - * changes while handle_userfault released the mmap_lock. So > > - * it's critical that released is set to true (above), before > > - * taking the mmap_lock for writing. > > - */ > > - mmap_write_lock(mm); > > - prev = NULL; > > - for_each_vma(vmi, vma) { > > - cond_resched(); > > - BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ > > - !!(vma->vm_flags & __VM_UFFD_FLAGS)); > > - if (vma->vm_userfaultfd_ctx.ctx != ctx) { > > - prev = vma; > > - continue; > > - } > > - /* Reset ptes for the whole vma range if wr-protected */ > > - if (userfaultfd_wp(vma)) > > - uffd_wp_range(vma, vma->vm_start, > > - vma->vm_end - vma->vm_start, false); > > - new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; > > - vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start, > > - vma->vm_end, new_flags, > > - NULL_VM_UFFD_CTX); > > - > > - vma_start_write(vma); > > - userfaultfd_set_vm_flags(vma, new_flags); > > - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; > > + userfaultfd_release_all(mm, ctx); > > > > - prev = vma; > > - } > > - mmap_write_unlock(mm); > > - mmput(mm); > > -wakeup: > > /* > > * After no new page faults can wait on this fault_*wqh, flush > > * the last page faults that may have been already waiting on > > @@ -1293,14 +1219,14 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, > > unsigned long arg) > > { > > struct mm_struct *mm = ctx->mm; > > - struct vm_area_struct *vma, *prev, *cur; > > + struct vm_area_struct *vma, *cur; > > int ret; > > struct uffdio_register uffdio_register; > > struct 
uffdio_register __user *user_uffdio_register; > > - unsigned long vm_flags, new_flags; > > + unsigned long vm_flags; > > bool found; > > bool basic_ioctls; > > - unsigned long start, end, vma_end; > > + unsigned long start, end; > > struct vma_iterator vmi; > > bool wp_async = userfaultfd_wp_async_ctx(ctx); > > > > @@ -1428,57 +1354,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, > > } for_each_vma_range(vmi, cur, end); > > BUG_ON(!found); > > > > - vma_iter_set(&vmi, start); > > - prev = vma_prev(&vmi); > > - if (vma->vm_start < start) > > - prev = vma; > > - > > - ret = 0; > > - for_each_vma_range(vmi, vma, end) { > > - cond_resched(); > > - > > - BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async)); > > - BUG_ON(vma->vm_userfaultfd_ctx.ctx && > > - vma->vm_userfaultfd_ctx.ctx != ctx); > > - WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); > > - > > - /* > > - * Nothing to do: this vma is already registered into this > > - * userfaultfd and with the right tracking mode too. > > - */ > > - if (vma->vm_userfaultfd_ctx.ctx == ctx && > > - (vma->vm_flags & vm_flags) == vm_flags) > > - goto skip; > > - > > - if (vma->vm_start > start) > > - start = vma->vm_start; > > - vma_end = min(end, vma->vm_end); > > - > > - new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; > > - vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, > > - new_flags, > > - (struct vm_userfaultfd_ctx){ctx}); > > - if (IS_ERR(vma)) { > > - ret = PTR_ERR(vma); > > - break; > > - } > > - > > - /* > > - * In the vma_merge() successful mprotect-like case 8: > > - * the next vma was merged into the current one and > > - * the current one has not been updated yet. 
> > - */ > > - vma_start_write(vma); > > - userfaultfd_set_vm_flags(vma, new_flags); > > - vma->vm_userfaultfd_ctx.ctx = ctx; > > - > > - if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) > > - hugetlb_unshare_all_pmds(vma); > > - > > - skip: > > - prev = vma; > > - start = vma->vm_end; > > - } > > + ret = userfaultfd_register_range(ctx, vma, vm_flags, start, end, > > + wp_async); > > > > out_unlock: > > mmap_write_unlock(mm); > > @@ -1519,7 +1396,6 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, > > struct vm_area_struct *vma, *prev, *cur; > > int ret; > > struct uffdio_range uffdio_unregister; > > - unsigned long new_flags; > > bool found; > > unsigned long start, end, vma_end; > > const void __user *buf = (void __user *)arg; > > @@ -1622,27 +1498,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, > > wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range); > > } > > > > - /* Reset ptes for the whole vma range if wr-protected */ > > - if (userfaultfd_wp(vma)) > > - uffd_wp_range(vma, start, vma_end - start, false); > > - > > - new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; > > - vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, > > - new_flags, NULL_VM_UFFD_CTX); > > + vma = userfaultfd_clear_vma(&vmi, prev, vma, > > + start, vma_end); > > if (IS_ERR(vma)) { > > ret = PTR_ERR(vma); > > break; > > } > > > > - /* > > - * In the vma_merge() successful mprotect-like case 8: > > - * the next vma was merged into the current one and > > - * the current one has not been updated yet. 
> > - */ > > - vma_start_write(vma); > > - userfaultfd_set_vm_flags(vma, new_flags); > > - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; > > - > > skip: > > prev = vma; > > start = vma->vm_end; > > diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h > > index a12bcf042551..9fc6ce15c499 100644 > > --- a/include/linux/userfaultfd_k.h > > +++ b/include/linux/userfaultfd_k.h > > @@ -267,6 +267,25 @@ extern void userfaultfd_unmap_complete(struct mm_struct *mm, > > extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma); > > extern bool userfaultfd_wp_async(struct vm_area_struct *vma); > > > > +void userfaultfd_reset_ctx(struct vm_area_struct *vma); > > + > > +struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, > > + struct vm_area_struct *prev, > > + struct vm_area_struct *vma, > > + unsigned long start, > > + unsigned long end); > > + > > +int userfaultfd_register_range(struct userfaultfd_ctx *ctx, > > + struct vm_area_struct *vma, > > + unsigned long vm_flags, > > + unsigned long start, unsigned long end, > > + bool wp_async); > > + > > +void userfaultfd_release_new(struct userfaultfd_ctx *ctx); > > + > > +void userfaultfd_release_all(struct mm_struct *mm, > > + struct userfaultfd_ctx *ctx); > > + > > #else /* CONFIG_USERFAULTFD */ > > > > /* mm helpers */ > > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c > > index e54e5c8907fa..3b7715ecf292 100644 > > --- a/mm/userfaultfd.c > > +++ b/mm/userfaultfd.c > > @@ -1760,3 +1760,171 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, > > VM_WARN_ON(!moved && !err); > > return moved ? 
moved : err; > > } > > + > > +static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, > > + vm_flags_t flags) > > +{ > > + const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP; > > + > > + vm_flags_reset(vma, flags); > > + /* > > + * For shared mappings, we want to enable writenotify while > > + * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply > > + * recalculate vma->vm_page_prot whenever userfaultfd-wp changes. > > + */ > > + if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed) > > + vma_set_page_prot(vma); > > +} > > + > > +static void userfaultfd_set_ctx(struct vm_area_struct *vma, > > + struct userfaultfd_ctx *ctx, > > + unsigned long flags) > > +{ > > + vma_start_write(vma); > > + vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx}; > > + userfaultfd_set_vm_flags(vma, > > + (vma->vm_flags & ~__VM_UFFD_FLAGS) | flags); > > +} > > + > > +void userfaultfd_reset_ctx(struct vm_area_struct *vma) > > +{ > > + userfaultfd_set_ctx(vma, NULL, 0); > > +} > > + > > +struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, > > + struct vm_area_struct *prev, > > + struct vm_area_struct *vma, > > + unsigned long start, > > + unsigned long end) > > +{ > > + struct vm_area_struct *ret; > > + > > + /* Reset ptes for the whole vma range if wr-protected */ > > + if (userfaultfd_wp(vma)) > > + uffd_wp_range(vma, start, end - start, false); > > + > > + ret = vma_modify_flags_uffd(vmi, prev, vma, start, end, > > + vma->vm_flags & ~__VM_UFFD_FLAGS, > > + NULL_VM_UFFD_CTX); > > + > > + /* > > + * In the vma_merge() successful mprotect-like case 8: > > + * the next vma was merged into the current one and > > + * the current one has not been updated yet. > > + */ > > + if (!IS_ERR(ret)) > > + userfaultfd_reset_ctx(vma); > > + > > + return ret; > > +} > > + > > +/* Assumes mmap write lock taken, and mm_struct pinned. 
*/ > > +int userfaultfd_register_range(struct userfaultfd_ctx *ctx, > > + struct vm_area_struct *vma, > > + unsigned long vm_flags, > > + unsigned long start, unsigned long end, > > + bool wp_async) > > +{ > > + VMA_ITERATOR(vmi, ctx->mm, start); > > + struct vm_area_struct *prev = vma_prev(&vmi); > > + unsigned long vma_end; > > + unsigned long new_flags; > > + > > + if (vma->vm_start < start) > > + prev = vma; > > + > > + for_each_vma_range(vmi, vma, end) { > > + cond_resched(); > > + > > + BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async)); > > + BUG_ON(vma->vm_userfaultfd_ctx.ctx && > > + vma->vm_userfaultfd_ctx.ctx != ctx); > > + WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); > > + > > + /* > > + * Nothing to do: this vma is already registered into this > > + * userfaultfd and with the right tracking mode too. > > + */ > > + if (vma->vm_userfaultfd_ctx.ctx == ctx && > > + (vma->vm_flags & vm_flags) == vm_flags) > > + goto skip; > > + > > + if (vma->vm_start > start) > > + start = vma->vm_start; > > + vma_end = min(end, vma->vm_end); > > + > > + new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; > > + vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, > > + new_flags, > > + (struct vm_userfaultfd_ctx){ctx}); > > + if (IS_ERR(vma)) > > + return PTR_ERR(vma); > > + > > + /* > > + * In the vma_merge() successful mprotect-like case 8: > > + * the next vma was merged into the current one and > > + * the current one has not been updated yet. 
> > + */ > > + userfaultfd_set_ctx(vma, ctx, vm_flags); > > + > > + if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) > > + hugetlb_unshare_all_pmds(vma); > > + > > +skip: > > + prev = vma; > > + start = vma->vm_end; > > + } > > + > > + return 0; > > +} > > + > > +void userfaultfd_release_new(struct userfaultfd_ctx *ctx) > > +{ > > + struct mm_struct *mm = ctx->mm; > > + struct vm_area_struct *vma; > > + VMA_ITERATOR(vmi, mm, 0); > > + > > + /* the various vma->vm_userfaultfd_ctx still points to it */ > > + mmap_write_lock(mm); > > + for_each_vma(vmi, vma) { > > + if (vma->vm_userfaultfd_ctx.ctx == ctx) > > + userfaultfd_reset_ctx(vma); > > + } > > + mmap_write_unlock(mm); > > +} > > + > > +void userfaultfd_release_all(struct mm_struct *mm, > > + struct userfaultfd_ctx *ctx) > > +{ > > + struct vm_area_struct *vma, *prev; > > + VMA_ITERATOR(vmi, mm, 0); > > + > > + if (!mmget_not_zero(mm)) > > + return; > > + > > + /* > > + * Flush page faults out of all CPUs. NOTE: all page faults > > + * must be retried without returning VM_FAULT_SIGBUS if > > + * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx > > + * changes while handle_userfault released the mmap_lock. So > > + * it's critical that released is set to true (above), before > > + * taking the mmap_lock for writing. > > + */ > > + mmap_write_lock(mm); > > + prev = NULL; > > + for_each_vma(vmi, vma) { > > + cond_resched(); > > + BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ > > + !!(vma->vm_flags & __VM_UFFD_FLAGS)); > > + if (vma->vm_userfaultfd_ctx.ctx != ctx) { > > + prev = vma; > > + continue; > > + } > > + > > + vma = userfaultfd_clear_vma(&vmi, prev, vma, > > + vma->vm_start, vma->vm_end); > > + prev = vma; > > + } > > + mmap_write_unlock(mm); > > + mmput(mm); > > +} > > -- > > 2.45.2 > >