On Thu, Jan 28, 2016 at 12:40 PM, Kirill A. Shutemov <kirill@xxxxxxxxxxxxx> wrote: > On Thu, Jan 28, 2016 at 11:55:14AM +0100, Dmitry Vyukov wrote: >> On Thu, Jan 28, 2016 at 11:51 AM, Kirill A. Shutemov >> <kirill@xxxxxxxxxxxxx> wrote: >> > On Thu, Jan 28, 2016 at 11:27:11AM +0100, Dmitry Vyukov wrote: >> >> Hello, >> >> >> >> The following program triggers VM_BUG_ON_PAGE(PageTail(page)): >> >> >> >> // autogenerated by syzkaller (http://github.com/google/syzkaller) >> >> #include <fcntl.h> >> >> #include <numaif.h> >> >> #include <sys/mman.h> >> >> #include <unistd.h> >> >> >> >> int main() >> >> { >> >> int fd; >> >> >> >> mmap((void*)0x20000000, 4096, PROT_READ|PROT_WRITE, >> >> MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0); >> >> fd = open("/dev/sg1", O_RDONLY|O_SYNC|0x100000); >> >> mmap((void*)0x20001000, 0x4000, PROT_READ|PROT_WRITE, >> >> MAP_PRIVATE|MAP_FIXED, fd, 0); >> >> mbind((void*)0x20000000, 0x4000, 0x8002, (void*)0x20002ff8, 3660, >> >> MPOL_MF_STRICT|MPOL_MF_MOVE); >> >> return 0; >> >> } >> > >> > I don't have sg1 in my VM. I changed it to sg0 and it doesn't trigger an >> > issue: mbind() returns -EINVAL as it supposed to. Hm.. >> >> I've attached my config, and here is how I start qemu: >> >> qemu-system-x86_64 -hda wheezy.img -net >> user,host=10.0.2.10,hostfwd=tcp::10022-:22 -net nic -nographic -kernel >> arch/x86/boot/bzImage -append "console=ttyS0 root=/dev/sda debug >> earlyprintk=serial slub_debug=UZ" -enable-kvm -pidfile vm_pid -m 2G >> -numa node,nodeid=0,cpus=0-1 -numa node,nodeid=1,cpus=2-3 -smp >> sockets=2,cores=2,threads=1 -usb -usbdevice mouse -usbdevice tablet >> -soundhw all > > Still no luck. :-/ > > Could you try patch below. I want to see what vm_flags are. 
> > diff --git a/mm/mempolicy.c b/mm/mempolicy.c > index 27d135408a22..93edf181f88a 100644 > --- a/mm/mempolicy.c > +++ b/mm/mempolicy.c > @@ -548,8 +548,10 @@ retry: > goto retry; > } > > - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) > + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { > + VM_BUG_ON_VMA(PageTail(page), vma); > migrate_page_add(page, qp->pagelist, flags); > + } > } > pte_unmap_unlock(pte - 1, ptl); > cond_resched(); Hmm... now I cannot reproduce it with the original program either. But I reproduced it with another program. Please try the one below. I've updated to 26cd83670f2f5a3d5b5514a1f7d96567cdb9558b and have a few pending fixes to mm (see below) including your VM_BUG_ON_VMA change above. This report contains vm_flags: vma ffff880062082450 start 0000000020001000 end 0000000020005000 next ffff880030460a60 prev ffff880062083910 mm ffff88002f691380 prot 8000000000000025 anon_vma ffff880062a55180 vm_ops ffffffff86d5aac0 pgoff 0 file ffff880060b1f0c0 private_data ffff880061240000 flags: 0x4144073(read|write|mayread|maywrite|mayexec|io|dontexpand|account) ------------[ cut here ]------------ kernel BUG at mm/mempolicy.c:552! 
invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN Modules linked in: CPU: 3 PID: 11434 Comm: tail Not tainted 4.5.0-rc1+ #301 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 task: ffff88002ecddf00 ti: ffff880033ed0000 task.ti: ffff880033ed0000 RIP: 0010:[<ffffffff817502db>] [<ffffffff817502db>] queue_pages_pte_range+0x8ab/0x10f0 RSP: 0018:ffff880033ed7a98 EFLAGS: 00010296 RAX: 0000000000000001 RBX: ffff880061e77018 RCX: 0000000000000000 RDX: 0000000000000001 RSI: 0000000000000001 RDI: ffffed00067daf29 RBP: ffff880033ed7b10 R08: 0000000000000001 R09: 0000000000000000 R10: 1ffff1000c41048a R11: 0000000000000001 R12: 0000000020003000 R13: dffffc0000000000 R14: ffffea0001b4c280 R15: 0000000020004000 FS: 00007f670c480700(0000) GS:ffff88006d700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000020003000 CR3: 00000000004e7000 CR4: 00000000000006e0 Stack: ffffea0000000001 ffff880033ed7c70 ffffea0001b4c200 ffff880062020840 ffffed00067daf8e ffffea0001b4c2a0 0000000000000023 ffff880062082450 ffff880033ed7c60 ffff880032747800 ffffffff8174fa30 dffffc0000000000 Call Trace: [< inline >] walk_pmd_range mm/pagewalk.c:50 [< inline >] walk_pud_range mm/pagewalk.c:90 [< inline >] walk_pgd_range mm/pagewalk.c:116 [<ffffffff817209b3>] __walk_page_range+0x653/0xcd0 mm/pagewalk.c:204 [<ffffffff81721164>] walk_page_range+0x134/0x300 mm/pagewalk.c:281 [<ffffffff8174cd8b>] queue_pages_range+0xfb/0x130 mm/mempolicy.c:689 [<ffffffff81755611>] do_mbind+0x2c1/0xdc0 mm/mempolicy.c:1241 [< inline >] SYSC_mbind mm/mempolicy.c:1353 [<ffffffff8175646d>] SyS_mbind+0x13d/0x150 mm/mempolicy.c:1335 [<ffffffff86653276>] entry_SYSCALL_64_fastpath+0x16/0x7a arch/x86/entry/entry_64.S:185 Code: 55 98 48 8d 42 ff e9 ce fa ff ff e8 00 9f e1 ff 4c 89 f7 e8 18 fc f2 ff e9 92 fe ff ff e8 ee 9e e1 ff 48 8b 7d c0 e8 85 87 f8 ff <0f> 0b e8 de 9e e1 ff 48 89 df 48 89 f8 0f 1f 40 00 48 89 c3 48 RIP [<ffffffff817502db>] 
queue_pages_pte_range+0x8ab/0x10f0 mm/mempolicy.c:552 RSP <ffff880033ed7a98> ---[ end trace 3f5635b07e2902a8 ]--- BUG: sleeping function called from invalid context at include/linux/sched.h:2805 in_atomic(): 1, irqs_disabled(): 0, pid: 11434, name: tail INFO: lockdep is turned off. CPU: 3 PID: 11434 Comm: tail Tainted: G D 4.5.0-rc1+ #301 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 00000000ffffffff ffff880033ed7588 ffffffff82be11cd ffff88002ecddf00 0000000000002caa 0000000000000000 ffff880033ed75b0 ffffffff813cb8cb ffff88002ecddf00 ffffffff867387a0 0000000000000af5 ffff880033ed75f0 Call Trace: [< inline >] __dump_stack lib/dump_stack.c:15 [<ffffffff82be11cd>] dump_stack+0x6f/0xa2 lib/dump_stack.c:50 [<ffffffff813cb8cb>] ___might_sleep+0x27b/0x3a0 kernel/sched/core.c:7703 [<ffffffff813cba80>] __might_sleep+0x90/0x1a0 kernel/sched/core.c:7665 [< inline >] threadgroup_change_begin include/linux/sched.h:2805 [<ffffffff813830d1>] exit_signals+0x81/0x430 kernel/signal.c:2392 [<ffffffff8135c3dc>] do_exit+0x23c/0x2cb0 kernel/exit.c:701 [<ffffffff811aa28f>] oops_end+0x9f/0xd0 arch/x86/kernel/dumpstack.c:250 [<ffffffff811aa686>] die+0x46/0x60 arch/x86/kernel/dumpstack.c:316 [< inline >] do_trap_no_signal arch/x86/kernel/traps.c:205 [<ffffffff811a3b9f>] do_trap+0x18f/0x380 arch/x86/kernel/traps.c:251 [<ffffffff811a400e>] do_error_trap+0x11e/0x280 arch/x86/kernel/traps.c:290 [<ffffffff811a527b>] do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:303 [<ffffffff86654f8e>] invalid_op+0x1e/0x30 arch/x86/entry/entry_64.S:830 [< inline >] walk_pmd_range mm/pagewalk.c:50 [< inline >] walk_pud_range mm/pagewalk.c:90 [< inline >] walk_pgd_range mm/pagewalk.c:116 [<ffffffff817209b3>] __walk_page_range+0x653/0xcd0 mm/pagewalk.c:204 [<ffffffff81721164>] walk_page_range+0x134/0x300 mm/pagewalk.c:281 [<ffffffff8174cd8b>] queue_pages_range+0xfb/0x130 mm/mempolicy.c:689 [<ffffffff81755611>] do_mbind+0x2c1/0xdc0 mm/mempolicy.c:1241 [< inline >] SYSC_mbind 
mm/mempolicy.c:1353 [<ffffffff8175646d>] SyS_mbind+0x13d/0x150 mm/mempolicy.c:1335 [<ffffffff86653276>] entry_SYSCALL_64_fastpath+0x16/0x7a arch/x86/entry/entry_64.S:185 note: tail[11434] exited with preempt_count 1 // autogenerated by syzkaller (http://github.com/google/syzkaller) #include <pthread.h> #include <stdint.h> #include <string.h> #include <sys/syscall.h> #include <unistd.h> long r[12]; void* thr(void* arg) { switch ((long)arg) { case 0: r[0] = syscall(SYS_mmap, 0x20000000ul, 0x1000ul, 0x3ul, 0x32ul, 0xfffffffffffffffful, 0x0ul); break; case 1: r[2] = syscall(SYS_open, "/dev/sg1", 0x101000ul, 0); break; case 2: r[3] = syscall(SYS_mmap, 0x20001000ul, 0x4000ul, 0x3ul, 0x12ul, r[2], 0x0ul); break; case 3: *(uint64_t*)0x20002ff8 = (uint64_t)0xffffffff; r[5] = syscall(SYS_mbind, 0x20000000ul, 0x4000ul, 0x8002ul, 0x20002ff8ul, 0xe4cul, 0x3ul); break; case 4: r[6] = syscall(SYS_mmap, 0x20005000ul, 0x1000ul, 0x3ul, 0x32ul, 0xfffffffffffffffful, 0x0ul); break; case 5: memcpy((void*)0x20005ffd, "\x7b\x3a\x00", 3); memcpy((void*)0x200053e8, "\x70\x6f\x73\x69\x78\x5f\x61\x63\x6c\x5f" "\x61\x63\x63\x65\x73\x73\x70\x70\x70\x31" "\x6b\x65\x79\x72\x69\x6e\x67\x73\x65\x6c" "\x66\x65\x74\x68\x30\x2f\x5c\x00", 38); memcpy((void*)0x20000ffe, "\x73\x65\x63\x75\x72\x69\x74\x79\x00", 9); r[10] = syscall(SYS_request_key, 0x20005ffdul, 0x200053e8ul, 0x20000ffeul, 0xfffffffffffffffful, 0, 0); break; case 6: r[11] = syscall(SYS_keyctl, 0x11ul, r[10], 0x20003fbaul, 0xa9ul, 0, 0); break; } return 0; } int main() { long i; pthread_t th[7]; memset(r, -1, sizeof(r)); for (i = 0; i < 7; i++) { pthread_create(&th[i], 0, thr, (void*)i); usleep(10000); } for (i = 0; i < 7; i++) { pthread_create(&th[i], 0, thr, (void*)i); if (i % 2 == 0) usleep(10000); } usleep(100000); return 0; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 27d1354..93edf18 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -548,8 +548,10 @@ retry: goto retry; } - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + if 
(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { + VM_BUG_ON_VMA(PageTail(page), vma); migrate_page_add(page, qp->pagelist, flags); + } } pte_unmap_unlock(pte - 1, ptl); cond_resched(); diff --git a/mm/mmap.c b/mm/mmap.c index 84b1262..082b8a9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -387,8 +387,9 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma) } #ifdef CONFIG_DEBUG_VM_RB -static int browse_rb(struct rb_root *root) +static int browse_rb(struct mm_struct *mm) { + struct rb_root *root = &mm->mm_rb; int i = 0, j, bug = 0; struct rb_node *nd, *pn = NULL; unsigned long prev = 0, pend = 0; @@ -411,12 +412,14 @@ static int browse_rb(struct rb_root *root) vma->vm_start, vma->vm_end); bug = 1; } + spin_lock(&mm->page_table_lock); if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { pr_emerg("free gap %lx, correct %lx\n", vma->rb_subtree_gap, vma_compute_subtree_gap(vma)); bug = 1; } + spin_unlock(&mm->page_table_lock); i++; pn = nd; prev = vma->vm_start; @@ -453,12 +456,16 @@ static void validate_mm(struct mm_struct *mm) struct vm_area_struct *vma = mm->mmap; while (vma) { + struct anon_vma *anon_vma = vma->anon_vma; struct anon_vma_chain *avc; - vma_lock_anon_vma(vma); - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) - anon_vma_interval_tree_verify(avc); - vma_unlock_anon_vma(vma); + if (anon_vma) { + anon_vma_lock_read(anon_vma); + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_verify(avc); + anon_vma_unlock_read(anon_vma); + } + highest_address = vma->vm_end; vma = vma->vm_next; i++; @@ -472,7 +479,7 @@ static void validate_mm(struct mm_struct *mm) mm->highest_vm_end, highest_address); bug = 1; } - i = browse_rb(&mm->mm_rb); + i = browse_rb(mm); if (i != mm->map_count) { if (i != -1) pr_emerg("map_count %d rb %d\n", mm->map_count, i); -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: <a href="mailto:dont@xxxxxxxxx"> email@xxxxxxxxx </a>