The patch titled Subject: fs/proc: task_mmu.c: don't read mapcount for migration entry has been added to the -mm tree. Its filename is fs-proc-task_mmuc-dont-read-mapcount-for-migration-entry.patch This patch should soon appear at https://ozlabs.org/~akpm/mmots/broken-out/fs-proc-task_mmuc-dont-read-mapcount-for-migration-entry.patch and later at https://ozlabs.org/~akpm/mmotm/broken-out/fs-proc-task_mmuc-dont-read-mapcount-for-migration-entry.patch Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/process/submit-checklist.rst when testing your code *** The -mm tree is included into linux-next and is updated there every 3-4 working days ------------------------------------------------------ From: Yang Shi <shy828301@xxxxxxxxx> Subject: fs/proc: task_mmu.c: don't read mapcount for migration entry syzbot reported the below BUG: kernel BUG at include/linux/page-flags.h:785! invalid opcode: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 4392 Comm: syz-executor560 Not tainted 5.16.0-rc6-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline] RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744 Code: e8 d3 16 d1 ff 48 c7 c6 c0 00 b6 89 48 89 ef e8 94 4e 04 00 0f 0b e8 bd 16 d1 ff 48 c7 c6 60 01 b6 89 48 89 ef e8 7e 4e 04 00 <0f> 0b e8 a7 16 d1 ff 48 c7 c6 a0 01 b6 89 4c 89 f7 e8 68 4e 04 00 RSP: 0018:ffffc90002b6f7b8 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: ffff888019619d00 RSI: ffffffff81a68c12 RDI: 0000000000000003 RBP: ffffea0001bdc2c0 R08: 0000000000000029 R09: 00000000ffffffff R10: ffffffff8903e29f R11: 00000000ffffffff R12: 00000000ffffffff R13: 00000000ffffea00 R14: ffffc90002b6fb30 R15: ffffea0001bd8001 FS: 00007faa2aefd700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fff7e663318 CR3: 0000000018c6e000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: <TASK> page_mapcount include/linux/mm.h:837 [inline] smaps_account+0x470/0xb10 fs/proc/task_mmu.c:466 smaps_pte_entry fs/proc/task_mmu.c:538 [inline] smaps_pte_range+0x611/0x1250 fs/proc/task_mmu.c:601 walk_pmd_range mm/pagewalk.c:128 [inline] walk_pud_range mm/pagewalk.c:205 [inline] walk_p4d_range mm/pagewalk.c:240 [inline] walk_pgd_range mm/pagewalk.c:277 [inline] __walk_page_range+0xe23/0x1ea0 mm/pagewalk.c:379 walk_page_vma+0x277/0x350 mm/pagewalk.c:530 smap_gather_stats.part.0+0x148/0x260 fs/proc/task_mmu.c:768 smap_gather_stats fs/proc/task_mmu.c:741 [inline] show_smap+0xc6/0x440 fs/proc/task_mmu.c:822 seq_read_iter+0xbb0/0x1240 fs/seq_file.c:272 seq_read+0x3e0/0x5b0 fs/seq_file.c:162 vfs_read+0x1b5/0x600 fs/read_write.c:479 ksys_read+0x12d/0x250 fs/read_write.c:619 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7faa2af6c969 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 11 15 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007faa2aefd288 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 00007faa2aff4418 RCX: 00007faa2af6c969 RDX: 0000000000002025 RSI: 0000000020000100 RDI: 0000000000000003 RBP: 00007faa2aff4410 R08: 00007faa2aefd700 R09: 0000000000000000 R10: 00007faa2aefd700 R11: 0000000000000246 R12: 00007faa2afc20ac R13: 00007fff7e6632bf R14: 00007faa2aefd400 R15: 0000000000022000 </TASK> Modules linked in: ---[ end trace 24ec93ff95e4ac3d ]--- RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline] RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744 Code: e8 d3 16 d1 ff 48 c7 c6 c0 00 b6 89 48 89 ef e8 94 4e 04 00 0f 0b e8 bd 16 d1 ff 48 c7 c6 60 01 b6 89 48 89 ef e8 7e 4e 04 00 <0f> 0b e8 a7 16 d1 ff 48 c7 c6 a0 01 b6 89 4c 89 f7 e8 68 4e 04 00 RSP: 0018:ffffc90002b6f7b8 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: ffff888019619d00 RSI: ffffffff81a68c12 RDI: 0000000000000003 RBP: ffffea0001bdc2c0 R08: 0000000000000029 R09: 00000000ffffffff R10: ffffffff8903e29f R11: 00000000ffffffff R12: 00000000ffffffff R13: 00000000ffffea00 R14: ffffc90002b6fb30 R15: ffffea0001bd8001 FS: 00007faa2aefd700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fff7e663318 CR3: 0000000018c6e000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 The reproducer was trying to reading /proc/$PID/smaps when calling MADV_FREE at the same time. MADV_FREE may split THPs if it is called for partial THP. It may trigger the below race: CPU A CPU B ----- ----- smaps walk: MADV_FREE: page_mapcount() PageCompound() split_huge_page() page = compound_head(page) PageDoubleMap(page) When calling PageDoubleMap() this page is not a tail page of THP anymore so the BUG is triggered. This could be fixed by elevated refcount of the page before calling mapcount, but it prevents from counting migration entries, and it seems overkilling because the race just could happen when PMD is split so all PTE entries of tail pages are actually migration entries, and smaps_account() does treat migration entries as mapcount == 1 as Kirill pointed out. Add a new parameter for smaps_account() to tell this entry is migration entry then skip calling page_mapcount(). Don't skip getting mapcount for device private entries since they do track references with mapcount. Link: https://lkml.kernel.org/r/20220120202805.3369-1-shy828301@xxxxxxxxx Fixes: b1d4d9e0cbd0 ("proc/smaps: carefully handle migration entries") Signed-off-by: Yang Shi <shy828301@xxxxxxxxx> Reported-by: syzbot+1f52b3a18d5633fa7f82@xxxxxxxxxxxxxxxxxxxxxxxxx Cc: "Kirill A. Shutemov" <kirill.shutemov@xxxxxxxxxxxxxxx> Cc: Jann Horn <jannh@xxxxxxxxxx> Cc: Matthew Wilcox <willy@xxxxxxxxxxxxx> Cc: Alexey Dobriyan <adobriyan@xxxxxxxxx> Cc: <stable@xxxxxxxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- fs/proc/task_mmu.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) --- a/fs/proc/task_mmu.c~fs-proc-task_mmuc-dont-read-mapcount-for-migration-entry +++ a/fs/proc/task_mmu.c @@ -440,7 +440,8 @@ static void smaps_page_accumulate(struct } static void smaps_account(struct mem_size_stats *mss, struct page *page, - bool compound, bool young, bool dirty, bool locked) + bool compound, bool young, bool dirty, bool locked, + bool migration) { int i, nr = compound ? compound_nr(page) : 1; unsigned long size = nr * PAGE_SIZE; @@ -467,8 +468,12 @@ static void smaps_account(struct mem_siz * page_count(page) == 1 guarantees the page is mapped exactly once. * If any subpage of the compound page mapped with PTE it would elevate * page_count(). + * + * Treated regular migration entries as mapcount == 1 without reading + * mapcount since calling page_mapcount() for migration entries is + * racy against THP splitting. */ - if (page_count(page) == 1) { + if ((page_count(page) == 1) || migration) { smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty, locked, true); return; @@ -517,6 +522,7 @@ static void smaps_pte_entry(pte_t *pte, struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; + bool migration = false; if (pte_present(*pte)) { page = vm_normal_page(vma, addr, *pte); @@ -536,8 +542,11 @@ static void smaps_pte_entry(pte_t *pte, } else { mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT; } - } else if (is_pfn_swap_entry(swpent)) + } else if (is_pfn_swap_entry(swpent)) { + if (is_migration_entry(swpent)) + migration = true; page = pfn_swap_entry_to_page(swpent); + } } else { smaps_pte_hole_lookup(addr, walk); return; @@ -546,7 +555,8 @@ static void smaps_pte_entry(pte_t *pte, if (!page) return; - smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked); + smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), + locked, migration); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -557,6 +567,7 @@ static void smaps_pmd_entry(pmd_t *pmd, struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; + bool migration = false; if (pmd_present(*pmd)) { /* FOLL_DUMP will return -EFAULT on huge zero page */ @@ -564,8 +575,10 @@ static void smaps_pmd_entry(pmd_t *pmd, } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) { swp_entry_t entry = pmd_to_swp_entry(*pmd); - if (is_migration_entry(entry)) + if (is_migration_entry(entry)) { + migration = true; page = pfn_swap_entry_to_page(entry); + } } if (IS_ERR_OR_NULL(page)) return; @@ -577,7 +590,9 @@ static void smaps_pmd_entry(pmd_t *pmd, /* pass */; else mss->file_thp += HPAGE_PMD_SIZE; - smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked); + + smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), + locked, migration); } #else static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, _ Patches currently in -mm which might be from shy828301@xxxxxxxxx are fs-proc-task_mmuc-dont-read-mapcount-for-migration-entry.patch