Hi Ryan, On Thu, Jul 20, 2023 at 12:29:55PM +0100, Ryan Roberts wrote: > This allows batching the rmap removal with folio_remove_rmap_range(), > which means we avoid spuriously adding a partially unmapped folio to the > deferred split queue in the common case, which reduces split queue lock > contention. > > Previously each page was removed from the rmap individually with > page_remove_rmap(). If the first page belonged to a large folio, this > would cause page_remove_rmap() to conclude that the folio was now > partially mapped and add the folio to the deferred split queue. But > subsequent calls would cause the folio to become fully unmapped, meaning > there is no value to adding it to the split queue. > > Signed-off-by: Ryan Roberts <ryan.roberts@xxxxxxx> > --- > mm/memory.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 120 insertions(+) > > diff --git a/mm/memory.c b/mm/memory.c > index 01f39e8144ef..189b1cfd823d 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -1391,6 +1391,94 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, > pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); > } > > +static inline unsigned long page_cont_mapped_vaddr(struct page *page, > + struct page *anchor, unsigned long anchor_vaddr) > +{ > + unsigned long offset; > + unsigned long vaddr; > + > + offset = (page_to_pfn(page) - page_to_pfn(anchor)) << PAGE_SHIFT; > + vaddr = anchor_vaddr + offset; > + > + if (anchor > page) { > + if (vaddr > anchor_vaddr) > + return 0; > + } else { > + if (vaddr < anchor_vaddr) > + return ULONG_MAX; > + } > + > + return vaddr; > +} > + > +static int folio_nr_pages_cont_mapped(struct folio *folio, > + struct page *page, pte_t *pte, > + unsigned long addr, unsigned long end) > +{ > + pte_t ptent; > + int floops; > + int i; > + unsigned long pfn; > + struct page *folio_end; > + > + if (!folio_test_large(folio)) > + return 1; > + > + folio_end = &folio->page + folio_nr_pages(folio); > + end = 
min(page_cont_mapped_vaddr(folio_end, page, addr), end); > + floops = (end - addr) >> PAGE_SHIFT; > + pfn = page_to_pfn(page); > + pfn++; > + pte++; > + > + for (i = 1; i < floops; i++) { > + ptent = ptep_get(pte); > + > + if (!pte_present(ptent) || pte_pfn(ptent) != pfn) > + break; > + > + pfn++; > + pte++; > + } > + > + return i; > +} > + > +static unsigned long try_zap_anon_pte_range(struct mmu_gather *tlb, > + struct vm_area_struct *vma, > + struct folio *folio, > + struct page *page, pte_t *pte, > + unsigned long addr, int nr_pages, > + struct zap_details *details) > +{ > + struct mm_struct *mm = tlb->mm; > + pte_t ptent; > + bool full; > + int i; > + > + for (i = 0; i < nr_pages;) { > + ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); > + tlb_remove_tlb_entry(tlb, pte, addr); > + zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); > + full = __tlb_remove_page(tlb, page, 0); > + > + if (unlikely(page_mapcount(page) < 1)) > + print_bad_pte(vma, addr, ptent, page); > + > + i++; > + page++; > + pte++; > + addr += PAGE_SIZE; > + > + if (unlikely(full)) > + break; > + } > + > + folio_remove_rmap_range(folio, page - i, i, vma); > + > + return i; > +} > + > static unsigned long zap_pte_range(struct mmu_gather *tlb, > struct vm_area_struct *vma, pmd_t *pmd, > unsigned long addr, unsigned long end, > @@ -1428,6 +1516,38 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, > page = vm_normal_page(vma, addr, ptent); > if (unlikely(!should_zap_page(details, page))) > continue; > + > + /* > + * Batch zap large anonymous folio mappings. This allows > + * batching the rmap removal, which means we avoid > + * spuriously adding a partially unmapped folio to the > + * deferred split queue in the common case, which > + * reduces split queue lock contention. 
> + */ > + if (page && PageAnon(page)) { > + struct folio *folio = page_folio(page); > + int nr_pages_req, nr_pages; > + > + nr_pages_req = folio_nr_pages_cont_mapped( > + folio, page, pte, addr, end); > + > + nr_pages = try_zap_anon_pte_range(tlb, vma, > + folio, page, pte, addr, > + nr_pages_req, details); > + > + rss[mm_counter(page)] -= nr_pages; > + nr_pages--; > + pte += nr_pages; > + addr += nr_pages << PAGE_SHIFT; > + > + if (unlikely(nr_pages < nr_pages_req)) { > + force_flush = 1; > + addr += PAGE_SIZE; > + break; > + } > + continue; > + } > + > ptent = ptep_get_and_clear_full(mm, addr, pte, > tlb->fullmm); > tlb_remove_tlb_entry(tlb, pte, addr); > -- > 2.25.1 > After this change in -next as commit 904d9713b3b0 ("mm: batch-zap large anonymous folio PTE mappings"), I see the following splats several times when booting Debian's s390x configuration (which I have mirrored at [1]) in QEMU (bisect log below): $ qemu-system-s390x \ -display none \ -nodefaults \ -M s390-ccw-virtio \ -kernel arch/s390/boot/bzImage \ -initrd rootfs.cpio \ -m 512m \ -serial mon:stdio KASLR disabled: CPU has no PRNG KASLR disabled: CPU has no PRNG [ 2.502282] Linux version 6.5.0-rc3+ (nathan@dev-arch.thelio-3990X) (s390-linux-gcc (GCC) 13.1.0, GNU ld (GNU Binutils) 2.40) #1 SMP Wed Jul 26 09:14:20 MST 2023 ... 
[ 3.406011] Freeing initrd memory: 7004K [ 3.492739] BUG: Bad page state in process modprobe pfn:01b18 [ 3.492909] page:00000000233d9f2f refcount:0 mapcount:1 mapping:0000000000000000 index:0xdb pfn:0x1b18 [ 3.492998] flags: 0xa0004(uptodate|mappedtodisk|swapbacked|zone=0) [ 3.493195] page_type: 0x0() [ 3.493457] raw: 00000000000a0004 0000000000000100 0000000000000122 0000000000000000 [ 3.493492] raw: 00000000000000db 0000000000000000 0000000000000000 0000000000000000 [ 3.493525] page dumped because: nonzero mapcount [ 3.493549] Modules linked in: [ 3.493719] CPU: 0 PID: 38 Comm: modprobe Not tainted 6.5.0-rc3+ #1 [ 3.493814] Hardware name: QEMU 8561 QEMU (KVM/Linux) [ 3.493892] Call Trace: [ 3.494117] [<0000000000add35a>] dump_stack_lvl+0x62/0x88 [ 3.494333] [<00000000003d565a>] bad_page+0x8a/0x130 [ 3.494355] [<00000000003d6728>] free_unref_page_prepare+0x268/0x3d8 [ 3.494375] [<00000000003d9408>] free_unref_page+0x48/0x140 [ 3.494394] [<00000000003ad99c>] unmap_page_range+0x924/0x1388 [ 3.494412] [<00000000003ae54c>] unmap_vmas+0x14c/0x200 [ 3.494429] [<00000000003be2f2>] exit_mmap+0xba/0x3a0 [ 3.494447] [<0000000000147000>] __mmput+0x50/0x180 [ 3.494466] [<0000000000152a00>] do_exit+0x320/0xb40 [ 3.494484] [<0000000000153450>] do_group_exit+0x40/0xb8 [ 3.494502] [<00000000001534f6>] __s390x_sys_exit_group+0x2e/0x30 [ 3.494520] [<0000000000b05080>] __do_syscall+0x1e8/0x210 [ 3.494539] [<0000000000b15970>] system_call+0x70/0x98 [ 3.494663] Disabling lock debugging due to kernel taint [ 3.494809] BUG: Bad page map in process modprobe pte:01b1831f pmd:1fff9000 [ 3.494833] page:00000000233d9f2f refcount:0 mapcount:0 mapping:0000000000000000 index:0xdb pfn:0x1b18 [ 3.494852] flags: 0xa0004(uptodate|mappedtodisk|swapbacked|zone=0) [ 3.494866] page_type: 0xffffffff() [ 3.494882] raw: 00000000000a0004 0000000000000100 0000000000000122 0000000000000000 [ 3.494898] raw: 00000000000000db 0000000000000000 ffffffff00000000 0000000000000000 [ 3.494908] page dumped because: 
bad pte [ 3.494923] addr:000002aa1d75c000 vm_flags:08100071 anon_vma:000000001fffc340 mapping:000000000286d6b8 index:db [ 3.494983] file:busybox fault:shmem_fault mmap:shmem_mmap read_folio:0x0 [ 3.495247] CPU: 0 PID: 38 Comm: modprobe Tainted: G B 6.5.0-rc3+ #1 [ 3.495267] Hardware name: QEMU 8561 QEMU (KVM/Linux) [ 3.495277] Call Trace: [ 3.495285] [<0000000000add35a>] dump_stack_lvl+0x62/0x88 [ 3.495307] [<00000000003ab30e>] print_bad_pte+0x176/0x2c8 [ 3.495324] [<00000000003ae098>] unmap_page_range+0x1020/0x1388 [ 3.495341] [<00000000003ae54c>] unmap_vmas+0x14c/0x200 [ 3.495357] [<00000000003be2f2>] exit_mmap+0xba/0x3a0 [ 3.495375] [<0000000000147000>] __mmput+0x50/0x180 [ 3.495394] [<0000000000152a00>] do_exit+0x320/0xb40 [ 3.495411] [<0000000000153450>] do_group_exit+0x40/0xb8 [ 3.495429] [<00000000001534f6>] __s390x_sys_exit_group+0x2e/0x30 [ 3.495447] [<0000000000b05080>] __do_syscall+0x1e8/0x210 [ 3.495465] [<0000000000b15970>] system_call+0x70/0x98 ... The rootfs is available at [2] if it is relevant. I am happy to provide any additional information or test patches as necessary. 
Cheers, Nathan [1]: https://github.com/nathanchance/llvm-kernel-testing/blob/79aa4ab2edc595979366c427cb49f477c7f31c68/configs/debian/s390x.config [2]: https://github.com/ClangBuiltLinux/boot-utils/releases/download/20230707-182910/s390-rootfs.cpio.zst # bad: [0ba5d07205771c50789fd9063950aa75e7f1183f] Add linux-next specific files for 20230726 # good: [18b44bc5a67275641fb26f2c54ba7eef80ac5950] ovl: Always reevaluate the file signature for IMA git bisect start '0ba5d07205771c50789fd9063950aa75e7f1183f' '18b44bc5a67275641fb26f2c54ba7eef80ac5950' # bad: [8fe1b33ece8f8fe1377082e839817886cb8c0f81] Merge branch 'main' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git git bisect bad 8fe1b33ece8f8fe1377082e839817886cb8c0f81 # bad: [932bd67958459da3dc755b5bea7758e9d951dee5] Merge branch 'ti-next' of git://git.kernel.org/pub/scm/linux/kernel/git/ti/linux.git git bisect bad 932bd67958459da3dc755b5bea7758e9d951dee5 # bad: [a4abec0a3653fb9dfb3ea6cea2ad1d36f507ca97] Merge branch 'perf-tools-next' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git git bisect bad a4abec0a3653fb9dfb3ea6cea2ad1d36f507ca97 # bad: [5a52022bde252d090e051077af297dcfeff9fd0d] powerpc/book3s64/radix: add debug message to give more details of vmemmap allocation git bisect bad 5a52022bde252d090e051077af297dcfeff9fd0d # good: [671115657ee2403d18cb849061d7245687d9fdc5] mm/pgtable: notes on pte_offset_map[_lock]() git bisect good 671115657ee2403d18cb849061d7245687d9fdc5 # good: [26c3a4fe0eb027ff00ad42168c8732db0c0b40d7] arm64/smmu: use TLBI ASID when invalidating entire range git bisect good 26c3a4fe0eb027ff00ad42168c8732db0c0b40d7 # bad: [8585d0b53780f11cad8dad37997369949e3d5043] mm: memcg: use rstat for non-hierarchical stats git bisect bad 8585d0b53780f11cad8dad37997369949e3d5043 # bad: [9abfe35eb187c3f79af5bb07c2f9815a480c4965] mm/compaction: correct comment of candidate pfn in fast_isolate_freepages git bisect bad 9abfe35eb187c3f79af5bb07c2f9815a480c4965 # bad: 
[208f64c37a4e22b25b8037776c5713545eaf54fa] selftests: line buffer test program's stdout git bisect bad 208f64c37a4e22b25b8037776c5713545eaf54fa # good: [08356142587c28b86817646ff318317b5237fdeb] mmu_notifiers: rename invalidate_range notifier git bisect good 08356142587c28b86817646ff318317b5237fdeb # good: [652555287069f2c0bbbfaf262eb41638f5c87550] mm: allow deferred splitting of arbitrary large anon folios git bisect good 652555287069f2c0bbbfaf262eb41638f5c87550 # bad: [904d9713b3b0e64329b2f6d159966b5c737444ff] mm: batch-zap large anonymous folio PTE mappings git bisect bad 904d9713b3b0e64329b2f6d159966b5c737444ff # good: [9a7c14665a566fbc1adc2c35982898abc1546525] mm: implement folio_remove_rmap_range() git bisect good 9a7c14665a566fbc1adc2c35982898abc1546525 # first bad commit: [904d9713b3b0e64329b2f6d159966b5c737444ff] mm: batch-zap large anonymous folio PTE mappings