Hi Matthew,

On 27/02/2024 19:23, Matthew Wilcox (Oracle) wrote:
> Turn __dump_page() into a wrapper around __dump_folio(). Snapshot the
> page & folio into a stack variable so we don't hit BUG_ON() if an
> allocation is freed under us and what was a folio pointer becomes a
> pointer to a tail page.

I'm seeing a couple of panics caused by this patch. I already raised the
first one at [1]; it looks like a bug in Ard's patch (for which he now
has a proposed fix) provokes the bug in this one.

[1] https://lore.kernel.org/linux-arm-kernel/fc691e8d-1a50-4be6-a3b2-d60d6f2e2487@xxxxxxx/

The other way to trigger it is to run the mm kselftest:

  gup_test -ct -F 0x1 0 19 0x1000

This calls into the kernel, which deliberately calls dump_page() via
dump_pages_test() in gup_test.c. The panic is as follows (root cause
identified below):

[   22.994800] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000008
[   22.995428] Mem abort info:
[   22.995617]   ESR = 0x0000000096000005
[   22.995867]   EC = 0x25: DABT (current EL), IL = 32 bits
[   22.996215]   SET = 0, FnV = 0
[   22.996419]   EA = 0, S1PTW = 0
[   22.996628]   FSC = 0x05: level 1 translation fault
[   22.996951] Data abort info:
[   22.997145]   ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000
[   22.997541]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
[   22.998025]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
[   22.998438] user pgtable: 4k pages, 39-bit VAs, pgdp=000000019f2d6000
[   22.998937] [0000000000000008] pgd=0000000000000000, p4d=0000000000000000, pud=0000000000000000
[   22.999608] Internal error: Oops: 0000000096000005 [#1] PREEMPT SMP
[   23.000083] Modules linked in:
[   23.000319] CPU: 6 PID: 1222 Comm: gup_test Not tainted 6.8.0-rc6-00915-g7f43e0f76e47 #2
[   23.000883] Hardware name: linux,dummy-virt (DT)
[   23.001209] pstate: 20400005 (nzCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[   23.001621] pc : get_pfnblock_flags_mask+0x3c/0x68
[   23.001929] lr : __dump_page+0x188/0x400
[   23.002168] sp : ffffffc0885ebb40
[   23.002370] x29: ffffffc0885ebb40 x28: 0000000000ffffc0 x27: 0000000000000000
[   23.002839] x26: 0000000000000000 x25: ffffffc0885ebba0 x24: 00000000ffffffff
[   23.003335] x23: ffffffeabbbc1000 x22: 0000000000000030 x21: ffffffeabb9f5e98
[   23.003869] x20: ffffffc0885ebba0 x19: ffffffc0885ebba0 x18: ffffffffffffffff
[   23.004489] x17: 34383833383a6465 x16: 7070616d5f736567 x15: ffffffeabd058782
[   23.004989] x14: 0000000000000000 x13: 3030303730303039 x12: 3138666666666666
[   23.005501] x11: fffffffffffe0000 x10: ffffffeabca9c018 x9 : ffffffeab9f0c798
[   23.005980] x8 : 00000000ffffefff x7 : ffffffeabca9c018 x6 : 0000000000000000
[   23.006448] x5 : 000003fffffffc28 x4 : 0001fffffffe144a x3 : 0000000000000000
[   23.006931] x2 : 0000000000000007 x1 : ffffffff0a257aee x0 : 00000000001fffff
[   23.007436] Call trace:
[   23.007623]  get_pfnblock_flags_mask+0x3c/0x68
[   23.007928]  dump_page+0x2c/0x70
[   23.008156]  gup_test_ioctl+0xb34/0xc40
[   23.008416]  __arm64_sys_ioctl+0xb0/0x100
[   23.008694]  invoke_syscall+0x50/0x128
[   23.008944]  el0_svc_common.constprop.0+0x48/0xf8
[   23.009259]  do_el0_svc+0x28/0x40
[   23.009499]  el0_svc+0x34/0xb8
[   23.009720]  el0t_64_sync_handler+0x13c/0x158
[   23.010029]  el0t_64_sync+0x190/0x198
[   23.010293] Code: d37b1884 f100007f 8b040064 9a831083 (f9400460)
[   23.010714] ---[ end trace 0000000000000000 ]---

>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@xxxxxxxxxxxxx>
> ---
>  mm/debug.c | 120 +++++++++++++++++++++++++++++------------------------
>  1 file changed, 66 insertions(+), 54 deletions(-)
>
> diff --git a/mm/debug.c b/mm/debug.c
> index ee533a5ceb79..96555fc78f1a 100644
> --- a/mm/debug.c
> +++ b/mm/debug.c
> @@ -51,84 +51,96 @@ const struct trace_print_flags vmaflag_names[] = {
>  	{0, NULL}
>  };
>
> -static void __dump_page(struct page *page)
> +static void __dump_folio(struct folio *folio, struct page *page,
> +		unsigned long pfn, unsigned long idx)
>  {
> -	struct folio *folio = page_folio(page);
> -	struct page *head = &folio->page;
> -	struct address_space *mapping;
> -	bool compound = PageCompound(page);
> -	/*
> -	 * Accessing the pageblock without the zone lock. It could change to
> -	 * "isolate" again in the meantime, but since we are just dumping the
> -	 * state for debugging, it should be fine to accept a bit of
> -	 * inaccuracy here due to racing.
> -	 */
> -	bool page_cma = is_migrate_cma_page(page);
> -	int mapcount;
> +	struct address_space *mapping = folio_mapping(folio);
> +	bool page_cma;
> +	int mapcount = 0;
>  	char *type = "";
>
> -	if (page < head || (page >= head + MAX_ORDER_NR_PAGES)) {
> -		/*
> -		 * Corrupt page, so we cannot call page_mapping. Instead, do a
> -		 * safe subset of the steps that page_mapping() does. Caution:
> -		 * this will be misleading for tail pages, PageSwapCache pages,
> -		 * and potentially other situations. (See the page_mapping()
> -		 * implementation for what's missing here.)
> -		 */
> -		unsigned long tmp = (unsigned long)page->mapping;
> -
> -		if (tmp & PAGE_MAPPING_ANON)
> -			mapping = NULL;
> -		else
> -			mapping = (void *)(tmp & ~PAGE_MAPPING_FLAGS);
> -		head = page;
> -		folio = (struct folio *)page;
> -		compound = false;
> -	} else {
> -		mapping = page_mapping(page);
> -	}
> -
>  	/*
> -	 * Avoid VM_BUG_ON() in page_mapcount().
> -	 * page->_mapcount space in struct page is used by sl[aou]b pages to
> -	 * encode own info.
> +	 * page->_mapcount space in struct page is used by slab pages to
> +	 * encode own info, and we must avoid calling page_folio() again.
>  	 */
> -	mapcount = PageSlab(head) ? 0 : page_mapcount(page);
> -
> -	pr_warn("page:%p refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n",
> -			page, page_ref_count(head), mapcount, mapping,
> -			page_to_pgoff(page), page_to_pfn(page));
> -	if (compound) {
> -		pr_warn("head:%p order:%u entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n",
> -				head, compound_order(head),
> +	if (!folio_test_slab(folio)) {
> +		mapcount = atomic_read(&page->_mapcount) + 1;
> +		if (folio_test_large(folio))
> +			mapcount += folio_entire_mapcount(folio);
> +	}
> +
> +	pr_warn("page: refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n",
> +			folio_ref_count(folio), mapcount, mapping,
> +			folio->index + idx, pfn);
> +	if (folio_test_large(folio)) {
> +		pr_warn("head: order:%u entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n",
> +				folio_order(folio),
>  				folio_entire_mapcount(folio),
>  				folio_nr_pages_mapped(folio),
>  				atomic_read(&folio->_pincount));
>  	}
>
>  #ifdef CONFIG_MEMCG
> -	if (head->memcg_data)
> -		pr_warn("memcg:%lx\n", head->memcg_data);
> +	if (folio->memcg_data)
> +		pr_warn("memcg:%lx\n", folio->memcg_data);
>  #endif
> -	if (PageKsm(page))
> +	if (folio_test_ksm(folio))
>  		type = "ksm ";
> -	else if (PageAnon(page))
> +	else if (folio_test_anon(folio))
>  		type = "anon ";
>  	else if (mapping)
>  		dump_mapping(mapping);
>  	BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
>
> -	pr_warn("%sflags: %pGp%s\n", type, &head->flags,
> +	/*
> +	 * Accessing the pageblock without the zone lock. It could change to
> +	 * "isolate" again in the meantime, but since we are just dumping the
> +	 * state for debugging, it should be fine to accept a bit of
> +	 * inaccuracy here due to racing.
> +	 */
> +	page_cma = is_migrate_cma_page(page);

The problem is here: is_migrate_cma_page() is a macro that resolves to
this:

  page_cma = get_pfnblock_flags_mask(page, page_to_pfn(page),
                                     MIGRATETYPE_MASK) == MIGRATE_CMA;

And since page is on the stack, page_to_pfn() gives a very wrong answer
(a minimal sketch of why is at the end of this mail). I confirmed that
the problem goes away for both cases above when changing the line to:

  page_cma = get_pfnblock_flags_mask(page, pfn,
                                     MIGRATETYPE_MASK) == MIGRATE_CMA;

Thanks,
Ryan

> +	pr_warn("%sflags: %pGp%s\n", type, &folio->flags,
>  		page_cma ? " CMA" : "");
> -	pr_warn("page_type: %pGt\n", &head->page_type);
> +	pr_warn("page_type: %pGt\n", &folio->page.page_type);
>
>  	print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32,
>  			sizeof(unsigned long), page,
>  			sizeof(struct page), false);
> -	if (head != page)
> +	if (folio_test_large(folio))
>  		print_hex_dump(KERN_WARNING, "head: ", DUMP_PREFIX_NONE, 32,
> -				sizeof(unsigned long), head,
> -				sizeof(struct page), false);
> +				sizeof(unsigned long), folio,
> +				2 * sizeof(struct page), false);
> +}
> +
> +static void __dump_page(const struct page *page)
> +{
> +	struct folio *foliop, folio;
> +	struct page precise;
> +	unsigned long pfn = page_to_pfn(page);
> +	unsigned long idx, nr_pages = 1;
> +	int loops = 5;
> +
> +again:
> +	memcpy(&precise, page, sizeof(*page));
> +	foliop = page_folio(&precise);
> +	idx = folio_page_idx(foliop, page);
> +	if (idx != 0) {
> +		if (idx < (1UL << PUD_ORDER)) {
> +			memcpy(&folio, foliop, 2 * sizeof(struct page));
> +			nr_pages = folio_nr_pages(&folio);
> +		}
> +
> +		if (idx > nr_pages) {
> +			if (loops-- > 0)
> +				goto again;
> +			printk("page does not match folio\n");
> +			precise.compound_head &= ~1UL;
> +			foliop = (struct folio *)&precise;
> +			idx = 0;
> +		}
> +	}
> +
> +	__dump_folio(foliop, &precise, pfn, idx);
>  }
>
>  void dump_page(struct page *page, const char *reason)
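
For anyone wanting to see the failure mode concretely, here is a minimal
userspace sketch (explicitly not kernel code) of why page_to_pfn() on the
stack copy goes wrong. It assumes the common model where the pfn is derived
purely from a struct page's offset within a contiguous memmap array; the
memmap[] array, its size and the example pfn of 7 are all invented for
illustration:

#include <stdio.h>
#include <string.h>

struct page { unsigned long flags; };

/* Stands in for the kernel's contiguous array of struct page. */
static struct page memmap[16];

/* Models the vmemmap flavour of: #define page_to_pfn(p) ((p) - memmap) */
static long page_to_pfn(const struct page *p)
{
	return p - memmap;
}

int main(void)
{
	struct page *real = &memmap[7];
	struct page precise;

	/* What __dump_page() does: snapshot the page onto the stack. */
	memcpy(&precise, real, sizeof(precise));

	printf("real pfn: %ld\n", page_to_pfn(real));	/* prints 7 */

	/*
	 * &precise is a stack address nowhere near memmap[], so the
	 * subtraction yields nonsense. In the kernel,
	 * get_pfnblock_flags_mask() goes on to index pageblock state
	 * with that bogus pfn, hence the fault in the trace above.
	 */
	printf("copy pfn: %ld\n", page_to_pfn(&precise));
	return 0;
}

The arithmetic is only meaningful for pages that actually live in the
memmap, which is why passing the pfn that __dump_page() already computed
from the original page pointer (the one-line change above) makes the
problem go away.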