On 12/04/2023 06:00, Aneesh Kumar K.V wrote: > commit 4917f55b4ef9 ("mm/sparse-vmemmap: improve memory savings for compound > devmaps") added support for using optimized vmmemap for devdax devices. But how > vmemmap mappings are created are architecture specific. For example, powerpc > with hash translation doesn't have vmemmap mappings in init_mm page table > instead they are bolted table entries in the hardware page table > > vmemmap_populate_compound_pages() used by vmemmap optimization code is not aware > of these architecture-specific mapping. Hence allow architecture to opt for this > feature. Architectures supporting HUGETLB_PAGE_OPTIMIZE_VMEMMAP option are > selected when supporting this feature. > > On ppc64 (pmem) where this isn't supported, it fixes below crash:' > > BUG: Unable to handle kernel data access on write at 0xc00c000100400038 > Faulting instruction address: 0xc000000001269d90 > Oops: Kernel access of bad area, sig: 11 [#1] > LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries > Modules linked in: > CPU: 7 PID: 1 Comm: swapper/0 Not tainted 6.3.0-rc5-150500.34-default+ #2 5c90a668b6bbd142599890245c2fb5de19d7d28a > Hardware name: IBM,9009-42G POWER9 (raw) 0x4e0202 0xf000005 of:IBM,FW950.40 (VL950_099) hv:phyp pSeries > NIP: c000000001269d90 LR: c0000000004c57d4 CTR: 0000000000000000 > REGS: c000000003632c30 TRAP: 0300 Not tainted (6.3.0-rc5-150500.34-default+) > MSR: 8000000000009033 <SF,EE,ME,IR,DR,RI,LE> CR: 24842228 XER: 00000000 > CFAR: c0000000004c57d0 DAR: c00c000100400038 DSISR: 42000000 IRQMASK: 0 > .... > NIP [c000000001269d90] __init_single_page.isra.74+0x14/0x4c > LR [c0000000004c57d4] __init_zone_device_page+0x44/0xd0 > Call Trace: > [c000000003632ed0] [c000000003632f60] 0xc000000003632f60 (unreliable) > [c000000003632f10] [c0000000004c5ca0] memmap_init_zone_device+0x170/0x250 > [c000000003632fe0] [c0000000005575f8] memremap_pages+0x2c8/0x7f0 > [c0000000036330c0] [c000000000557b5c] devm_memremap_pages+0x3c/0xa0 > [c000000003633100] [c000000000d458a8] dev_dax_probe+0x108/0x3e0 > [c0000000036331a0] [c000000000d41430] dax_bus_probe+0xb0/0x140 > [c0000000036331d0] [c000000000cef27c] really_probe+0x19c/0x520 > [c000000003633260] [c000000000cef6b4] __driver_probe_device+0xb4/0x230 > [c0000000036332e0] [c000000000cef888] driver_probe_device+0x58/0x120 > [c000000003633320] [c000000000cefa6c] __device_attach_driver+0x11c/0x1e0 > [c0000000036333a0] [c000000000cebc58] bus_for_each_drv+0xa8/0x130 > [c000000003633400] [c000000000ceefcc] __device_attach+0x15c/0x250 > [c0000000036334a0] [c000000000ced458] bus_probe_device+0x108/0x110 > [c0000000036334f0] [c000000000ce92dc] device_add+0x7fc/0xa10 > [c0000000036335b0] [c000000000d447c8] devm_create_dev_dax+0x1d8/0x530 > [c000000003633640] [c000000000d46b60] __dax_pmem_probe+0x200/0x270 > [c0000000036337b0] [c000000000d46bf0] dax_pmem_probe+0x20/0x70 > [c0000000036337d0] [c000000000d2279c] nvdimm_bus_probe+0xac/0x2b0 > [c000000003633860] [c000000000cef27c] really_probe+0x19c/0x520 > [c0000000036338f0] [c000000000cef6b4] __driver_probe_device+0xb4/0x230 > [c000000003633970] [c000000000cef888] driver_probe_device+0x58/0x120 > [c0000000036339b0] [c000000000cefd08] __driver_attach+0x1d8/0x240 > [c000000003633a30] [c000000000cebb04] bus_for_each_dev+0xb4/0x130 > [c000000003633a90] [c000000000cee564] driver_attach+0x34/0x50 > [c000000003633ab0] [c000000000ced878] bus_add_driver+0x218/0x300 > [c000000003633b40] [c000000000cf1144] driver_register+0xa4/0x1b0 > [c000000003633bb0] [c000000000d21a0c] __nd_driver_register+0x5c/0x100 > [c000000003633c10] [c00000000206a2e8] dax_pmem_init+0x34/0x48 > [c000000003633c30] [c0000000000132d0] do_one_initcall+0x60/0x320 > [c000000003633d00] [c0000000020051b0] kernel_init_freeable+0x360/0x400 > [c000000003633de0] [c000000000013764] kernel_init+0x34/0x1d0 > [c000000003633e50] [c00000000000de14] ret_from_kernel_thread+0x5c/0x64 > > Fixes: 4917f55b4ef9 ("mm/sparse-vmemmap: improve memory savings for compound devmaps") > Cc: Joao Martins <joao.m.martins@xxxxxxxxxx> > Cc: Muchun Song <songmuchun@xxxxxxxxxxxxx> > Cc: Dan Williams <dan.j.williams@xxxxxxxxx> > Cc: Mike Kravetz <mike.kravetz@xxxxxxxxxx> > Reported-by: Tarun Sahu <tsahu@xxxxxxxxxxxxx> > Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxx> Reviewed-by: Joao Martins <joao.m.martins@xxxxxxxxxx> > --- > include/linux/mm.h | 16 ++++++++++++++++ > mm/page_alloc.c | 11 +++++++---- > mm/sparse-vmemmap.c | 3 +-- > 3 files changed, 24 insertions(+), 6 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 1f79667824eb..ced82b9c18e5 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -3425,6 +3425,22 @@ void vmemmap_populate_print_last(void); > void vmemmap_free(unsigned long start, unsigned long end, > struct vmem_altmap *altmap); > #endif > + > +#ifdef CONFIG_ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP > +static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap, > + struct dev_pagemap *pgmap) > +{ > + return is_power_of_2(sizeof(struct page)) && > + pgmap && (pgmap_vmemmap_nr(pgmap) > 1) && !altmap; > +} > +#else > +static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap, > + struct dev_pagemap *pgmap) > +{ > + return false; > +} > +#endif > + > void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, > unsigned long nr_pages); > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > index 7136c36c5d01..cf9f9ddfbd19 100644 > --- a/mm/page_alloc.c > +++ b/mm/page_alloc.c > @@ -6889,10 +6889,13 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, > * of an altmap. See vmemmap_populate_compound_pages(). > */ > static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap, > - unsigned long nr_pages) > + struct dev_pagemap *pgmap) > { > - return is_power_of_2(sizeof(struct page)) && > - !altmap ? 2 * (PAGE_SIZE / sizeof(struct page)) : nr_pages; > + > + if (!vmemmap_can_optimize(altmap, pgmap)) > + return pgmap_vmemmap_nr(pgmap); > + > + return 2 * (PAGE_SIZE / sizeof(struct page)); > } > > static void __ref memmap_init_compound(struct page *head, > @@ -6957,7 +6960,7 @@ void __ref memmap_init_zone_device(struct zone *zone, > continue; > > memmap_init_compound(page, pfn, zone_idx, nid, pgmap, > - compound_nr_pages(altmap, pfns_per_compound)); > + compound_nr_pages(altmap, pgmap)); > } > > pr_info("%s initialised %lu pages in %ums\n", __func__, > diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c > index c5398a5960d0..10d73a0dfcec 100644 > --- a/mm/sparse-vmemmap.c > +++ b/mm/sparse-vmemmap.c > @@ -458,8 +458,7 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn, > !IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION))) > return NULL; > > - if (is_power_of_2(sizeof(struct page)) && > - pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap) > + if (vmemmap_can_optimize(altmap, pgmap)) > r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap); > else > r = vmemmap_populate(start, end, nid, altmap);