> Was lucky to grab that system again. Compiled a custom 6.0 kernel, whereby I
> printk all vmap allocation errors, including the range, similarly to what you
> suggested above (but printk only on the failure path).
>
> So these are the failing allocations:
>
> # dmesg | grep " -> alloc"
> [ 168.862511] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 168.863020] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 168.863841] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 168.864562] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 168.864646] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 168.865688] -> alloc 319488 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 168.865718] -> alloc 319488 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 168.866098] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 168.866551] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 168.866752] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 168.867147] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 168.867210] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 168.867312] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 168.867650] -> alloc 319488 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 168.867767] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 168.867815] -> alloc 319488 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 168.867815] -> alloc 319488 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 168.868059] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 168.868463] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 168.868822] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 168.868919] -> alloc 319488 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 168.869843] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 168.869854] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 168.870174] -> alloc 319488 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 168.870611] -> alloc 319488 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 168.870806] -> alloc 319488 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 168.870982] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 168.879000] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 169.449101] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 169.449834] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 169.450667] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 169.451539] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 169.452326] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 169.453239] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 169.454052] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 169.454697] -> alloc 319488 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 169.454811] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 169.455575] -> alloc 319488 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 169.455754] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 169.461450] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 169.805223] -> alloc 319488 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 169.805507] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 169.929577] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 169.930389] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 169.931244] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 169.932035] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 169.932796] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 169.933592] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 169.934470] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 169.935344] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 169.970641] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 170.191600] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 170.191875] -> alloc 40960 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 170.241901] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 170.242708] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 170.243465] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 170.244211] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 170.245060] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 170.245868] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 170.246433] -> alloc 40960 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 170.246657] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 170.247451] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 170.248226] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 170.248902] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 170.249704] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 170.250497] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 170.251244] -> alloc 319488 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 170.252076] -> alloc 319488 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 170.587168] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 170.598995] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 171.865721] -> alloc 2506752 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
> [ 172.138557] -> alloc 917504 size, align: 4096, vstart: 18446744072639352832, vend: 18446744073692774400
>
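A side note on reading the log: the debug printk emits vstart/vend in
decimal. A throwaway userspace helper to convert them (values hard-coded
from the log above, nothing kernel-specific):

<snip>
#include <stdio.h>

int main(void)
{
	/* vstart/vend copied from the failing allocations above. */
	unsigned long long vstart = 18446744072639352832ULL;
	unsigned long long vend = 18446744073692774400ULL;

	printf("vstart: 0x%llx\n", vstart);	/* 0xffffffffc0361000 */
	printf("vend:   0x%llx\n", vend);	/* 0xffffffffff000000 */
	printf("range:  %llu MiB\n", (vend - vstart) >> 20);	/* ~1004 MiB */
	return 0;
}
<snip>

vend is MODULES_END on x86_64, and vstart sits a few MiB above MODULES_VADDR
(0xffffffffc0000000), presumably the randomized module load offset. So every
failing request targets the ~1 GiB module mapping space.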
OK. It is related to a module vmap space allocation when a module is
inserted. I wonder why it requires 2.5MB for a module? That seems like a lot
to me.

> Really looks like only module vmap space. ~ 1 GiB of vmap module space ...
>
If an allocation request for a module is 2.5MB, we can load ~400 modules
within 1GB of address space. What does "lsmod | wc -l" report? How many
modules does your system have?

> I did try:
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index dd6cdb201195..199154a2228a 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -72,6 +72,8 @@ early_param("nohugevmalloc", set_nohugevmalloc);
>  static const bool vmap_allow_huge = false;
>  #endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
>  
> +static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
> +
>  bool is_vmalloc_addr(const void *x)
>  {
>  	unsigned long addr = (unsigned long)kasan_reset_tag(x);
> @@ -1574,7 +1576,6 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
>  	struct vmap_area *va;
>  	unsigned long freed;
>  	unsigned long addr;
> -	int purged = 0;
>  	int ret;
>  
>  	BUG_ON(!size);
> @@ -1631,23 +1632,22 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
>  	return va;
>  
>  overflow:
> -	if (!purged) {
> +	if (atomic_long_read(&vmap_lazy_nr)) {
>  		purge_vmap_area_lazy();
> -		purged = 1;
>  		goto retry;
>  	}
>  
>  	freed = 0;
>  	blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
> -	if (freed > 0) {
> -		purged = 0;
> +	if (freed > 0)
>  		goto retry;
> -	}
>  
> -	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
> +	if (!(gfp_mask & __GFP_NOWARN)) {
>  		pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
>  			size);
> +		printk("-> alloc %lu size, align: %lu, vstart: %lu, vend: %lu\n", size, align, vstart, vend);
> +	}
>  
>  	kmem_cache_free(vmap_area_cachep, va);
>  	return ERR_PTR(-EBUSY);
> @@ -1690,8 +1690,6 @@ static unsigned long lazy_max_pages(void)
>  	return log * (32UL * 1024 * 1024 / PAGE_SIZE);
>  }
>  
> -static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
> -
>
> But that didn't help at all. That system is crazy:
>
If an allocation fails, the next step is to drain the outstanding vmap
areas. A caller does that from its own context, retries one more time, and
only after that is a failure message printed.
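For reference, this is the stock v6.0 overflow path that the above
describes, reassembled from the removed and context lines of the hunk
earlier (comments mine):

<snip v6.0>
overflow:
	if (!purged) {
		/* First drain the lazily-freed areas, then retry once. */
		purge_vmap_area_lazy();
		purged = 1;
		goto retry;
	}

	/* Next ask the vmap notifier chain to free something, and retry again. */
	freed = 0;
	blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
	if (freed > 0) {
		purged = 0;
		goto retry;
	}

	/* Only after both retries have failed is the warning printed. */
	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
		pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
			size);
<snip>

So the proposed change largely mirrors what the stock code already does,
which would explain why it made no difference.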
> # lspci | wc -l
> 1117
>
So you probably need a lot of modules in order to make your HW fully
functional :)

> What I find interesting is that we have these recurring allocations of
> similar sizes failing. I wonder if user space is capable of loading the
> same kernel module concurrently to trigger a massive amount of
> allocations, and module loading code only figures out later that it has
> already been loaded and backs off.
>
If there is a request to allocate memory, it has to succeed unless there is
an error such as no space or no memory.

> My best guess would be that module loading is serialized completely, but
> for some reason, something seems to go wrong with a lot of concurrency ...
>
lazy_max_pages() depends on the number of online CPUs. Probably something
related...

I wrote a small patch to dump the modules address space when a failure
occurs:

<snip v6.0>
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 83b54beb12fa..88d323310df5 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1580,6 +1580,37 @@ preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
 		kmem_cache_free(vmap_area_cachep, va);
 }
 
+static void
+dump_modules_free_space(unsigned long vstart, unsigned long vend)
+{
+	unsigned long va_start, va_end;
+	unsigned int total = 0;
+	struct vmap_area *va;
+
+	if (vend != MODULES_END)
+		return;
+
+	trace_printk("--- Dump a modules address space: 0x%lx - 0x%lx\n", vstart, vend);
+
+	spin_lock(&free_vmap_area_lock);
+	list_for_each_entry(va, &free_vmap_area_list, list) {
+		va_start = (va->va_start > vstart) ? va->va_start : vstart;
+		va_end = (va->va_end < vend) ? va->va_end : vend;
+
+		if (va_start >= va_end)
+			continue;
+
+		if (va_start >= vstart && va_end <= vend) {
+			trace_printk("  va_free: 0x%lx - 0x%lx size=%lu\n",
+				va_start, va_end, va_end - va_start);
+			total += (va_end - va_start);
+		}
+	}
+
+	spin_unlock(&free_vmap_area_lock);
+	trace_printk("--- Total free: %u ---\n", total);
+}
+
 /*
  * Allocate a region of KVA of the specified size and alignment, within the
  * vstart and vend.
@@ -1663,10 +1694,13 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 		goto retry;
 	}
 
-	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
+	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
 		pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
 			size);
+		dump_modules_free_space(vstart, vend);
+	}
+
 	kmem_cache_free(vmap_area_cachep, va);
 	return ERR_PTR(-EBUSY);
 }
<snip>

It would be good to understand whether we have really run out of space.
Adding a print of lazy_max_pages() and vmap_lazy_nr would also be good,
e.g. something like the sketch below.

--
Uladzislau Rezki
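The sketch mentioned above, for illustration only (untested; both
lazy_max_pages() and vmap_lazy_nr are local to mm/vmalloc.c, so the print
has to live in that file, e.g. next to the existing failure warning in
alloc_vmap_area()):

<snip sketch>
	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
		pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
			size);
		/*
		 * How much lazily-freed space is still pending a drain,
		 * versus the threshold that normally triggers one.
		 */
		pr_warn("lazy_max_pages: %lu, vmap_lazy_nr: %ld\n",
			lazy_max_pages(), atomic_long_read(&vmap_lazy_nr));
	}
<snip>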