A trivial execve scalability test which tries to be very friendly
(statically linked binaries, all separate) is predominantly bottlenecked
by back-to-back per-cpu counter allocations which serialize on global
locks.

Ease the pain by allocating and freeing them in one go.

Bench can be found here:
http://apollo.backplane.com/DFlyMisc/doexec.c

$ cc -static -O2 -o static-doexec doexec.c
$ ./static-doexec $(nproc)

Even at a very modest scale of 26 cores (ops/s):
before:	133543.63
after:	186061.81 (+39%)

While with the patch these allocations remain a significant problem, the
primary bottleneck shifts to:

__pv_queued_spin_lock_slowpath+1
_raw_spin_lock_irqsave+57
folio_lruvec_lock_irqsave+91
release_pages+590
tlb_batch_pages_flush+61
tlb_finish_mmu+101
exit_mmap+327
__mmput+61
begin_new_exec+1245
load_elf_binary+712
bprm_execve+644
do_execveat_common.isra.0+429
__x64_sys_execve+50
do_syscall_64+46
entry_SYSCALL_64_after_hwframe+110

Signed-off-by: Mateusz Guzik <mjguzik@xxxxxxxxx>
---
 kernel/fork.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index d2e12b6d2b18..86ff78e001c1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -909,8 +909,6 @@ static void cleanup_lazy_tlbs(struct mm_struct *mm)
  */
 void __mmdrop(struct mm_struct *mm)
 {
-	int i;
-
 	BUG_ON(mm == &init_mm);
 	WARN_ON_ONCE(mm == current->mm);
 
@@ -925,9 +923,8 @@ void __mmdrop(struct mm_struct *mm)
 	put_user_ns(mm->user_ns);
 	mm_pasid_drop(mm);
 	mm_destroy_cid(mm);
+	percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
 
-	for (i = 0; i < NR_MM_COUNTERS; i++)
-		percpu_counter_destroy(&mm->rss_stat[i]);
 	free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -1252,7 +1249,6 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
 static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	struct user_namespace *user_ns)
 {
-	int i;
 
 	mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
 	mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
@@ -1301,17 +1297,14 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	if (mm_alloc_cid(mm))
 		goto fail_cid;
 
-	for (i = 0; i < NR_MM_COUNTERS; i++)
-		if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
-			goto fail_pcpu;
+	if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, NR_MM_COUNTERS))
+		goto fail_pcpu;
 
 	mm->user_ns = get_user_ns(user_ns);
 	lru_gen_init_mm(mm);
 	return mm;
 
 fail_pcpu:
-	while (i > 0)
-		percpu_counter_destroy(&mm->rss_stat[--i]);
 	mm_destroy_cid(mm);
 fail_cid:
 	destroy_context(mm);
-- 
2.39.2
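
For readers unfamiliar with the batch API used above:
percpu_counter_init_many() and percpu_counter_destroy_many() take the
per-cpu allocator's global locks once for an entire array of counters
instead of once per counter, and (as implemented in this series) the
batch init performs a single per-cpu allocation for all of them, so a
failure leaves nothing to unwind -- which is why the old fail_pcpu
unwind loop is gone. Below is a minimal usage sketch; demo_stat and
NR_DEMO_COUNTERS are illustrative stand-ins for mm->rss_stat and
NR_MM_COUNTERS, not code from the patch:

#include <linux/gfp.h>
#include <linux/percpu_counter.h>

#define NR_DEMO_COUNTERS 4	/* stand-in for NR_MM_COUNTERS */

/* Stand-in for the rss_stat array embedded in struct mm_struct. */
static struct percpu_counter demo_stat[NR_DEMO_COUNTERS];

static int demo_init(void)
{
	/*
	 * One trip through the per-cpu allocator covers all counters;
	 * on failure nothing was initialized, so no unwind is needed.
	 */
	if (percpu_counter_init_many(demo_stat, 0, GFP_KERNEL_ACCOUNT,
				     NR_DEMO_COUNTERS))
		return -ENOMEM;
	return 0;
}

static void demo_exit(void)
{
	/* Single teardown for the whole array, mirroring __mmdrop(). */
	percpu_counter_destroy_many(demo_stat, NR_DEMO_COUNTERS);
}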