On Wed, Aug 23, 2023 at 07:06:09AM +0200, Mateusz Guzik wrote:
> A trivial execve scalability test which tries to be very friendly
> (statically linked binaries, all separate) is predominantly bottlenecked
> by back-to-back per-cpu counter allocations which serialize on global
> locks.
>
> Ease the pain by allocating and freeing them in one go.
>
> Bench can be found here:
> http://apollo.backplane.com/DFlyMisc/doexec.c
>
> $ cc -static -O2 -o static-doexec doexec.c
> $ ./static-doexec $(nproc)
>
> Even at a very modest scale of 26 cores (ops/s):
> before: 133543.63
> after:  186061.81 (+39%)
>
> While with the patch these allocations remain a significant problem,
> the primary bottleneck shifts to page release handling.
>
> Signed-off-by: Mateusz Guzik <mjguzik@xxxxxxxxx>

Same message as for 1/2. I'm happy with this, just a minor reflow. I'll
take this for-6.6 unless there are other comments / objections to that.
I'll run a few tests myself too tomorrow just for validation.

Reviewed-by: Dennis Zhou <dennis@xxxxxxxxxx>

Thanks,
Dennis

> ---
>  kernel/fork.c | 14 +++-----------
>  1 file changed, 3 insertions(+), 11 deletions(-)
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index d2e12b6d2b18..4f0ada33457e 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -909,8 +909,6 @@ static void cleanup_lazy_tlbs(struct mm_struct *mm)
>   */
>  void __mmdrop(struct mm_struct *mm)
>  {
> -	int i;
> -
>  	BUG_ON(mm == &init_mm);
>  	WARN_ON_ONCE(mm == current->mm);
>
> @@ -925,9 +923,8 @@ void __mmdrop(struct mm_struct *mm)
>  	put_user_ns(mm->user_ns);
>  	mm_pasid_drop(mm);
>  	mm_destroy_cid(mm);
> +	percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
>
> -	for (i = 0; i < NR_MM_COUNTERS; i++)
> -		percpu_counter_destroy(&mm->rss_stat[i]);
>  	free_mm(mm);
>  }
>  EXPORT_SYMBOL_GPL(__mmdrop);
> @@ -1252,8 +1249,6 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
>  static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
>  	struct user_namespace *user_ns)
>  {
> -	int i;
> -
>  	mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
>  	mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
>  	atomic_set(&mm->mm_users, 1);
> @@ -1301,17 +1296,14 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
>  	if (mm_alloc_cid(mm))
>  		goto fail_cid;
>
> -	for (i = 0; i < NR_MM_COUNTERS; i++)
> -		if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
> -			goto fail_pcpu;
> +	if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, NR_MM_COUNTERS))
> +		goto fail_pcpu;
>
>  	mm->user_ns = get_user_ns(user_ns);
>  	lru_gen_init_mm(mm);
>  	return mm;
>
>  fail_pcpu:
> -	while (i > 0)
> -		percpu_counter_destroy(&mm->rss_stat[--i]);
>  	mm_destroy_cid(mm);
>  fail_cid:
>  	destroy_context(mm);
> --
> 2.41.0
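
For reference, here is a minimal sketch of the grouped init/destroy
pattern the patch adopts, built on the percpu_counter_init_many() /
percpu_counter_destroy_many() helpers introduced in patch 1/2 of this
series. The example_stats structure, its functions, and
NR_EXAMPLE_COUNTERS are illustrative stand-ins (for mm_struct's
rss_stat array and NR_MM_COUNTERS), not actual kernel code:

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/percpu_counter.h>

/* Illustrative stand-in for NR_MM_COUNTERS. */
#define NR_EXAMPLE_COUNTERS 4

struct example_stats {
	struct percpu_counter ctr[NR_EXAMPLE_COUNTERS];
};

static int example_stats_init(struct example_stats *st)
{
	/*
	 * One call sets up the whole array: the per-cpu storage for all
	 * counters is allocated at once, so the global percpu allocator
	 * locks are taken once instead of once per counter.
	 */
	return percpu_counter_init_many(st->ctr, 0, GFP_KERNEL_ACCOUNT,
					NR_EXAMPLE_COUNTERS);
}

static void example_stats_destroy(struct example_stats *st)
{
	/* Tear down and free all counters in one go. */
	percpu_counter_destroy_many(st->ctr, NR_EXAMPLE_COUNTERS);
}

Note how the caller needs no partial-unwind path: the removal of the
while (i > 0) rollback loop under fail_pcpu above suggests the batch
init either initializes every counter or leaves nothing allocated on
failure.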