hi, Yosry Ahmed,

On Mon, Oct 23, 2023 at 07:13:50PM -0700, Yosry Ahmed wrote:

...

> I still could not run the benchmark, but I used a version of
> fallocate1.c that does 1 million iterations. I ran 100 in parallel.
> This showed ~13% regression with the patch, so not the same as the
> will-it-scale version, but it could be an indicator.
>
> With that, I did not see any improvement with the fixlet above or
> ___cacheline_aligned_in_smp. So you can scratch that.
>
> I did, however, see some improvement with reducing the indirection
> layers by moving stats_updates directly into struct mem_cgroup. The
> regression in my manual testing went down to 9%. Still not great, but
> I am wondering how this reflects on the benchmark. If you're able to
> test it that would be great, the diff is below. Meanwhile I am still
> looking for other improvements that can be made.

We applied the previous patch set as below:

c5f50d8b23c79 (linux-review/Yosry-Ahmed/mm-memcg-change-flush_next_time-to-flush_last_time/20231010-112257) mm: memcg: restore subtree stats flushing
ac8a48ba9e1ca mm: workingset: move the stats flush into workingset_test_recent()
51d74c18a9c61 mm: memcg: make stats flushing threshold per-memcg
130617edc1cd1 mm: memcg: move vmstats structs definition above flushing code
26d0ee342efc6 mm: memcg: change flush_next_time to flush_last_time
25478183883e6 Merge branch 'mm-nonmm-unstable' into mm-everything   <---- the base our tool picked for the patch set

I tried to apply the patch below on top of either 51d74c18a9c61 or
c5f50d8b23c79, but it failed to apply. Could you guide me on how to apply
this patch?

Thanks

>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index f64ac140083e..b4dfcd8b9cc1 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -270,6 +270,9 @@ struct mem_cgroup {
>
>  	CACHELINE_PADDING(_pad1_);
>
> +	/* Stats updates since the last flush */
> +	atomic64_t		stats_updates;
> +
>  	/* memory.stat */
>  	struct memcg_vmstats	*vmstats;
>
> @@ -309,6 +312,7 @@ struct mem_cgroup {
>  	atomic_t		moving_account;
>  	struct task_struct	*move_lock_task;
>
> +	unsigned int __percpu	*stats_updates_percpu;
>  	struct memcg_vmstats_percpu __percpu *vmstats_percpu;
>
>  #ifdef CONFIG_CGROUP_WRITEBACK
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 7cbc7d94eb65..e5d2f3d4d874 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -627,9 +627,6 @@ struct memcg_vmstats_percpu {
>  	/* Cgroup1: threshold notifications & softlimit tree updates */
>  	unsigned long		nr_page_events;
>  	unsigned long		targets[MEM_CGROUP_NTARGETS];
> -
> -	/* Stats updates since the last flush */
> -	unsigned int		stats_updates;
>  };
>
>  struct memcg_vmstats {
> @@ -644,9 +641,6 @@ struct memcg_vmstats {
>  	/* Pending child counts during tree propagation */
>  	long			state_pending[MEMCG_NR_STAT];
>  	unsigned long		events_pending[NR_MEMCG_EVENTS];
> -
> -	/* Stats updates since the last flush */
> -	atomic64_t		stats_updates;
>  };
>
>  /*
> @@ -695,14 +689,14 @@ static void memcg_stats_unlock(void)
>
>  static bool memcg_should_flush_stats(struct mem_cgroup *memcg)
>  {
> -	return atomic64_read(&memcg->vmstats->stats_updates) >
> +	return atomic64_read(&memcg->stats_updates) >
>  		MEMCG_CHARGE_BATCH * num_online_cpus();
>  }
>
>  static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
>  {
>  	int cpu = smp_processor_id();
> -	unsigned int x;
> +	unsigned int *stats_updates_percpu;
>
>  	if (!val)
>  		return;
>
> @@ -710,10 +704,10 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
>  	cgroup_rstat_updated(memcg->css.cgroup, cpu);
>
>  	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
> -		x = __this_cpu_add_return(memcg->vmstats_percpu->stats_updates,
> -					  abs(val));
> +		stats_updates_percpu = this_cpu_ptr(memcg->stats_updates_percpu);
>
> -		if (x < MEMCG_CHARGE_BATCH)
> +		*stats_updates_percpu += abs(val);
> +		if (*stats_updates_percpu < MEMCG_CHARGE_BATCH)
>  			continue;
>
>  		/*
> @@ -721,8 +715,8 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
>  		 * redundant. Avoid the overhead of the atomic update.
>  		 */
>  		if (!memcg_should_flush_stats(memcg))
> -			atomic64_add(x, &memcg->vmstats->stats_updates);
> -		__this_cpu_write(memcg->vmstats_percpu->stats_updates, 0);
> +			atomic64_add(*stats_updates_percpu, &memcg->stats_updates);
> +		*stats_updates_percpu = 0;
>  	}
>  }
>
> @@ -5467,6 +5461,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
>  		free_mem_cgroup_per_node_info(memcg, node);
>  	kfree(memcg->vmstats);
>  	free_percpu(memcg->vmstats_percpu);
> +	free_percpu(memcg->stats_updates_percpu);
>  	kfree(memcg);
>  }
>
> @@ -5504,6 +5499,11 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
>  	if (!memcg->vmstats_percpu)
>  		goto fail;
>
> +	memcg->stats_updates_percpu = alloc_percpu_gfp(unsigned int,
> +						       GFP_KERNEL_ACCOUNT);
> +	if (!memcg->stats_updates_percpu)
> +		goto fail;
> +
>  	for_each_node(node)
>  		if (alloc_mem_cgroup_per_node_info(memcg, node))
>  			goto fail;
> @@ -5735,10 +5735,12 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
>  	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
>  	struct memcg_vmstats_percpu *statc;
> +	int *stats_updates_percpu;
>  	long delta, delta_cpu, v;
>  	int i, nid;
>
>  	statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
> +	stats_updates_percpu = per_cpu_ptr(memcg->stats_updates_percpu, cpu);
>
>  	for (i = 0; i < MEMCG_NR_STAT; i++) {
>  		/*
> @@ -5826,10 +5828,10 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>  			}
>  		}
>  	}
> -	statc->stats_updates = 0;
> +	*stats_updates_percpu = 0;
>  	/* We are in a per-cpu loop here, only do the atomic write once */
> -	if (atomic64_read(&memcg->vmstats->stats_updates))
> -		atomic64_set(&memcg->vmstats->stats_updates, 0);
> +	if (atomic64_read(&memcg->stats_updates))
> +		atomic64_set(&memcg->stats_updates, 0);
>  }
>
>  #ifdef CONFIG_MMU
>
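btw, for the "version of fallocate1.c that does 1 million iterations, 100 in
parallel" test described above, below is a minimal standalone sketch of how we
read that description (an approximation only, not the actual will-it-scale
fallocate1.c source; the tmpfile path, page-sized allocation, iteration count
and process count are assumptions taken from the description):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

#define NPROC 100		/* "I ran 100 in parallel" */
#define NITER 1000000UL		/* "does 1 million iterations" */

static void stress(void)
{
	char tmpl[] = "/tmp/fallocate-stress.XXXXXX";
	int fd = mkstemp(tmpl);

	if (fd < 0)
		exit(1);
	unlink(tmpl);

	for (unsigned long i = 0; i < NITER; i++) {
		/* allocate one page of file space, then release it again */
		if (fallocate(fd, 0, 0, getpagesize()) || ftruncate(fd, 0))
			exit(1);
	}
	close(fd);
	exit(0);
}

int main(void)
{
	for (int i = 0; i < NPROC; i++)
		if (fork() == 0)
			stress();

	/* reap all workers */
	while (wait(NULL) > 0)
		;
	return 0;
}

This is not the will-it-scale harness, but each worker repeatedly allocates
and truncates file space on its own file, which is presumably the allocate/
free churn that makes the memcg stats update path hot in this workload.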