On Wed, Mar 2, 2022 at 2:50 AM 'Shakeel Butt' via kernel-team+notifications
<kernel-team@xxxxxxxxxxxxxx> wrote:
>
> On Tue, Mar 01, 2022 at 04:48:00PM -0800, Ivan Babrou wrote:
> > [...]
> > > Looks like you were right that targeted flush is not going to be as good.
> >
> Thanks a lot. Can you please try the following patch (independent of other
> patches) as well?

Hi Shakeel,

Maybe it's just me, but the code change below confuses me. I'll comment inline.

>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index d9b8df5ef212..499f75e066f3 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -75,29 +75,8 @@ enum mem_cgroup_events_target {
>  	MEM_CGROUP_NTARGETS,
>  };
>
> -struct memcg_vmstats_percpu {
> -	/* Local (CPU and cgroup) page state & events */
> -	long state[MEMCG_NR_STAT];
> -	unsigned long events[NR_VM_EVENT_ITEMS];
> -
> -	/* Delta calculation for lockless upward propagation */
> -	long state_prev[MEMCG_NR_STAT];
> -	unsigned long events_prev[NR_VM_EVENT_ITEMS];
> -
> -	/* Cgroup1: threshold notifications & softlimit tree updates */
> -	unsigned long nr_page_events;
> -	unsigned long targets[MEM_CGROUP_NTARGETS];
> -};
> -
> -struct memcg_vmstats {
> -	/* Aggregated (CPU and subtree) page state & events */
> -	long state[MEMCG_NR_STAT];
> -	unsigned long events[NR_VM_EVENT_ITEMS];
> -
> -	/* Pending child counts during tree propagation */
> -	long state_pending[MEMCG_NR_STAT];
> -	unsigned long events_pending[NR_VM_EVENT_ITEMS];
> -};
> +struct memcg_vmstats_percpu;
> +struct memcg_vmstats;
>
>  struct mem_cgroup_reclaim_iter {
>  	struct mem_cgroup *position;
> @@ -304,7 +283,7 @@ struct mem_cgroup {
>  	MEMCG_PADDING(_pad1_);
>
>  	/* memory.stat */
> -	struct memcg_vmstats vmstats;
> +	struct memcg_vmstats *vmstats;
>
>  	/* memory.events */
>  	atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
> @@ -964,11 +943,7 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
>  	local_irq_restore(flags);
>  }
>
> -static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
> -{
> -	return READ_ONCE(memcg->vmstats.state[idx]);
> -}
> -
> +unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx);
>  static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
>  					      enum node_stat_item idx)
>  {
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 32ba963ebf2e..c65f155c2048 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -688,6 +688,71 @@ static void flush_memcg_stats_dwork(struct work_struct *w)
>  	queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
>  }
>
> +static const unsigned int memcg_vm_events[] = {
> +	PGPGIN,
> +	PGPGOUT,
> +	PGFAULT,
> +	PGMAJFAULT,
> +	PGREFILL,
> +	PGSCAN_KSWAPD,
> +	PGSCAN_DIRECT,
> +	PGSTEAL_KSWAPD,
> +	PGSTEAL_DIRECT,
> +	PGACTIVATE,
> +	PGDEACTIVATE,
> +	PGLAZYFREE,
> +	PGLAZYFREED,
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +	THP_FAULT_ALLOC,
> +	THP_COLLAPSE_ALLOC,
> +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
> +};
> +#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_events)
> +
> +static int memcg_events_index[NR_VM_EVENT_ITEMS] __read_mostly;
> +
> +static void init_memcg_events(void)
> +{
> +	int i;
> +
> +	for (i = 0; i < NR_MEMCG_EVENTS; ++i)
> +		memcg_events_index[memcg_vm_events[i]] = i + 1;
> +}
> +
> +static int get_memcg_events_index(enum vm_event_item idx)
> +{
> +	return memcg_events_index[idx] - 1;
> +}

It's this part. The table memcg_vm_events[] is basically a "truth set": if
an event is in the set, memcg_events_index[] gets an entry for it. The code
reads as if one were attempting to re-base C arrays to start at one, though,
and then the accessor function translates the "unsigned zero" back into a
"signed -1" ... all for what amounts to a truth test.
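To spell out how I read that encoding, here is a minimal standalone sketch;
the EV_* names and the table contents are made up for illustration, they are
not from the patch:

#include <stdio.h>

enum { EV_A, EV_B, EV_C, EV_D, NR_EVENTS };	/* stand-in for vm_event_item */

static const unsigned int tracked[] = { EV_B, EV_D };	/* the "truth set" */
static int index_of[NR_EVENTS];	/* 0 means "not tracked", i + 1 otherwise */

int main(void)
{
	unsigned int i;

	/* init: re-base the dense index to start at one */
	for (i = 0; i < sizeof(tracked) / sizeof(tracked[0]); ++i)
		index_of[tracked[i]] = i + 1;

	/* accessor: translate the "unsigned zero" back to "signed -1" */
	for (i = 0; i < NR_EVENTS; ++i)
		printf("event %u -> index %d\n", i, index_of[i] - 1);

	/* prints: 0 -> -1, 1 -> 0, 2 -> -1, 3 -> 1 */
	return 0;
}

The value 0 in memcg_events_index[] is overloaded to mean "event not in the
set", which is why init_memcg_events() stores i + 1 and
get_memcg_events_index() has to subtract it again.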
> +
> +struct memcg_vmstats_percpu {
> +	/* Local (CPU and cgroup) page state & events */
> +	long state[MEMCG_NR_STAT];
> +	unsigned long events[NR_MEMCG_EVENTS];
> +
> +	/* Delta calculation for lockless upward propagation */
> +	long state_prev[MEMCG_NR_STAT];
> +	unsigned long events_prev[NR_MEMCG_EVENTS];
> +
> +	/* Cgroup1: threshold notifications & softlimit tree updates */
> +	unsigned long nr_page_events;
> +	unsigned long targets[MEM_CGROUP_NTARGETS];
> +};
> +
> +struct memcg_vmstats {
> +	/* Aggregated (CPU and subtree) page state & events */
> +	long state[MEMCG_NR_STAT];
> +	unsigned long events[NR_MEMCG_EVENTS];
> +
> +	/* Pending child counts during tree propagation */
> +	long state_pending[MEMCG_NR_STAT];
> +	unsigned long events_pending[NR_MEMCG_EVENTS];
> +};
> +
> +unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
> +{
> +	return READ_ONCE(memcg->vmstats->state[idx]);
> +}
> +
>  /**
>   * __mod_memcg_state - update cgroup memory statistics
>   * @memcg: the memory cgroup
> @@ -831,25 +896,33 @@ static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
>  void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
>  			  unsigned long count)
>  {
> -	if (mem_cgroup_disabled())
> +	int index = get_memcg_events_index(idx);
> +
> +	if (mem_cgroup_disabled() || index < 0)

And this is where it gets used; basically, it says "if the event is not in
the monitored set, return". Likewise in all the hunks below. Why not make
that explicit? Something like:

static int memcg_event_exists_percg(int index)
{
	switch (index) {
	case PGPGIN:
	case PGPGOUT:
	case PGFAULT:
	case PGMAJFAULT:
	case PGREFILL:
	case PGSCAN_KSWAPD:
	case PGSCAN_DIRECT:
	case PGSTEAL_KSWAPD:
	case PGSTEAL_DIRECT:
	case PGACTIVATE:
	case PGDEACTIVATE:
	case PGLAZYFREE:
	case PGLAZYFREED:
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case THP_FAULT_ALLOC:
	case THP_COLLAPSE_ALLOC:
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
		return 1;
	default:
		return 0;
	}
}

and let the compiler choose how to lay it out at each use site?

>  		return;
>
> -	__this_cpu_add(memcg->vmstats_percpu->events[idx], count);
> +	__this_cpu_add(memcg->vmstats_percpu->events[index], count);

[ ... ]

I understand there is a memory saving from the different table types/sizes,
since the vm events tracked per memcg are a (small) subset of all vm events.
How relevant is that saving to this change, though?

Best regards,
Frank Hofmann