On Mon, Apr 29, 2024 at 11:06 PM Shakeel Butt <shakeel.butt@xxxxxxxxx> wrote:
>
> At the moment, the amount of memory allocated for stats related structs
> in the mem_cgroup corresponds to the size of enum node_stat_item.
> However not all fields in enum node_stat_item has corresponding memcg

typo: "have corresponding"

> stats. So, let's use indirection mechanism similar to the one used for
> memcg vmstats management.
>
> For a given x86_64 config, the size of stats with and without patch is:
>
> structs size in bytes          w/o     with
>
> struct lruvec_stats           1128      648
> struct lruvec_stats_percpu     752      432
> struct memcg_vmstats          1832     1352
> struct memcg_vmstats_percpu   1280      960
>
> The memory savings is further compounded by the fact that these structs
> are allocated for each cpu and for each node. To be precise, for each
> memcg the memory saved would be:
>
> Memory saved = ((21 * 3 * NR_NODES) + (21 * 2 * NR_NODS * NR_CPUS) +

typo: "NR_NODES"

> (21 * 3) + (21 * 2 * NR_CPUS)) * sizeof(long)
>
> Where 21 is the number of fields eliminated.
>
> Signed-off-by: Shakeel Butt <shakeel.butt@xxxxxxxxx>
> ---
>
> Changes since v2:
> - N/A
>
>  mm/memcontrol.c | 138 ++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 115 insertions(+), 23 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 434cff91b65e..f424c5b2ba9b 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -576,35 +576,105 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
>          return mz;
>  }
>
> +/* Subset of node_stat_item for memcg stats */
> +static const unsigned int memcg_node_stat_items[] = {
> +        NR_INACTIVE_ANON,
> +        NR_ACTIVE_ANON,
> +        NR_INACTIVE_FILE,
> +        NR_ACTIVE_FILE,
> +        NR_UNEVICTABLE,
> +        NR_SLAB_RECLAIMABLE_B,
> +        NR_SLAB_UNRECLAIMABLE_B,
> +        WORKINGSET_REFAULT_ANON,
> +        WORKINGSET_REFAULT_FILE,
> +        WORKINGSET_ACTIVATE_ANON,
> +        WORKINGSET_ACTIVATE_FILE,
> +        WORKINGSET_RESTORE_ANON,
> +        WORKINGSET_RESTORE_FILE,
> +        WORKINGSET_NODERECLAIM,
> +        NR_ANON_MAPPED,
> +        NR_FILE_MAPPED,
> +        NR_FILE_PAGES,
> +        NR_FILE_DIRTY,
> +        NR_WRITEBACK,
> +        NR_SHMEM,
> +        NR_SHMEM_THPS,
> +        NR_FILE_THPS,
> +        NR_ANON_THPS,
> +        NR_KERNEL_STACK_KB,
> +        NR_PAGETABLE,
> +        NR_SECONDARY_PAGETABLE,
> +#ifdef CONFIG_SWAP
> +        NR_SWAPCACHE,
> +#endif
> +};
> +
> +static const unsigned int memcg_stat_items[] = {
> +        MEMCG_SWAP,
> +        MEMCG_SOCK,
> +        MEMCG_PERCPU_B,
> +        MEMCG_VMALLOC,
> +        MEMCG_KMEM,
> +        MEMCG_ZSWAP_B,
> +        MEMCG_ZSWAPPED,
> +};
> +
> +#define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items)
> +#define NR_MEMCG_STATS (NR_MEMCG_NODE_STAT_ITEMS + ARRAY_SIZE(memcg_stat_items))
> +static int8_t mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly;
> +
> +static void init_memcg_stats(void)
> +{
> +        int8_t i, j = 0;
> +
> +        /* Switch to short once this failure occurs. */
> +        BUILD_BUG_ON(NR_MEMCG_STATS >= 127 /* INT8_MAX */);
> +
> +        for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i)
> +                mem_cgroup_stats_index[memcg_node_stat_items[i]] = ++j;
> +
> +        for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i)
> +                mem_cgroup_stats_index[memcg_stat_items[i]] = ++j;
> +}
> +
> +static inline int memcg_stats_index(int idx)
> +{
> +        return mem_cgroup_stats_index[idx] - 1;

Could this just be:

        return mem_cgroup_stats_index[idx];

with a postfix increment of j in init_memcg_stats instead of prefix
increment?
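
Something like the below (untested sketch) is what I have in mind. The one
change beyond what the patch does is that the table is pre-filled with -1,
so unmapped items still come back as an invalid index:

static void init_memcg_stats(void)
{
        int8_t i, j = 0;

        /* Switch to short once this failure occurs. */
        BUILD_BUG_ON(NR_MEMCG_STATS >= 127 /* INT8_MAX */);

        /* Anything not explicitly mapped below stays invalid (-1). */
        memset(mem_cgroup_stats_index, -1, sizeof(mem_cgroup_stats_index));

        for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i)
                mem_cgroup_stats_index[memcg_node_stat_items[i]] = j++;

        for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i)
                mem_cgroup_stats_index[memcg_stat_items[i]] = j++;
}

static inline int memcg_stats_index(int idx)
{
        /* Still negative for anything that is not a memcg stat. */
        return mem_cgroup_stats_index[idx];
}

The cost is the explicit memset; with your version the implicit zero
initialization of the array already encodes "not mapped", so feel free to
keep the current form if you prefer that trade-off.
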
> +}
>
>  struct lruvec_stats_percpu {
>          /* Local (CPU and cgroup) state */
> -        long state[NR_VM_NODE_STAT_ITEMS];
> +        long state[NR_MEMCG_NODE_STAT_ITEMS];
>
>          /* Delta calculation for lockless upward propagation */
> -        long state_prev[NR_VM_NODE_STAT_ITEMS];
> +        long state_prev[NR_MEMCG_NODE_STAT_ITEMS];
>  };
>
>  struct lruvec_stats {
>          /* Aggregated (CPU and subtree) state */
> -        long state[NR_VM_NODE_STAT_ITEMS];
> +        long state[NR_MEMCG_NODE_STAT_ITEMS];
>
>          /* Non-hierarchical (CPU aggregated) state */
> -        long state_local[NR_VM_NODE_STAT_ITEMS];
> +        long state_local[NR_MEMCG_NODE_STAT_ITEMS];
>
>          /* Pending child counts during tree propagation */
> -        long state_pending[NR_VM_NODE_STAT_ITEMS];
> +        long state_pending[NR_MEMCG_NODE_STAT_ITEMS];
>  };
>
>  unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
>  {
>          struct mem_cgroup_per_node *pn;
> -        long x;
> +        long x = 0;
> +        int i;
>
>          if (mem_cgroup_disabled())
>                  return node_page_state(lruvec_pgdat(lruvec), idx);
>
> -        pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> -        x = READ_ONCE(pn->lruvec_stats->state[idx]);
> +        i = memcg_stats_index(idx);
> +        if (i >= 0) {
> +                pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> +                x = READ_ONCE(pn->lruvec_stats->state[i]);
> +        }
>  #ifdef CONFIG_SMP
>          if (x < 0)
>                  x = 0;
> @@ -617,12 +687,16 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>  {
>          struct mem_cgroup_per_node *pn;
>          long x = 0;
> +        int i;
>
>          if (mem_cgroup_disabled())
>                  return node_page_state(lruvec_pgdat(lruvec), idx);
>
> -        pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> -        x = READ_ONCE(pn->lruvec_stats->state_local[idx]);
> +        i = memcg_stats_index(idx);
> +        if (i >= 0) {
> +                pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> +                x = READ_ONCE(pn->lruvec_stats->state_local[i]);
> +        }
>  #ifdef CONFIG_SMP
>          if (x < 0)
>                  x = 0;
> @@ -689,11 +763,11 @@ struct memcg_vmstats_percpu {
>          /* The above should fit a single cacheline for memcg_rstat_updated() */
>
>          /* Local (CPU and cgroup) page state & events */
> -        long state[MEMCG_NR_STAT];
> +        long state[NR_MEMCG_STATS];
>          unsigned long events[NR_MEMCG_EVENTS];
>
>          /* Delta calculation for lockless upward propagation */
> -        long state_prev[MEMCG_NR_STAT];
> +        long state_prev[NR_MEMCG_STATS];
>          unsigned long events_prev[NR_MEMCG_EVENTS];
>
>          /* Cgroup1: threshold notifications & softlimit tree updates */
> @@ -703,15 +777,15 @@ struct memcg_vmstats_percpu {
>
>  struct memcg_vmstats {
>          /* Aggregated (CPU and subtree) page state & events */
> -        long state[MEMCG_NR_STAT];
> +        long state[NR_MEMCG_STATS];
>          unsigned long events[NR_MEMCG_EVENTS];
>
>          /* Non-hierarchical (CPU aggregated) page state & events */
> -        long state_local[MEMCG_NR_STAT];
> +        long state_local[NR_MEMCG_STATS];
>          unsigned long events_local[NR_MEMCG_EVENTS];
>
>          /* Pending child counts during tree propagation */
> -        long state_pending[MEMCG_NR_STAT];
> +        long state_pending[NR_MEMCG_STATS];
>          unsigned long events_pending[NR_MEMCG_EVENTS];
>
>          /* Stats updates since the last flush */
> @@ -844,7 +918,13 @@ static void flush_memcg_stats_dwork(struct work_struct *w)
>
>  unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
>  {
> -        long x = READ_ONCE(memcg->vmstats->state[idx]);
> +        long x;
> +        int i = memcg_stats_index(idx);
> +
> +        if (i < 0)
> +                return 0;
> +
> +        x = READ_ONCE(memcg->vmstats->state[i]);
>  #ifdef CONFIG_SMP
>          if (x < 0)
>                  x = 0;
> @@ -876,18 +956,25 @@ static int memcg_state_val_in_pages(int idx, int val)
>   */
>  void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
>  {
> -        if (mem_cgroup_disabled())
> +        int i = memcg_stats_index(idx);
> +
> +        if (mem_cgroup_disabled() || i < 0)
>                  return;
>
> -        __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
> +        __this_cpu_add(memcg->vmstats_percpu->state[i], val);
>          memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
>  }
>
>  /* idx can be of type enum memcg_stat_item or node_stat_item. */
>  static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
>  {
> -        long x = READ_ONCE(memcg->vmstats->state_local[idx]);
> +        long x;
> +        int i = memcg_stats_index(idx);
> +
> +        if (i < 0)
> +                return 0;
>
> +        x = READ_ONCE(memcg->vmstats->state_local[i]);
>  #ifdef CONFIG_SMP
>          if (x < 0)
>                  x = 0;
> @@ -901,6 +988,10 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
>  {
>          struct mem_cgroup_per_node *pn;
>          struct mem_cgroup *memcg;
> +        int i = memcg_stats_index(idx);
> +
> +        if (i < 0)
> +                return;
>
>          pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
>          memcg = pn->memcg;
> @@ -930,10 +1021,10 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
>          }
>
>          /* Update memcg */
> -        __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
> +        __this_cpu_add(memcg->vmstats_percpu->state[i], val);
>
>          /* Update lruvec */
> -        __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
> +        __this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
>
>          memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
>          memcg_stats_unlock();
> @@ -5702,6 +5793,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
>                  page_counter_init(&memcg->kmem, &parent->kmem);
>                  page_counter_init(&memcg->tcpmem, &parent->tcpmem);
>          } else {
> +                init_memcg_stats();
>                  init_memcg_events();
>                  page_counter_init(&memcg->memory, NULL);
>                  page_counter_init(&memcg->swap, NULL);
> @@ -5873,7 +5965,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>
>          statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
>
> -        for (i = 0; i < MEMCG_NR_STAT; i++) {
> +        for (i = 0; i < NR_MEMCG_STATS; i++) {
>                  /*
>                   * Collect the aggregated propagation counts of groups
>                   * below us. We're in a per-cpu loop here and this is
> @@ -5937,7 +6029,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>
>          lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
>
> -        for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
> +        for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; i++) {
>                  delta = lstats->state_pending[i];
>                  if (delta)
>                          lstats->state_pending[i] = 0;
> --
> 2.43.0
>
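
One more thought on the commit message: it might help to spell out the
savings for a concrete config. If I read the structs right, the "3" is the
state/state_local/state_pending arrays and the "2" is the state/state_prev
arrays of the percpu variants. Plugging the formula into a hypothetical
2-node, 64-CPU machine (sizeof(long) == 8):

  ((21 * 3 * 2) + (21 * 2 * 2 * 64) + (21 * 3) + (21 * 2 * 64)) * 8
    = (126 + 5376 + 63 + 2688) * 8
    = 66024 bytes, i.e. roughly 64 KiB saved per memcg.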