On Fri, Apr 26, 2024 at 05:37:28PM -0700, Shakeel Butt wrote: > To decouple the dependency of lruvec_stats on NR_VM_NODE_STAT_ITEMS, we > need to dynamically allocate lruvec_stats in the mem_cgroup_per_node > structure. Also move the definition of lruvec_stats_percpu and > lruvec_stats and related functions to the memcontrol.c to facilitate > later patches. No functional changes in the patch. > > Signed-off-by: Shakeel Butt <shakeel.butt@xxxxxxxxx> > --- > include/linux/memcontrol.h | 62 +++------------------------ > mm/memcontrol.c | 87 ++++++++++++++++++++++++++++++++------ > 2 files changed, 81 insertions(+), 68 deletions(-) > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > index 9aba0d0462ca..ab8a6e884375 100644 > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -83,6 +83,8 @@ enum mem_cgroup_events_target { > > struct memcg_vmstats_percpu; > struct memcg_vmstats; > +struct lruvec_stats_percpu; > +struct lruvec_stats; > > struct mem_cgroup_reclaim_iter { > struct mem_cgroup *position; > @@ -90,25 +92,6 @@ struct mem_cgroup_reclaim_iter { > unsigned int generation; > }; > > -struct lruvec_stats_percpu { > - /* Local (CPU and cgroup) state */ > - long state[NR_VM_NODE_STAT_ITEMS]; > - > - /* Delta calculation for lockless upward propagation */ > - long state_prev[NR_VM_NODE_STAT_ITEMS]; > -}; > - > -struct lruvec_stats { > - /* Aggregated (CPU and subtree) state */ > - long state[NR_VM_NODE_STAT_ITEMS]; > - > - /* Non-hierarchical (CPU aggregated) state */ > - long state_local[NR_VM_NODE_STAT_ITEMS]; > - > - /* Pending child counts during tree propagation */ > - long state_pending[NR_VM_NODE_STAT_ITEMS]; > -}; > - > /* > * per-node information in memory controller. > */ > @@ -116,7 +99,7 @@ struct mem_cgroup_per_node { > struct lruvec lruvec; > > struct lruvec_stats_percpu __percpu *lruvec_stats_percpu; > - struct lruvec_stats lruvec_stats; > + struct lruvec_stats *lruvec_stats; > > unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; > > @@ -1037,42 +1020,9 @@ static inline void mod_memcg_page_state(struct page *page, > } > > unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx); > - > -static inline unsigned long lruvec_page_state(struct lruvec *lruvec, > - enum node_stat_item idx) > -{ > - struct mem_cgroup_per_node *pn; > - long x; > - > - if (mem_cgroup_disabled()) > - return node_page_state(lruvec_pgdat(lruvec), idx); > - > - pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); > - x = READ_ONCE(pn->lruvec_stats.state[idx]); > -#ifdef CONFIG_SMP > - if (x < 0) > - x = 0; > -#endif > - return x; > -} > - > -static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, > - enum node_stat_item idx) > -{ > - struct mem_cgroup_per_node *pn; > - long x = 0; > - > - if (mem_cgroup_disabled()) > - return node_page_state(lruvec_pgdat(lruvec), idx); > - > - pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); > - x = READ_ONCE(pn->lruvec_stats.state_local[idx]); > -#ifdef CONFIG_SMP > - if (x < 0) > - x = 0; > -#endif > - return x; > -} > +unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx); > +unsigned long lruvec_page_state_local(struct lruvec *lruvec, > + enum node_stat_item idx); > > void mem_cgroup_flush_stats(struct mem_cgroup *memcg); > void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg); > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 53769d06053f..5e337ed6c6bf 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -576,6 +576,60 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) > return mz; > } > > +struct lruvec_stats_percpu { > + /* Local (CPU and cgroup) state */ > + long state[NR_VM_NODE_STAT_ITEMS]; > + > + /* Delta calculation for lockless upward propagation */ > + long state_prev[NR_VM_NODE_STAT_ITEMS]; > +}; > + > +struct lruvec_stats { > + /* Aggregated (CPU and subtree) state */ > + long state[NR_VM_NODE_STAT_ITEMS]; > + > + /* Non-hierarchical (CPU aggregated) state */ > + long state_local[NR_VM_NODE_STAT_ITEMS]; > + > + /* Pending child counts during tree propagation */ > + long state_pending[NR_VM_NODE_STAT_ITEMS]; > +}; > + > +unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) > +{ > + struct mem_cgroup_per_node *pn; > + long x; > + > + if (mem_cgroup_disabled()) > + return node_page_state(lruvec_pgdat(lruvec), idx); > + > + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); > + x = READ_ONCE(pn->lruvec_stats->state[idx]); > +#ifdef CONFIG_SMP > + if (x < 0) > + x = 0; > +#endif > + return x; > +} > + > +unsigned long lruvec_page_state_local(struct lruvec *lruvec, > + enum node_stat_item idx) > +{ > + struct mem_cgroup_per_node *pn; > + long x = 0; > + > + if (mem_cgroup_disabled()) > + return node_page_state(lruvec_pgdat(lruvec), idx); > + > + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); > + x = READ_ONCE(pn->lruvec_stats->state_local[idx]); > +#ifdef CONFIG_SMP > + if (x < 0) > + x = 0; > +#endif Not directly related to your change, but do we still need it? And if yes, do we really care about !CONFIG_SMP case enough to justify these #ifdefs? > + return x; > +} > + > /* Subset of vm_event_item to report for memcg event stats */ > static const unsigned int memcg_vm_event_stat[] = { > PGPGIN, > @@ -5492,18 +5546,25 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) > if (!pn) > return 1; > > + pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats), GFP_KERNEL, > + node); Why not GFP_KERNEL_ACCOUNT? > + if (!pn->lruvec_stats) > + goto fail; > + > pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu, > GFP_KERNEL_ACCOUNT); > - if (!pn->lruvec_stats_percpu) { > - kfree(pn); > - return 1; > - } > + if (!pn->lruvec_stats_percpu) > + goto fail; > > lruvec_init(&pn->lruvec); > pn->memcg = memcg; > > memcg->nodeinfo[node] = pn; > return 0; > +fail: > + kfree(pn->lruvec_stats); > + kfree(pn); > + return 1; > } > > static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) > @@ -5514,6 +5575,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) > return; > > free_percpu(pn->lruvec_stats_percpu); > + kfree(pn->lruvec_stats); > kfree(pn); > } > > @@ -5866,18 +5928,19 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) > > for_each_node_state(nid, N_MEMORY) { > struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; > - struct mem_cgroup_per_node *ppn = NULL; > + struct lruvec_stats *lstats = pn->lruvec_stats; > + struct lruvec_stats *plstats = NULL; > struct lruvec_stats_percpu *lstatc; > > if (parent) > - ppn = parent->nodeinfo[nid]; > + plstats = parent->nodeinfo[nid]->lruvec_stats; > > lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu); > > for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { > - delta = pn->lruvec_stats.state_pending[i]; > + delta = lstats->state_pending[i]; > if (delta) > - pn->lruvec_stats.state_pending[i] = 0; > + lstats->state_pending[i] = 0; > > delta_cpu = 0; > v = READ_ONCE(lstatc->state[i]); > @@ -5888,12 +5951,12 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) > } > > if (delta_cpu) > - pn->lruvec_stats.state_local[i] += delta_cpu; > + lstats->state_local[i] += delta_cpu; > > if (delta) { > - pn->lruvec_stats.state[i] += delta; > - if (ppn) > - ppn->lruvec_stats.state_pending[i] += delta; > + lstats->state[i] += delta; > + if (plstats) > + plstats->state_pending[i] += delta; > } > } > } > -- > 2.43.0 >