On Tue, Apr 20, 2021 at 03:29:05PM -0400, Waiman Long wrote:
> Before the new slab memory controller with per object byte charging,
> charging and vmstat data update happen only when new slab pages are
> allocated or freed. Now they are done with every kmem_cache_alloc()
> and kmem_cache_free(). This causes additional overhead for workloads
> that generate a lot of alloc and free calls.
>
> The memcg_stock_pcp is used to cache byte charge for a specific
> obj_cgroup to reduce that overhead. To further reduce it, this patch
> caches the vmstat data in the memcg_stock_pcp structure as well, until
> it accumulates a page size worth of updates or until other cached data
> change. Caching the vmstat data in the per-cpu stock replaces two
> writes to non-hot cachelines (for the memcg-specific and the
> memcg-lruvec-specific vmstat data) with a single write to a hot local
> stock cacheline.
>
> On a 2-socket Cascade Lake server with instrumentation enabled and this
> patch applied, about 20% (634400 out of 3243830) of the calls to
> mod_objcg_state() led to an actual call to __mod_objcg_state() after
> initial boot. During a parallel kernel build, the figure was about 17%
> (24329265 out of 142512465). So caching the vmstat data reduces the
> number of calls to __mod_objcg_state() by more than 80%.
>
> Signed-off-by: Waiman Long <longman@xxxxxxxxxx>
> Reviewed-by: Shakeel Butt <shakeelb@xxxxxxxxxx>
> ---
>  mm/memcontrol.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 83 insertions(+), 3 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 7cd7187a017c..292b4783b1a7 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -782,8 +782,9 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
>  	rcu_read_unlock();
>  }
>
> -void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
> -		     enum node_stat_item idx, int nr)
> +static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
> +				     struct pglist_data *pgdat,
> +				     enum node_stat_item idx, int nr)
>  {
>  	struct mem_cgroup *memcg;
>  	struct lruvec *lruvec;
> @@ -791,7 +792,7 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
>  	rcu_read_lock();
>  	memcg = obj_cgroup_memcg(objcg);
>  	lruvec = mem_cgroup_lruvec(memcg, pgdat);
> -	mod_memcg_lruvec_state(lruvec, idx, nr);
> +	__mod_memcg_lruvec_state(lruvec, idx, nr);
>  	rcu_read_unlock();
>  }
>
> @@ -2059,7 +2060,10 @@ struct memcg_stock_pcp {
>
>  #ifdef CONFIG_MEMCG_KMEM
>  	struct obj_cgroup *cached_objcg;
> +	struct pglist_data *cached_pgdat;

I wonder if we want to have per-node counters instead? That would
complicate the initialization of pcp stocks a bit, but might shave off
some additional cpu time. But we can do it later too. A rough, untested
sketch of what I mean is appended at the end of this mail.

>  	unsigned int nr_bytes;
> +	int nr_slab_reclaimable_b;
> +	int nr_slab_unreclaimable_b;
>  #endif
>
>  	struct work_struct work;
> @@ -3008,6 +3012,63 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
>  	obj_cgroup_put(objcg);
>  }
>
> +void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
> +		     enum node_stat_item idx, int nr)
> +{
> +	struct memcg_stock_pcp *stock;
> +	unsigned long flags;
> +	int *bytes;
> +
> +	local_irq_save(flags);
> +	stock = this_cpu_ptr(&memcg_stock);
> +
> +	/*
> +	 * Save vmstat data in stock and skip vmstat array update unless
> +	 * accumulating over a page of vmstat data or when pgdat or idx
> +	 * changes.
> +	 */
> +	if (stock->cached_objcg != objcg) {
> +		drain_obj_stock(stock);
> +		obj_cgroup_get(objcg);
> +		stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
> +				? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
> +		stock->cached_objcg = objcg;
> +		stock->cached_pgdat = pgdat;
> +	} else if (stock->cached_pgdat != pgdat) {
> +		/* Flush the existing cached vmstat data */
> +		if (stock->nr_slab_reclaimable_b) {
> +			mod_objcg_mlstate(objcg, pgdat, NR_SLAB_RECLAIMABLE_B,
> +					  stock->nr_slab_reclaimable_b);
> +			stock->nr_slab_reclaimable_b = 0;
> +		}
> +		if (stock->nr_slab_unreclaimable_b) {
> +			mod_objcg_mlstate(objcg, pgdat, NR_SLAB_UNRECLAIMABLE_B,
> +					  stock->nr_slab_unreclaimable_b);
> +			stock->nr_slab_unreclaimable_b = 0;
> +		}
> +		stock->cached_pgdat = pgdat;
> +	}
> +
> +	bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
> +					       : &stock->nr_slab_unreclaimable_b;
> +	if (!*bytes) {
> +		*bytes = nr;
> +		nr = 0;
> +	} else {
> +		*bytes += nr;
> +		if (abs(*bytes) > PAGE_SIZE) {
> +			nr = *bytes;
> +			*bytes = 0;
> +		} else {
> +			nr = 0;
> +		}
> +	}

This part is a little bit hard to follow. How about something like this
(completely untested):

{
	int *stocked;

	stocked = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
						 : &stock->nr_slab_unreclaimable_b;
	if (abs(*stocked + nr) > PAGE_SIZE) {
		nr += *stocked;
		*stocked = 0;
	} else {
		*stocked += nr;
		nr = 0;
	}
}

> +	if (nr)
> +		mod_objcg_mlstate(objcg, pgdat, idx, nr);
> +
> +	local_irq_restore(flags);
> +}
> +
>  static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
>  {
>  	struct memcg_stock_pcp *stock;
> @@ -3055,6 +3116,25 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock)
>  		stock->nr_bytes = 0;
>  	}
>
> +	/*
> +	 * Flush the vmstat data in current stock
> +	 */
> +	if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
> +		if (stock->nr_slab_reclaimable_b) {
> +			mod_objcg_mlstate(old, stock->cached_pgdat,
> +					  NR_SLAB_RECLAIMABLE_B,
> +					  stock->nr_slab_reclaimable_b);
> +			stock->nr_slab_reclaimable_b = 0;
> +		}
> +		if (stock->nr_slab_unreclaimable_b) {
> +			mod_objcg_mlstate(old, stock->cached_pgdat,
> +					  NR_SLAB_UNRECLAIMABLE_B,
> +					  stock->nr_slab_unreclaimable_b);
> +			stock->nr_slab_unreclaimable_b = 0;
> +		}
> +		stock->cached_pgdat = NULL;
> +	}
> +
>  	obj_cgroup_put(old);
>  	stock->cached_objcg = NULL;
>  }
> --
> 2.18.1
>
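
To make the per-node counters suggestion above a bit more concrete, here
is a rough and completely untested sketch. The array fields and the
drain_obj_vmstat() helper are made-up names, and a real version would
probably size the arrays by nr_node_ids at stock init time rather than
using MAX_NUMNODES (that is the initialization complication I mentioned):

struct memcg_stock_pcp {
	/* ... existing fields unchanged ... */
#ifdef CONFIG_MEMCG_KMEM
	struct obj_cgroup *cached_objcg;
	unsigned int nr_bytes;
	/*
	 * One cached byte delta per node instead of a single
	 * cached_pgdat, so switching nodes no longer forces a flush.
	 */
	int nr_slab_reclaimable_b[MAX_NUMNODES];
	int nr_slab_unreclaimable_b[MAX_NUMNODES];
#endif
	struct work_struct work;
	/* ... */
};

static void drain_obj_vmstat(struct memcg_stock_pcp *stock,
			     struct obj_cgroup *objcg)
{
	int nid;

	/* Flush all per-node deltas accumulated for this objcg. */
	for_each_node(nid) {
		struct pglist_data *pgdat = NODE_DATA(nid);

		if (stock->nr_slab_reclaimable_b[nid]) {
			mod_objcg_mlstate(objcg, pgdat, NR_SLAB_RECLAIMABLE_B,
					  stock->nr_slab_reclaimable_b[nid]);
			stock->nr_slab_reclaimable_b[nid] = 0;
		}
		if (stock->nr_slab_unreclaimable_b[nid]) {
			mod_objcg_mlstate(objcg, pgdat, NR_SLAB_UNRECLAIMABLE_B,
					  stock->nr_slab_unreclaimable_b[nid]);
			stock->nr_slab_unreclaimable_b[nid] = 0;
		}
	}
}

With something like that, mod_objcg_state() would only index the arrays
by pgdat->node_id and flush from drain_obj_stock() (or when a counter
exceeds PAGE_SIZE), instead of flushing whenever the pgdat changes.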