On Sun, Apr 18, 2021 at 08:00:31PM -0400, Waiman Long wrote:
> Currently, the object stock structure caches either reclaimable vmstat
> bytes or unreclaimable vmstat bytes in its object stock structure. The
> hit rate can be improved if both types of vmstat data can be cached
> especially for single-node system.
>
> This patch supports the cacheing of both type of vmstat data, though
> at the expense of a slightly increased complexity in the caching code.
> For large object (>= PAGE_SIZE), vmstat array is done directly without
> going through the stock caching step.
>
> On a 2-socket Cascade Lake server with instrumentation enabled, the
> miss rates are shown in the table below.
>
> Initial bootup:
>
>   Kernel            __mod_objcg_state    mod_objcg_state    %age
>   ------            -----------------    ---------------    ----
>   Before patch            634400             3243830        19.6%
>   After patch             419810             3182424        13.2%
>
> Parallel kernel build:
>
>   Kernel            __mod_objcg_state    mod_objcg_state    %age
>   ------            -----------------    ---------------    ----
>   Before patch          24329265           142512465        17.1%
>   After patch           24051721           142445825        16.9%
>
> There was a decrease of miss rate after initial system bootup. However,
> the miss rate for parallel kernel build remained about the same probably
> because most of the touched kmemcache objects were reclaimable inodes
> and dentries.
>
> Signed-off-by: Waiman Long <longman@xxxxxxxxxx>
> ---
>  mm/memcontrol.c | 79 +++++++++++++++++++++++++++++++------------------
>  1 file changed, 51 insertions(+), 28 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index c13502eab282..a6dd18f6d8a8 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2212,8 +2212,8 @@ struct obj_stock {
>  	struct obj_cgroup *cached_objcg;
>  	struct pglist_data *cached_pgdat;
>  	unsigned int nr_bytes;
> -	int vmstat_idx;
> -	int vmstat_bytes;
> +	int reclaimable_bytes;		/* NR_SLAB_RECLAIMABLE_B */
> +	int unreclaimable_bytes;	/* NR_SLAB_UNRECLAIMABLE_B */

How about

	int nr_slab_reclaimable_b;
	int nr_slab_unreclaimable_b;

so you don't need the comments?

>  #else
>  	int dummy[0];
>  #endif
> @@ -3217,40 +3217,56 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
>  		     enum node_stat_item idx, int nr)
>  {
>  	unsigned long flags;
> -	struct obj_stock *stock = get_obj_stock(&flags);
> +	struct obj_stock *stock;
> +	int *bytes, *alt_bytes, alt_idx;
> +
> +	/*
> +	 * Directly update vmstat array for big object.
> +	 */
> +	if (unlikely(abs(nr) >= PAGE_SIZE))
> +		goto update_vmstat;

This looks like an optimization independent of the vmstat item split?

> +	stock = get_obj_stock(&flags);
> +	if (idx == NR_SLAB_RECLAIMABLE_B) {
> +		bytes = &stock->reclaimable_bytes;
> +		alt_bytes = &stock->unreclaimable_bytes;
> +		alt_idx = NR_SLAB_UNRECLAIMABLE_B;
> +	} else {
> +		bytes = &stock->unreclaimable_bytes;
> +		alt_bytes = &stock->reclaimable_bytes;
> +		alt_idx = NR_SLAB_RECLAIMABLE_B;
> +	}
>
>  	/*
> -	 * Save vmstat data in stock and skip vmstat array update unless
> -	 * accumulating over a page of vmstat data or when pgdat or idx
> +	 * Try to save vmstat data in stock and skip vmstat array update
> +	 * unless accumulating over a page of vmstat data or when pgdat
>  	 * changes.
>  	 */
>  	if (stock->cached_objcg != objcg) {
>  		/* Output the current data as is */
> -	} else if (!stock->vmstat_bytes) {
> -		/* Save the current data */
> -		stock->vmstat_bytes = nr;
> -		stock->vmstat_idx = idx;
> -		stock->cached_pgdat = pgdat;
> -		nr = 0;
> -	} else if ((stock->cached_pgdat != pgdat) ||
> -		   (stock->vmstat_idx != idx)) {
> -		/* Output the cached data & save the current data */
> -		swap(nr, stock->vmstat_bytes);
> -		swap(idx, stock->vmstat_idx);
> +	} else if (stock->cached_pgdat != pgdat) {
> +		/* Save the current data and output cached data, if any */
> +		swap(nr, *bytes);
>  		swap(pgdat, stock->cached_pgdat);
> +		if (*alt_bytes) {
> +			__mod_objcg_state(objcg, pgdat, alt_idx,
> +					  *alt_bytes);
> +			*alt_bytes = 0;
> +		}

As per the other email, I really don't think optimizing the pgdat
switch (in a percpu cache) is worth this level of complexity.

>  	} else {
> -		stock->vmstat_bytes += nr;
> -		if (abs(stock->vmstat_bytes) > PAGE_SIZE) {
> -			nr = stock->vmstat_bytes;
> -			stock->vmstat_bytes = 0;
> +		*bytes += nr;
> +		if (abs(*bytes) > PAGE_SIZE) {
> +			nr = *bytes;
> +			*bytes = 0;
>  		} else {
>  			nr = 0;
>  		}
>  	}
> -	if (nr)
> -		__mod_objcg_state(objcg, pgdat, idx, nr);
> -
>  	put_obj_stock(flags);
> +	if (!nr)
> +		return;
> +update_vmstat:
> +	__mod_objcg_state(objcg, pgdat, idx, nr);
>  }
>
>  static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
> @@ -3303,12 +3319,19 @@ static void drain_obj_stock(struct obj_stock *stock)
>  	/*
>  	 * Flush the vmstat data in current stock
>  	 */
> -	if (stock->vmstat_bytes) {
> -		__mod_objcg_state(old, stock->cached_pgdat, stock->vmstat_idx,
> -				  stock->vmstat_bytes);
> +	if (stock->reclaimable_bytes || stock->unreclaimable_bytes) {
> +		int bytes;
> +
> +		if ((bytes = stock->reclaimable_bytes))
> +			__mod_objcg_state(old, stock->cached_pgdat,
> +					  NR_SLAB_RECLAIMABLE_B, bytes);
> +		if ((bytes = stock->unreclaimable_bytes))
> +			__mod_objcg_state(old, stock->cached_pgdat,
> +					  NR_SLAB_UNRECLAIMABLE_B, bytes);

The int bytes indirection isn't necessary. It's easier to read even
with the extra lines required to repeat the long stock member names,
because that is quite a common pattern (if (stuff) frob(stuff)).

__mod_objcg_state() also each time does rcu_read_lock() toggling and
a memcg lookup that could be batched, which I think is further proof
that it should just be inlined here.
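
FWIW, this is roughly what I mean - completely untested, keeping the
member names from this patch, and written against the upstream
__mod_memcg_lruvec_state(lruvec, idx, val) helper rather than whatever
shape the wrapper has after this series, so take it as a sketch of the
idea only:

	if (stock->reclaimable_bytes || stock->unreclaimable_bytes) {
		struct mem_cgroup *memcg;
		struct lruvec *lruvec;

		/*
		 * One RCU section and one memcg->lruvec lookup covers
		 * both slab counters, since they share cached_pgdat.
		 */
		rcu_read_lock();
		memcg = obj_cgroup_memcg(old);
		lruvec = mem_cgroup_lruvec(memcg, stock->cached_pgdat);
		if (stock->reclaimable_bytes)
			__mod_memcg_lruvec_state(lruvec, NR_SLAB_RECLAIMABLE_B,
						 stock->reclaimable_bytes);
		if (stock->unreclaimable_bytes)
			__mod_memcg_lruvec_state(lruvec, NR_SLAB_UNRECLAIMABLE_B,
						 stock->unreclaimable_bytes);
		rcu_read_unlock();

		stock->reclaimable_bytes = 0;
		stock->unreclaimable_bytes = 0;
	}

That keeps the if (stuff) frob(stuff) pattern readable and flushes
both counters against the same lruvec under a single rcu_read_lock()
and memcg lookup.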