On Tue, Jun 16, 2020 at 07:57:54PM -0700, Shakeel Butt wrote: > On Mon, Jun 8, 2020 at 4:07 PM Roman Gushchin <guro@xxxxxx> wrote: > > > > To implement per-object slab memory accounting, we need to > > convert slab vmstat counters to bytes. Actually, out of > > 4 levels of counters: global, per-node, per-memcg and per-lruvec > > only two last levels will require byte-sized counters. > > It's because global and per-node counters will be counting the > > number of slab pages, and per-memcg and per-lruvec will be > > counting the amount of memory taken by charged slab objects. > > > > Converting all vmstat counters to bytes or even all slab > > counters to bytes would introduce an additional overhead. > > So instead let's store global and per-node counters > > in pages, and memcg and lruvec counters in bytes. > > > > To make the API clean all access helpers (both on the read > > and write sides) are dealing with bytes. > > > > The "dealing with bytes" is only for slab stats or all vmstat stats? Only slab stats as of now. I've sent a percpu memory accounting patchset separately, which will add another byte-sized counter. > > > To avoid back-and-forth conversions a new flavor of read-side > > helpers is introduced, which always returns values in pages: > > node_page_state_pages() and global_node_page_state_pages(). > > > > Actually new helpers are just reading raw values. Old helpers are > > simple wrappers, which will complain on an attempt to read > > byte value, because at the moment no one actually needs bytes. > > > > Thanks to Johannes Weiner for the idea of having the byte-sized API > > on top of the page-sized internal storage. 
> > > > Signed-off-by: Roman Gushchin <guro@xxxxxx> > > Acked-by: Johannes Weiner <hannes@xxxxxxxxxxx> > > Reviewed-by: Vlastimil Babka <vbabka@xxxxxxx> > > --- > > drivers/base/node.c | 2 +- > > include/linux/mmzone.h | 10 ++++++++++ > > include/linux/vmstat.h | 14 +++++++++++++- > > mm/memcontrol.c | 14 ++++++++++---- > > mm/vmstat.c | 30 ++++++++++++++++++++++++++---- > > 5 files changed, 60 insertions(+), 10 deletions(-) > > > > diff --git a/drivers/base/node.c b/drivers/base/node.c > > index 5b02f69769e8..e21e31359297 100644 > > --- a/drivers/base/node.c > > +++ b/drivers/base/node.c > > @@ -513,7 +513,7 @@ static ssize_t node_read_vmstat(struct device *dev, > > > > for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) > > n += sprintf(buf+n, "%s %lu\n", node_stat_name(i), > > - node_page_state(pgdat, i)); > > + node_page_state_pages(pgdat, i)); > > > > return n; > > } > > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > > index c4c37fd12104..fa8eb49d9898 100644 > > --- a/include/linux/mmzone.h > > +++ b/include/linux/mmzone.h > > @@ -206,6 +206,16 @@ enum node_stat_item { > > NR_VM_NODE_STAT_ITEMS > > }; > > > > +/* > > + * Returns true if the value is measured in bytes (most vmstat values are > > + * measured in pages). This defines the API part, the internal representation > > + * might be different. 
> > + */ > > +static __always_inline bool vmstat_item_in_bytes(enum node_stat_item item) > > +{ > > + return false; > > +} > > + > > /* > > * We do arithmetic on the LRU lists in various places in the code, > > * so it is important to keep the active lists LRU_ACTIVE higher in > > diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h > > index aa961088c551..91220ace31da 100644 > > --- a/include/linux/vmstat.h > > +++ b/include/linux/vmstat.h > > @@ -8,6 +8,7 @@ > > #include <linux/vm_event_item.h> > > #include <linux/atomic.h> > > #include <linux/static_key.h> > > +#include <linux/mmdebug.h> > > > > extern int sysctl_stat_interval; > > > > @@ -192,7 +193,8 @@ static inline unsigned long global_zone_page_state(enum zone_stat_item item) > > return x; > > } > > > > -static inline unsigned long global_node_page_state(enum node_stat_item item) > > +static inline > > +unsigned long global_node_page_state_pages(enum node_stat_item item) > > { > > long x = atomic_long_read(&vm_node_stat[item]); > > #ifdef CONFIG_SMP > > @@ -202,6 +204,13 @@ static inline unsigned long global_node_page_state(enum node_stat_item item) > > return x; > > } > > > > +static inline unsigned long global_node_page_state(enum node_stat_item item) > > +{ > > + VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); > > + > > + return global_node_page_state_pages(item); > > +} > > + > > static inline unsigned long zone_page_state(struct zone *zone, > > enum zone_stat_item item) > > { > > @@ -242,9 +251,12 @@ extern unsigned long sum_zone_node_page_state(int node, > > extern unsigned long sum_zone_numa_state(int node, enum numa_stat_item item); > > extern unsigned long node_page_state(struct pglist_data *pgdat, > > enum node_stat_item item); > > +extern unsigned long node_page_state_pages(struct pglist_data *pgdat, > > + enum node_stat_item item); > > #else > > #define sum_zone_node_page_state(node, item) global_zone_page_state(item) > > #define node_page_state(node, item) global_node_page_state(item) > > 
+#define node_page_state_pages(node, item) global_node_page_state_pages(item) > > #endif /* CONFIG_NUMA */ > > > > #ifdef CONFIG_SMP > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > > index e8a91e98556b..07d02e61a73e 100644 > > --- a/mm/memcontrol.c > > +++ b/mm/memcontrol.c > > @@ -681,13 +681,16 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) > > */ > > void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) > > { > > - long x; > > + long x, threshold = MEMCG_CHARGE_BATCH; > > > > if (mem_cgroup_disabled()) > > return; > > > > + if (vmstat_item_in_bytes(idx)) > > + threshold <<= PAGE_SHIFT; > > + > > From the above am I understanding correctly that even after moving to > byte-level accounting, we can still see stats with potential error > limited by (BATCH-1)*PAGE_SIZE*nr_cpus? > > > x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]); > > - if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { > > + if (unlikely(abs(x) > threshold)) { > > struct mem_cgroup *mi; > > > > /* > > @@ -718,7 +721,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, > > { > > struct mem_cgroup_per_node *pn; > > struct mem_cgroup *memcg; > > - long x; > > + long x, threshold = MEMCG_CHARGE_BATCH; > > > > pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); > > memcg = pn->memcg; > > @@ -729,8 +732,11 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, > > /* Update lruvec */ > > __this_cpu_add(pn->lruvec_stat_local->count[idx], val); > > > > + if (vmstat_item_in_bytes(idx)) > > + threshold <<= PAGE_SHIFT; > > + > > x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]); > > - if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { > > + if (unlikely(abs(x) > threshold)) { > > pg_data_t *pgdat = lruvec_pgdat(lruvec); > > struct mem_cgroup_per_node *pi; > > > > diff --git a/mm/vmstat.c b/mm/vmstat.c > > index 80c9b6221535..f1c321e1d6d3 100644 > > --- a/mm/vmstat.c > > +++ b/mm/vmstat.c 
> > @@ -341,6 +341,11 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, > > long x; > > long t; > > > > + if (vmstat_item_in_bytes(item)) { > > + VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); > > + delta >>= PAGE_SHIFT; > > + } > > + > > x = delta + __this_cpu_read(*p); > > > > t = __this_cpu_read(pcp->stat_threshold); > > @@ -398,6 +403,8 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) > > s8 __percpu *p = pcp->vm_node_stat_diff + item; > > s8 v, t; > > > > + VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); > > + > > v = __this_cpu_inc_return(*p); > > t = __this_cpu_read(pcp->stat_threshold); > > if (unlikely(v > t)) { > > @@ -442,6 +449,8 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) > > s8 __percpu *p = pcp->vm_node_stat_diff + item; > > s8 v, t; > > > > + VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); > > + > > v = __this_cpu_dec_return(*p); > > t = __this_cpu_read(pcp->stat_threshold); > > if (unlikely(v < - t)) { > > @@ -541,6 +550,11 @@ static inline void mod_node_state(struct pglist_data *pgdat, > > s8 __percpu *p = pcp->vm_node_stat_diff + item; > > long o, n, t, z; > > > > + if (vmstat_item_in_bytes(item)) { > > + VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); > > + delta >>= PAGE_SHIFT; > > + } > > + > > do { > > z = 0; /* overflow to node counters */ > > > > @@ -989,8 +1003,8 @@ unsigned long sum_zone_numa_state(int node, > > /* > > * Determine the per node value of a stat item. 
> > */ > > -unsigned long node_page_state(struct pglist_data *pgdat, > > - enum node_stat_item item) > > +unsigned long node_page_state_pages(struct pglist_data *pgdat, > > + enum node_stat_item item) > > { > > long x = atomic_long_read(&pgdat->vm_stat[item]); > > #ifdef CONFIG_SMP > > @@ -999,6 +1013,14 @@ unsigned long node_page_state(struct pglist_data *pgdat, > > #endif > > return x; > > } > > + > > +unsigned long node_page_state(struct pglist_data *pgdat, > > + enum node_stat_item item) > > +{ > > + VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); > > + > > + return node_page_state_pages(pgdat, item); > > +} > > So, for non-slab, node_page_state and node_page_state_pages will be > the same but different for slab vmstats. However we should not be > calling node_page_state with slab vmstats because we don't need it, > right? Right.