[patch 01/16] mm: memcontrol: don't batch updates of local VM stats and events

akpm@xxxxxxxxxxxxxxxxxxxx · Thu, 13 Jun 2019 15:55:46 -0700

From: Johannes Weiner <hannes@xxxxxxxxxxx>
Subject: mm: memcontrol: don't batch updates of local VM stats and events

The kernel test robot noticed a 26% will-it-scale pagefault regression
from 42a300353577 ("mm: memcontrol: fix recursive statistics correctness &
scalabilty").  This appears to be caused by bouncing the additional
cachelines from the new hierarchical statistics counters.

We can fix this by getting rid of the batched local counters instead.

Originally, there were *only* group-local counters, and they were fully
maintained per cpu.  A reader of a stats file high up in the cgroup tree
would have to walk the entire subtree and collect each level's per-cpu
counters to get the recursive view.  This was prohibitively expensive, and
so we switched to per-cpu batched updates of the local counters during
a983b5ebee57 ("mm: memcontrol: fix excessive complexity in memory.stat
reporting"), reducing the complexity from nr_subgroups * nr_cpus to
nr_subgroups.

With growing machines and cgroup trees, the tree walk itself became too
expensive for monitoring top-level groups, and this is when the culprit
patch added hierarchy counters on each cgroup level.  When the per-cpu
batch size would be reached, both the local and the hierarchy counters
would get batch-updated from the per-cpu delta simultaneously.

This makes local and hierarchical counter reads blazingly fast, but it
unfortunately makes the write-side too cache line intense.

Since local counter reads were never a problem - we only centralized them
to accelerate the hierarchy walk - and use of the local counters are
becoming rarer due to replacement with hierarchical views (ongoing rework
in the page reclaim and workingset code), we can make those local counters
unbatched per-cpu counters again.

The scheme will then be as such:

   when a memcg statistic changes, the writer will:
   - update the local counter (per-cpu)
   - update the batch counter (per-cpu). If the batch is full:
   - spill the batch into the group's atomic_t
   - spill the batch into all ancestors' atomic_ts
   - empty out the batch counter (per-cpu)

   when a local memcg counter is read, the reader will:
   - collect the local counter from all cpus

   when a hiearchy memcg counter is read, the reader will:
   - read the atomic_t

We might be able to simplify this further and make the recursive counters
unbatched per-cpu counters as well (batch upward propagation, but leave
per-cpu collection to the readers), but that will require a more in-depth
analysis and testing of all the callsites.  Deal with the immediate
regression for now.

Link: http://lkml.kernel.org/r/20190521151647.GB2870@xxxxxxxxxxx
Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty")
Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>
Reported-by: kernel test robot <rong.a.chen@xxxxxxxxx>
Tested-by: kernel test robot <rong.a.chen@xxxxxxxxx>
Cc: Michal Hocko <mhocko@xxxxxxxxxx>
Cc: Shakeel Butt <shakeelb@xxxxxxxxxx>
Cc: Roman Gushchin <guro@xxxxxx>
Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
---

 include/linux/memcontrol.h |   26 +++++++++++++++-------
 mm/memcontrol.c            |   41 +++++++++++++++++++++++------------
 2 files changed, 46 insertions(+), 21 deletions(-)

--- a/include/linux/memcontrol.h~mm-memcontrol-dont-batch-updates-of-local-vm-stats-and-events
+++ a/include/linux/memcontrol.h
@@ -117,9 +117,12 @@ struct memcg_shrinker_map {
 struct mem_cgroup_per_node {
 	struct lruvec		lruvec;
 
+	/* Legacy local VM stats */
+	struct lruvec_stat __percpu *lruvec_stat_local;
+
+	/* Subtree VM stats (batched updates) */
 	struct lruvec_stat __percpu *lruvec_stat_cpu;
 	atomic_long_t		lruvec_stat[NR_VM_NODE_STAT_ITEMS];
-	atomic_long_t		lruvec_stat_local[NR_VM_NODE_STAT_ITEMS];
 
 	unsigned long		lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
 
@@ -265,17 +268,18 @@ struct mem_cgroup {
 	atomic_t		moving_account;
 	struct task_struct	*move_lock_task;
 
-	/* memory.stat */
+	/* Legacy local VM stats and events */
+	struct memcg_vmstats_percpu __percpu *vmstats_local;
+
+	/* Subtree VM stats and events (batched updates) */
 	struct memcg_vmstats_percpu __percpu *vmstats_percpu;
 
 	MEMCG_PADDING(_pad2_);
 
 	atomic_long_t		vmstats[MEMCG_NR_STAT];
-	atomic_long_t		vmstats_local[MEMCG_NR_STAT];
-
 	atomic_long_t		vmevents[NR_VM_EVENT_ITEMS];
-	atomic_long_t		vmevents_local[NR_VM_EVENT_ITEMS];
 
+	/* memory.events */
 	atomic_long_t		memory_events[MEMCG_NR_MEMORY_EVENTS];
 
 	unsigned long		socket_pressure;
@@ -567,7 +571,11 @@ static inline unsigned long memcg_page_s
 static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg,
 						   int idx)
 {
-	long x = atomic_long_read(&memcg->vmstats_local[idx]);
+	long x = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		x += per_cpu(memcg->vmstats_local->stat[idx], cpu);
 #ifdef CONFIG_SMP
 	if (x < 0)
 		x = 0;
@@ -641,13 +649,15 @@ static inline unsigned long lruvec_page_
 						    enum node_stat_item idx)
 {
 	struct mem_cgroup_per_node *pn;
-	long x;
+	long x = 0;
+	int cpu;
 
 	if (mem_cgroup_disabled())
 		return node_page_state(lruvec_pgdat(lruvec), idx);
 
 	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-	x = atomic_long_read(&pn->lruvec_stat_local[idx]);
+	for_each_possible_cpu(cpu)
+		x += per_cpu(pn->lruvec_stat_local->count[idx], cpu);
 #ifdef CONFIG_SMP
 	if (x < 0)
 		x = 0;
--- a/mm/memcontrol.c~mm-memcontrol-dont-batch-updates-of-local-vm-stats-and-events
+++ a/mm/memcontrol.c
@@ -691,11 +691,12 @@ void __mod_memcg_state(struct mem_cgroup
 	if (mem_cgroup_disabled())
 		return;
 
+	__this_cpu_add(memcg->vmstats_local->stat[idx], val);
+
 	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
 	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
 		struct mem_cgroup *mi;
 
-		atomic_long_add(x, &memcg->vmstats_local[idx]);
 		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 			atomic_long_add(x, &mi->vmstats[idx]);
 		x = 0;
@@ -745,11 +746,12 @@ void __mod_lruvec_state(struct lruvec *l
 	__mod_memcg_state(memcg, idx, val);
 
 	/* Update lruvec */
+	__this_cpu_add(pn->lruvec_stat_local->count[idx], val);
+
 	x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
 	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
 		struct mem_cgroup_per_node *pi;
 
-		atomic_long_add(x, &pn->lruvec_stat_local[idx]);
 		for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
 			atomic_long_add(x, &pi->lruvec_stat[idx]);
 		x = 0;
@@ -771,11 +773,12 @@ void __count_memcg_events(struct mem_cgr
 	if (mem_cgroup_disabled())
 		return;
 
+	__this_cpu_add(memcg->vmstats_local->events[idx], count);
+
 	x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
 	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
 		struct mem_cgroup *mi;
 
-		atomic_long_add(x, &memcg->vmevents_local[idx]);
 		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 			atomic_long_add(x, &mi->vmevents[idx]);
 		x = 0;
@@ -790,7 +793,12 @@ static unsigned long memcg_events(struct
 
 static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
 {
-	return atomic_long_read(&memcg->vmevents_local[event]);
+	long x = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		x += per_cpu(memcg->vmstats_local->events[event], cpu);
+	return x;
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
@@ -2191,11 +2199,9 @@ static int memcg_hotplug_cpu_dead(unsign
 			long x;
 
 			x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
-			if (x) {
-				atomic_long_add(x, &memcg->vmstats_local[i]);
+			if (x)
 				for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 					atomic_long_add(x, &memcg->vmstats[i]);
-			}
 
 			if (i >= NR_VM_NODE_STAT_ITEMS)
 				continue;
@@ -2205,12 +2211,10 @@ static int memcg_hotplug_cpu_dead(unsign
 
 				pn = mem_cgroup_nodeinfo(memcg, nid);
 				x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
-				if (x) {
-					atomic_long_add(x, &pn->lruvec_stat_local[i]);
+				if (x)
 					do {
 						atomic_long_add(x, &pn->lruvec_stat[i]);
 					} while ((pn = parent_nodeinfo(pn, nid)));
-				}
 			}
 		}
 
@@ -2218,11 +2222,9 @@ static int memcg_hotplug_cpu_dead(unsign
 			long x;
 
 			x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
-			if (x) {
-				atomic_long_add(x, &memcg->vmevents_local[i]);
+			if (x)
 				for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 					atomic_long_add(x, &memcg->vmevents[i]);
-			}
 		}
 	}
 
@@ -4483,8 +4485,15 @@ static int alloc_mem_cgroup_per_node_inf
 	if (!pn)
 		return 1;
 
+	pn->lruvec_stat_local = alloc_percpu(struct lruvec_stat);
+	if (!pn->lruvec_stat_local) {
+		kfree(pn);
+		return 1;
+	}
+
 	pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
 	if (!pn->lruvec_stat_cpu) {
+		free_percpu(pn->lruvec_stat_local);
 		kfree(pn);
 		return 1;
 	}
@@ -4506,6 +4515,7 @@ static void free_mem_cgroup_per_node_inf
 		return;
 
 	free_percpu(pn->lruvec_stat_cpu);
+	free_percpu(pn->lruvec_stat_local);
 	kfree(pn);
 }
 
@@ -4516,6 +4526,7 @@ static void __mem_cgroup_free(struct mem
 	for_each_node(node)
 		free_mem_cgroup_per_node_info(memcg, node);
 	free_percpu(memcg->vmstats_percpu);
+	free_percpu(memcg->vmstats_local);
 	kfree(memcg);
 }
 
@@ -4544,6 +4555,10 @@ static struct mem_cgroup *mem_cgroup_all
 	if (memcg->id.id < 0)
 		goto fail;
 
+	memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu);
+	if (!memcg->vmstats_local)
+		goto fail;
+
 	memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu);
 	if (!memcg->vmstats_percpu)
 		goto fail;
_