[to-be-updated] writeback-sum-memcg-dirty-counters-as-needed.patch removed from -mm tree

akpm@xxxxxxxxxxxxxxxxxxxx · Wed, 27 Mar 2019 15:27:57 -0700

The patch titled
     Subject: writeback: sum memcg dirty counters as needed
has been removed from the -mm tree.  Its filename was
     writeback-sum-memcg-dirty-counters-as-needed.patch

This patch was dropped because an updated version will be merged

------------------------------------------------------
From: Greg Thelen <gthelen@xxxxxxxxxx>
Subject: writeback: sum memcg dirty counters as needed

Since a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
memory.stat reporting") memcg dirty and writeback counters are managed as:

1) per-memcg per-cpu values in range of [-32..32]
2) per-memcg atomic counter

When a per-cpu counter cannot fit in [-32..32] it's flushed to the atomic.
Stat readers only check the atomic.  Thus readers such as
balance_dirty_pages() may see a nontrivial error margin: 32 pages per cpu.

Assuming 100 cpus:
   4k x86 page_size:  13 MiB error per memcg
  64k ppc page_size: 200 MiB error per memcg

Considering that dirty+writeback are used together for some decisions the
errors double.

This inaccuracy can lead to undeserved oom kills.  One nasty case is when
all per-cpu counters hold positive values offsetting an atomic negative
value (i.e.  per_cpu[*]=32, atomic=n_cpu*-32).  balance_dirty_pages() only
consults the atomic and does not consider throttling the next n_cpu*32
dirty pages.  If the file_lru is in the 13..200 MiB range then there's
absolutely no dirty throttling, which burdens vmscan with only
dirty+writeback pages thus resorting to oom kill.

It could be argued that tiny containers are not supported, but it's more
subtle.  It's the amount the space available for file lru that matters. 
If a container has memory.max-200MiB of non reclaimable memory, then it
will also suffer such oom kills on a 100 cpu machine.

The following test reliably ooms without this patch.  This patch avoids
oom kills.

  $ cat test
  mount -t cgroup2 none /dev/cgroup
  cd /dev/cgroup
  echo +io +memory > cgroup.subtree_control
  mkdir test
  cd test
  echo 10M > memory.max
  (echo $BASHPID > cgroup.procs && exec /memcg-writeback-stress /foo)
  (echo $BASHPID > cgroup.procs && exec dd if=/dev/zero of=/foo bs=2M count=100)

  $ cat memcg-writeback-stress.c
  /*
   * Dirty pages from all but one cpu.
   * Clean pages from the non dirtying cpu.
   * This is to stress per cpu counter imbalance.
   * On a 100 cpu machine:
   * - per memcg per cpu dirty count is 32 pages for each of 99 cpus
   * - per memcg atomic is -99*32 pages
   * - thus the complete dirty limit: sum of all counters 0
   * - balance_dirty_pages() only sees atomic count -99*32 pages, which
   *   it max()s to 0.
   * - So a workload can dirty -99*32 pages before balance_dirty_pages()
   *   cares.
   */
  #define _GNU_SOURCE
  #include <err.h>
  #include <fcntl.h>
  #include <sched.h>
  #include <stdlib.h>
  #include <stdio.h>
  #include <sys/stat.h>
  #include <sys/sysinfo.h>
  #include <sys/types.h>
  #include <unistd.h>

  static char *buf;
  static int bufSize;

  static void set_affinity(int cpu)
  {
  	cpu_set_t affinity;

  	CPU_ZERO(&affinity);
  	CPU_SET(cpu, &affinity);
  	if (sched_setaffinity(0, sizeof(affinity), &affinity))
  		err(1, "sched_setaffinity");
  }

  static void dirty_on(int output_fd, int cpu)
  {
  	int i, wrote;

  	set_affinity(cpu);
  	for (i = 0; i < 32; i++) {
  		for (wrote = 0; wrote < bufSize; ) {
  			int ret = write(output_fd, buf+wrote, bufSize-wrote);
  			if (ret == -1)
  				err(1, "write");
  			wrote += ret;
  		}
  	}
  }

  int main(int argc, char **argv)
  {
  	int cpu, flush_cpu = 1, output_fd;
  	const char *output;

  	if (argc != 2)
  		errx(1, "usage: output_file");

  	output = argv[1];
  	bufSize = getpagesize();
  	buf = malloc(getpagesize());
  	if (buf == NULL)
  		errx(1, "malloc failed");

  	output_fd = open(output, O_CREAT|O_RDWR);
  	if (output_fd == -1)
  		err(1, "open(%s)", output);

  	for (cpu = 0; cpu < get_nprocs(); cpu++) {
  		if (cpu != flush_cpu)
  			dirty_on(output_fd, cpu);
  	}

  	set_affinity(flush_cpu);
  	if (fsync(output_fd))
  		err(1, "fsync(%s)", output);
  	if (close(output_fd))
  		err(1, "close(%s)", output);
  	free(buf);
  }

Make balance_dirty_pages() and wb_over_bg_thresh() work harder to collect
exact per memcg counters when a memcg is close to the throttling/writeback
threshold.  This avoids the aforementioned oom kills.

This does not affect the overhead of memory.stat, which still reads the
single atomic counter.

Why not use percpu_counter?  memcg already handles cpus going offline, so
no need for that overhead from percpu_counter.  And the percpu_counter
spinlocks are more heavyweight than is required.

It probably also makes sense to include exact dirty and writeback counters
in memcg oom reports.  But that is saved for later.

Link: http://lkml.kernel.org/r/20190307165632.35810-1-gthelen@xxxxxxxxxx
Signed-off-by: Greg Thelen <gthelen@xxxxxxxxxx>
Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
Cc: Michal Hocko <mhocko@xxxxxxxxxx>
Cc: Vladimir Davydov <vdavydov.dev@xxxxxxxxx>
Cc: Tejun Heo <tj@xxxxxxxxxx>
Cc: Roman Gushchin <guro@xxxxxx>
Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
---

 include/linux/memcontrol.h |   33 +++++++++++++++++++++++++--------
 mm/memcontrol.c            |   26 ++++++++++++++++++++------
 mm/page-writeback.c        |   27 +++++++++++++++++++++------
 3 files changed, 66 insertions(+), 20 deletions(-)

--- a/include/linux/memcontrol.h~writeback-sum-memcg-dirty-counters-as-needed
+++ a/include/linux/memcontrol.h
@@ -579,6 +579,22 @@ static inline unsigned long memcg_page_s
 }
 
 /* idx can be of type enum memcg_stat_item or node_stat_item */
+static inline unsigned long
+memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
+{
+	long x = atomic_long_read(&memcg->stat[idx]);
+#ifdef CONFIG_SMP
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
+	if (x < 0)
+		x = 0;
+#endif
+	return x;
+}
+
+/* idx can be of type enum memcg_stat_item or node_stat_item */
 static inline void __mod_memcg_state(struct mem_cgroup *memcg,
 				     int idx, int val)
 {
@@ -1238,9 +1254,10 @@ static inline void dec_lruvec_page_state
 #ifdef CONFIG_CGROUP_WRITEBACK
 
 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
-void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
-			 unsigned long *pheadroom, unsigned long *pdirty,
-			 unsigned long *pwriteback);
+unsigned long
+mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
+		    unsigned long *pheadroom, unsigned long *pdirty,
+		    unsigned long *pwriteback, bool exact);
 
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
@@ -1249,12 +1266,12 @@ static inline struct wb_domain *mem_cgro
 	return NULL;
 }
 
-static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
-				       unsigned long *pfilepages,
-				       unsigned long *pheadroom,
-				       unsigned long *pdirty,
-				       unsigned long *pwriteback)
+static inline unsigned long
+mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
+		    unsigned long *pheadroom, unsigned long *pdirty,
+		    unsigned long *pwriteback, bool exact)
 {
+	return 0;
 }
 
 #endif	/* CONFIG_CGROUP_WRITEBACK */
--- a/mm/memcontrol.c~writeback-sum-memcg-dirty-counters-as-needed
+++ a/mm/memcontrol.c
@@ -3889,6 +3889,7 @@ struct wb_domain *mem_cgroup_wb_domain(s
  * @pheadroom: out parameter for number of allocatable pages according to memcg
  * @pdirty: out parameter for number of dirty pages
  * @pwriteback: out parameter for number of pages under writeback
+ * @exact: determines exact counters are required, indicates more work.
  *
  * Determine the numbers of file, headroom, dirty, and writeback pages in
  * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
@@ -3899,18 +3900,29 @@ struct wb_domain *mem_cgroup_wb_domain(s
  * ancestors.  Note that this doesn't consider the actual amount of
  * available memory in the system.  The caller should further cap
  * *@pheadroom accordingly.
+ *
+ * Return value is the error precision associated with *@pdirty
+ * and *@pwriteback.  When @exact is set this a minimal value.
  */
-void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
-			 unsigned long *pheadroom, unsigned long *pdirty,
-			 unsigned long *pwriteback)
+unsigned long
+mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
+		    unsigned long *pheadroom, unsigned long *pdirty,
+		    unsigned long *pwriteback, bool exact)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
 	struct mem_cgroup *parent;
+	unsigned long precision;
 
-	*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
-
+	if (exact) {
+		precision = 0;
+		*pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
+		*pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
+	} else {
+		precision = MEMCG_CHARGE_BATCH * num_online_cpus();
+		*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
+		*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
+	}
 	/* this should eventually include NR_UNSTABLE_NFS */
-	*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
 	*pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
 						     (1 << LRU_ACTIVE_FILE));
 	*pheadroom = PAGE_COUNTER_MAX;
@@ -3922,6 +3934,8 @@ void mem_cgroup_wb_stats(struct bdi_writ
 		*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
 		memcg = parent;
 	}
+
+	return precision;
 }
 
 #else	/* CONFIG_CGROUP_WRITEBACK */
--- a/mm/page-writeback.c~writeback-sum-memcg-dirty-counters-as-needed
+++ a/mm/page-writeback.c
@@ -1612,14 +1612,17 @@ static void balance_dirty_pages(struct b
 		}
 
 		if (mdtc) {
-			unsigned long filepages, headroom, writeback;
+			bool exact = false;
+			unsigned long precision, filepages, headroom, writeback;
 
 			/*
 			 * If @wb belongs to !root memcg, repeat the same
 			 * basic calculations for the memcg domain.
 			 */
-			mem_cgroup_wb_stats(wb, &filepages, &headroom,
-					    &mdtc->dirty, &writeback);
+memcg_stats:
+			precision = mem_cgroup_wb_stats(wb, &filepages,
+							&headroom, &mdtc->dirty,
+							&writeback, exact);
 			mdtc->dirty += writeback;
 			mdtc_calc_avail(mdtc, filepages, headroom);
 
@@ -1634,6 +1637,10 @@ static void balance_dirty_pages(struct b
 				m_dirty = mdtc->dirty;
 				m_thresh = mdtc->thresh;
 				m_bg_thresh = mdtc->bg_thresh;
+				if (abs(m_dirty - mdtc->thresh) < precision) {
+					exact = true;
+					goto memcg_stats;
+				}
 			}
 		}
 
@@ -1947,10 +1954,13 @@ bool wb_over_bg_thresh(struct bdi_writeb
 		return true;
 
 	if (mdtc) {
-		unsigned long filepages, headroom, writeback;
+		bool exact = false;
+		unsigned long precision, filepages, headroom, writeback;
 
-		mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty,
-				    &writeback);
+memcg_stats:
+		precision = mem_cgroup_wb_stats(wb, &filepages, &headroom,
+						&mdtc->dirty, &writeback,
+						exact);
 		mdtc_calc_avail(mdtc, filepages, headroom);
 		domain_dirty_limits(mdtc);	/* ditto, ignore writeback */
 
@@ -1960,6 +1970,11 @@ bool wb_over_bg_thresh(struct bdi_writeb
 		if (wb_stat(wb, WB_RECLAIMABLE) >
 		    wb_calc_thresh(mdtc->wb, mdtc->bg_thresh))
 			return true;
+
+		if (abs(mdtc->dirty - mdtc->bg_thresh) < precision) {
+			exact = true;
+			goto memcg_stats;
+		}
 	}
 
 	return false;
_

Patches currently in -mm which might be from gthelen@xxxxxxxxxx are