+ mm-memcontrol-fix-possible-memcg-leak-due-to-interrupted-reclaim.patch added to -mm tree

akpm@xxxxxxxxxxxxxxxxxxxx · Thu, 17 Dec 2015 15:02:35 -0800

The patch titled
     Subject: mm: memcontrol: fix possible memcg leak due to interrupted reclaim
has been added to the -mm tree.  Its filename is
     mm-memcontrol-fix-possible-memcg-leak-due-to-interrupted-reclaim.patch

This patch should soon appear at
    http://ozlabs.org/~akpm/mmots/broken-out/mm-memcontrol-fix-possible-memcg-leak-due-to-interrupted-reclaim.patch
and later at
    http://ozlabs.org/~akpm/mmotm/broken-out/mm-memcontrol-fix-possible-memcg-leak-due-to-interrupted-reclaim.patch

Before you just go and hit "reply", please:
   a) Consider who else should be cc'ed
   b) Prefer to cc a suitable mailing list as well
   c) Ideally: find the original patch on the mailing list and do a
      reply-to-all to that, adding suitable additional cc's

*** Remember to use Documentation/SubmitChecklist when testing your code ***

The -mm tree is included into linux-next and is updated
there every 3-4 working days

------------------------------------------------------
From: Vladimir Davydov <vdavydov@xxxxxxxxxxxxx>
Subject: mm: memcontrol: fix possible memcg leak due to interrupted reclaim

Memory cgroup reclaim can be interrupted with mem_cgroup_iter_break() once
enough pages have been reclaimed, in which case, in contrast to a full
round-trip over a cgroup sub-tree, the current position stored in
mem_cgroup_reclaim_iter of the target cgroup does not get invalidated and
so is left holding the reference to the last scanned cgroup.  If the
target cgroup does not get scanned again (we might have just reclaimed the
last page or all processes might exit and free their memory voluntary), we
will leak it, because there is nobody to put the reference held by the
iterator.

The problem is easy to reproduce by running the following command sequence
in a loop:

    mkdir /sys/fs/cgroup/memory/test
    echo 100M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
    echo $$ > /sys/fs/cgroup/memory/test/cgroup.procs
    memhog 150M
    echo $$ > /sys/fs/cgroup/memory/cgroup.procs
    rmdir test

The cgroups generated by it will never get freed.

This patch fixes this issue by making mem_cgroup_iter avoid taking
reference to the current position.  In order not to hit use-after-free bug
while running reclaim in parallel with cgroup deletion, we make use of
->css_released cgroup callback to clear references to the dying cgroup in
all reclaim iterators that might refer to it.  This callback is called
right before scheduling rcu work which will free css, so if we access
iter->position from rcu read section, we might be sure it won't go away
under us.

Fixes: 5ac8fb31ad2e ("mm: memcontrol: convert reclaim iterator to simple css refcounting")
Signed-off-by: Vladimir Davydov <vdavydov@xxxxxxxxxxxxx>
Acked-by: Michal Hocko <mhocko@xxxxxxxxxx>
Acked-by: Johannes Weiner <hannes@xxxxxxxxxxx>
Cc: <stable@xxxxxxxxxxxxxxx>	[3.19+]
Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
---

 mm/memcontrol.c |   53 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 42 insertions(+), 11 deletions(-)

diff -puN mm/memcontrol.c~mm-memcontrol-fix-possible-memcg-leak-due-to-interrupted-reclaim mm/memcontrol.c

--- a/mm/memcontrol.c~mm-memcontrol-fix-possible-memcg-leak-due-to-interrupted-reclaim
+++ a/mm/memcontrol.c
@@ -903,14 +903,20 @@ struct mem_cgroup *mem_cgroup_iter(struc
 		if (prev && reclaim->generation != iter->generation)
 			goto out_unlock;
 
-		do {
+		while (1) {
 			pos = READ_ONCE(iter->position);
+			if (!pos || css_tryget(&pos->css))
+				break;
 			/*
-			 * A racing update may change the position and
-			 * put the last reference, hence css_tryget(),
-			 * or retry to see the updated position.
+			 * css reference reached zero, so iter->position will
+			 * be cleared by ->css_released. However, we should not
+			 * rely on this happening soon, because ->css_released
+			 * is called from a work queue, and by busy-waiting we
+			 * might block it. So we clear iter->position right
+			 * away.
 			 */
-		} while (pos && !css_tryget(&pos->css));
+			cmpxchg(&iter->position, pos, NULL);
+		}
 	}
 
 	if (pos)
@@ -956,12 +962,7 @@ struct mem_cgroup *mem_cgroup_iter(struc
 	}
 
 	if (reclaim) {
-		if (cmpxchg(&iter->position, pos, memcg) == pos) {
-			if (memcg)
-				css_get(&memcg->css);
-			if (pos)
-				css_put(&pos->css);
-		}
+		cmpxchg(&iter->position, pos, memcg);
 
 		/*
 		 * pairs with css_tryget when dereferencing iter->position
@@ -999,6 +1000,28 @@ void mem_cgroup_iter_break(struct mem_cg
 		css_put(&prev->css);
 }
 
+static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
+{
+	struct mem_cgroup *memcg = dead_memcg;
+	struct mem_cgroup_reclaim_iter *iter;
+	struct mem_cgroup_per_zone *mz;
+	int nid, zid;
+	int i;
+
+	while ((memcg = parent_mem_cgroup(memcg))) {
+		for_each_node(nid) {
+			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+				mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
+				for (i = 0; i <= DEF_PRIORITY; i++) {
+					iter = &mz->iter[i];
+					cmpxchg(&iter->position,
+						dead_memcg, NULL);
+				}
+			}
+		}
+	}
+}
+
 /*
  * Iteration constructs for visiting all cgroups (under a tree).  If
  * loops are exited prematurely (break), mem_cgroup_iter_break() must
@@ -4324,6 +4347,13 @@ static void mem_cgroup_css_offline(struc
 	wb_memcg_offline(memcg);
 }
 
+static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	invalidate_reclaim_iterators(memcg);
+}
+
 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -5185,6 +5215,7 @@ struct cgroup_subsys memory_cgrp_subsys
 	.css_alloc = mem_cgroup_css_alloc,
 	.css_online = mem_cgroup_css_online,
 	.css_offline = mem_cgroup_css_offline,
+	.css_released = mem_cgroup_css_released,
 	.css_free = mem_cgroup_css_free,
 	.css_reset = mem_cgroup_css_reset,
 	.can_attach = mem_cgroup_can_attach,
_

Patches currently in -mm which might be from vdavydov@xxxxxxxxxxxxx are

mm-memcontrol-fix-possible-memcg-leak-due-to-interrupted-reclaim.patch
revert-kernfs-do-not-account-ino_ida-allocations-to-memcg.patch
revert-gfp-add-__gfp_noaccount.patch
memcg-only-account-kmem-allocations-marked-as-__gfp_account.patch
slab-add-slab_account-flag.patch
vmalloc-allow-to-account-vmalloc-to-memcg.patch
account-certain-kmem-allocations-to-memcg.patch
vmscan-do-not-force-scan-file-lru-if-its-absolute-size-is-small.patch
vmscan-do-not-force-scan-file-lru-if-its-absolute-size-is-small-v2.patch
memcg-do-not-allow-to-disable-tcp-accounting-after-limit-is-set.patch
mm-add-page_check_address_transhuge-helper.patch
mm-add-page_check_address_transhuge-helper-fix.patch
mm-memcontrol-allow-to-disable-kmem-accounting-for-cgroup2.patch
net-drop-tcp_memcontrolc.patch
mm-memcontrol-charge-swap-to-cgroup2.patch
mm-vmscan-pass-memcg-to-get_scan_count.patch
mm-memcontrol-replace-mem_cgroup_lruvec_online-with-mem_cgroup_online.patch
swaph-move-memcg-related-stuff-to-the-end-of-the-file.patch
mm-vmscan-do-not-scan-anon-pages-if-memcg-swap-limit-is-hit.patch
mm-free-swap-cache-aggressively-if-memcg-swap-is-full.patch
documentation-cgroup-add-memoryswapcurrentmax-description.patch

--
To unsubscribe from this list: send the line "unsubscribe mm-commits" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html