[patch 032/142] slab: link memcg kmem_caches on their associated memory cgroup

akpm@xxxxxxxxxxxxxxxxxxxx · Wed, 22 Feb 2017 15:41:21 -0800

From: Tejun Heo <tj@xxxxxxxxxx>
Subject: slab: link memcg kmem_caches on their associated memory cgroup

With kmem cgroup support enabled, kmem_caches can be created and destroyed
frequently and a great number of near empty kmem_caches can accumulate if
there are a lot of transient cgroups and the system is not under memory
pressure.  When memory reclaim starts under such conditions, it can lead
to consecutive deactivation and destruction of many kmem_caches, easily
hundreds of thousands on moderately large systems, exposing scalability
issues in the current slab management code.  This is one of the patches to
address the issue.

While a memcg kmem_cache is listed on its root cache's ->children list,
there is no direct way to iterate all kmem_caches which are assocaited
with a memory cgroup.  The only way to iterate them is walking all caches
while filtering out caches which don't match, which would be most of them.

This makes memcg destruction operations O(N^2) where N is the total number
of slab caches which can be huge.  This combined with the synchronous RCU
operations can tie up a CPU and affect the whole machine for many hours
when memory reclaim triggers offlining and destruction of the stale
memcgs.

This patch adds mem_cgroup->kmem_caches list which goes through
memcg_cache_params->kmem_caches_node of all kmem_caches which are
associated with the memcg.  All memcg specific iterations, including stat
file access, are updated to use the new list instead.

Link: http://lkml.kernel.org/r/20170117235411.9408-6-tj@xxxxxxxxxx
Signed-off-by: Tejun Heo <tj@xxxxxxxxxx>
Reported-by: Jay Vana <jsvana@xxxxxx>
Acked-by: Vladimir Davydov <vdavydov.dev@xxxxxxxxx>
Cc: Christoph Lameter <cl@xxxxxxxxx>
Cc: Pekka Enberg <penberg@xxxxxxxxxx>
Cc: David Rientjes <rientjes@xxxxxxxxxx>
Cc: Joonsoo Kim <iamjoonsoo.kim@xxxxxxx>
Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
---

 include/linux/memcontrol.h |    1 
 include/linux/slab.h       |    3 ++
 mm/memcontrol.c            |    7 +++---
 mm/slab.h                  |    3 ++
 mm/slab_common.c           |   36 ++++++++++++++++++++++++++++-------
 5 files changed, 40 insertions(+), 10 deletions(-)

diff -puN include/linux/memcontrol.h~slab-link-memcg-kmem_caches-on-their-associated-memory-cgroup include/linux/memcontrol.h

--- a/include/linux/memcontrol.h~slab-link-memcg-kmem_caches-on-their-associated-memory-cgroup
+++ a/include/linux/memcontrol.h
@@ -253,6 +253,7 @@ struct mem_cgroup {
         /* Index in the kmem_cache->memcg_params.memcg_caches array */
 	int kmemcg_id;
 	enum memcg_kmem_state kmem_state;
+	struct list_head kmem_caches;
 #endif
 
 	int last_scanned_node;
diff -puN include/linux/slab.h~slab-link-memcg-kmem_caches-on-their-associated-memory-cgroup include/linux/slab.h
--- a/include/linux/slab.h~slab-link-memcg-kmem_caches-on-their-associated-memory-cgroup
+++ a/include/linux/slab.h
@@ -565,6 +565,8 @@ struct memcg_cache_array {
  * @memcg:	Pointer to the memcg this cache belongs to.
  *
  * @children_node: List node for @root_cache->children list.
+ *
+ * @kmem_caches_node: List node for @memcg->kmem_caches list.
  */
 struct memcg_cache_params {
 	struct kmem_cache *root_cache;
@@ -576,6 +578,7 @@ struct memcg_cache_params {
 		struct {
 			struct mem_cgroup *memcg;
 			struct list_head children_node;
+			struct list_head kmem_caches_node;
 		};
 	};
 };
diff -puN mm/memcontrol.c~slab-link-memcg-kmem_caches-on-their-associated-memory-cgroup mm/memcontrol.c
--- a/mm/memcontrol.c~slab-link-memcg-kmem_caches-on-their-associated-memory-cgroup
+++ a/mm/memcontrol.c
@@ -2837,6 +2837,7 @@ static int memcg_online_kmem(struct mem_
 	 */
 	memcg->kmemcg_id = memcg_id;
 	memcg->kmem_state = KMEM_ONLINE;
+	INIT_LIST_HEAD(&memcg->kmem_caches);
 
 	return 0;
 }
@@ -4002,9 +4003,9 @@ static struct cftype mem_cgroup_legacy_f
 #ifdef CONFIG_SLABINFO
 	{
 		.name = "kmem.slabinfo",
-		.seq_start = slab_start,
-		.seq_next = slab_next,
-		.seq_stop = slab_stop,
+		.seq_start = memcg_slab_start,
+		.seq_next = memcg_slab_next,
+		.seq_stop = memcg_slab_stop,
 		.seq_show = memcg_slab_show,
 	},
 #endif
diff -puN mm/slab.h~slab-link-memcg-kmem_caches-on-their-associated-memory-cgroup mm/slab.h
--- a/mm/slab.h~slab-link-memcg-kmem_caches-on-their-associated-memory-cgroup
+++ a/mm/slab.h
@@ -494,6 +494,9 @@ static inline struct kmem_cache_node *ge
 void *slab_start(struct seq_file *m, loff_t *pos);
 void *slab_next(struct seq_file *m, void *p, loff_t *pos);
 void slab_stop(struct seq_file *m, void *p);
+void *memcg_slab_start(struct seq_file *m, loff_t *pos);
+void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos);
+void memcg_slab_stop(struct seq_file *m, void *p);
 int memcg_slab_show(struct seq_file *m, void *p);
 
 void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr);
diff -puN mm/slab_common.c~slab-link-memcg-kmem_caches-on-their-associated-memory-cgroup mm/slab_common.c
--- a/mm/slab_common.c~slab-link-memcg-kmem_caches-on-their-associated-memory-cgroup
+++ a/mm/slab_common.c
@@ -154,6 +154,7 @@ static int init_memcg_params(struct kmem
 		s->memcg_params.root_cache = root_cache;
 		s->memcg_params.memcg = memcg;
 		INIT_LIST_HEAD(&s->memcg_params.children_node);
+		INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node);
 		return 0;
 	}
 
@@ -224,6 +225,7 @@ int memcg_update_all_caches(int num_memc
 static void unlink_memcg_cache(struct kmem_cache *s)
 {
 	list_del(&s->memcg_params.children_node);
+	list_del(&s->memcg_params.kmem_caches_node);
 }
 #else
 static inline int init_memcg_params(struct kmem_cache *s,
@@ -596,6 +598,7 @@ void memcg_create_kmem_cache(struct mem_
 
 	list_add(&s->memcg_params.children_node,
 		 &root_cache->memcg_params.children);
+	list_add(&s->memcg_params.kmem_caches_node, &memcg->kmem_caches);
 
 	/*
 	 * Since readers won't lock (see cache_from_memcg_idx()), we need a
@@ -651,9 +654,8 @@ void memcg_destroy_kmem_caches(struct me
 	get_online_mems();
 
 	mutex_lock(&slab_mutex);
-	list_for_each_entry_safe(s, s2, &slab_caches, list) {
-		if (is_root_cache(s) || s->memcg_params.memcg != memcg)
-			continue;
+	list_for_each_entry_safe(s, s2, &memcg->kmem_caches,
+				 memcg_params.kmem_caches_node) {
 		/*
 		 * The cgroup is about to be freed and therefore has no charges
 		 * left. Hence, all its caches must be empty by now.
@@ -1201,15 +1203,35 @@ static int slab_show(struct seq_file *m,
 }
 
 #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+void *memcg_slab_start(struct seq_file *m, loff_t *pos)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+
+	mutex_lock(&slab_mutex);
+	return seq_list_start(&memcg->kmem_caches, *pos);
+}
+
+void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+
+	return seq_list_next(p, &memcg->kmem_caches, pos);
+}
+
+void memcg_slab_stop(struct seq_file *m, void *p)
+{
+	mutex_unlock(&slab_mutex);
+}
+
 int memcg_slab_show(struct seq_file *m, void *p)
 {
-	struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
+	struct kmem_cache *s = list_entry(p, struct kmem_cache,
+					  memcg_params.kmem_caches_node);
 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
 
-	if (p == slab_caches.next)
+	if (p == memcg->kmem_caches.next)
 		print_slabinfo_header(m);
-	if (!is_root_cache(s) && s->memcg_params.memcg == memcg)
-		cache_show(s, m);
+	cache_show(s, m);
 	return 0;
 }
 #endif
_
--
To unsubscribe from this list: send the line "unsubscribe mm-commits" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html