[PATCH v2 3/5] memcg: make it suck faster

It is an accepted fact that memcg sucks. But can it suck faster? Or, to
put it more fairly, can it at least stop draining everyone's
performance when it is not in use?

This experimental and slightly crude patch demonstrates that we can do
that by using static branches to patch memcg out until the first memcg
comes to life. There are rough edges to be trimmed, and I would
appreciate comments on direction. In particular, the events in the root
are not fired, but I believe this can be done without further problems
by calling a specialized event check from mem_cgroup_newpage_charge().
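
For readers unfamiliar with static branches, here is a minimal sketch
of the pattern the patch relies on. It is not the patch itself:
memcg_first_child_created() is a made-up name standing in for the work
mem_cgroup_css_online() does below.

#include <linux/jump_label.h>

/* Zero-initialized, so the key starts "off". */
struct static_key memcg_in_use_key;

static inline bool mem_cgroup_disabled(void)
{
	/*
	 * static_key_false() compiles to a runtime-patched jump: it
	 * returns false at effectively zero cost until the key is bumped.
	 */
	return !static_key_false(&memcg_in_use_key);
}

/* Hypothetical hook, called when the first child memcg is created. */
static void memcg_first_child_created(void)
{
	/* Patches every static_key_false() site to take the memcg path. */
	static_key_slow_inc(&memcg_in_use_key);
}

Every fast-path hook can then bail out through mem_cgroup_disabled()
for free until a memcg actually exists.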

My goal was to have enough numbers to demonstrate the performance gain
that can come from it. I tested it on a 24-way, 2-socket Intel box with
24 GB of memory. I used Mel Gorman's pft test, which he used to
demonstrate this problem back at the Kernel Summit. There are three
kernels:

nomemcg  : memcg compiled out.
base     : memcg enabled, patch not applied.
bypassed : memcg enabled, patch applied.
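
For reference, the only Kconfig difference among them is the symbol the
diff below tests for; "base" and "bypassed" share an identical config
and differ only in whether this patch is applied:

# nomemcg kernel
# CONFIG_MEMCG is not set

# base and bypassed kernels
CONFIG_MEMCG=y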

                base    bypassed
User          109.12      105.64
System       1646.84     1597.98
Elapsed       229.56      215.76

             nomemcg    bypassed
User          104.35      105.64
System       1578.19     1597.98
Elapsed       212.33      215.76

As one can see, the difference between base and nomemcg in terms of
both system time and elapsed time is quite drastic, and consistent
with the figures shown by Mel Gorman at the Kernel Summit. This is a
~7% drop in performance just from having memcg enabled, and memcg
functions appear heavily in the profiles even though all tasks live in
the root memcg.

With the bypassed kernel, we drop this down to ~1.5%, which starts to
fall within the acceptable range. More investigation is needed to see
whether we can claim that last percent back, but I believe at least
part of it should be recoverable.

Signed-off-by: Glauber Costa <glommer@xxxxxxxxxxxxx>
CC: Michal Hocko <mhocko@xxxxxxx>
CC: Kamezawa Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
CC: Johannes Weiner <hannes@xxxxxxxxxxx>
CC: Mel Gorman <mgorman@xxxxxxx>
CC: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
---
 include/linux/memcontrol.h |  72 ++++++++++++++++----
 mm/memcontrol.c            | 166 +++++++++++++++++++++++++++++++++++++++++----
 mm/page_cgroup.c           |   4 +-
 3 files changed, 216 insertions(+), 26 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d6183f0..009f925 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -42,6 +42,26 @@ struct mem_cgroup_reclaim_cookie {
 };
 
 #ifdef CONFIG_MEMCG
+extern struct static_key memcg_in_use_key;
+
+static inline bool mem_cgroup_subsys_disabled(void)
+{
+	return !!mem_cgroup_subsys.disabled;
+}
+
+static inline bool mem_cgroup_disabled(void)
+{
+	/*
+	 * The key will always be false if the subsystem is disabled, because
+	 * there is no one to bump it up. So this test suffices and we don't
+	 * have to check the subsystem state as well.
+	 */
+	if (!static_key_false(&memcg_in_use_key))
+		return true;
+	return false;
+}
+
+
 /*
  * All "charge" functions with gfp_mask should use GFP_KERNEL or
 * (gfp_mask & GFP_RECLAIM_MASK). In current implementation, memcg doesn't
@@ -53,8 +73,18 @@ struct mem_cgroup_reclaim_cookie {
  * (Of course, if memcg does memory allocation in future, GFP_KERNEL is sane.)
  */
 
-extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm,
+extern int __mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask);
+
+static inline int
+mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm,
+			  gfp_t gfp_mask)
+{
+	if (mem_cgroup_disabled())
+		return 0;
+	return __mem_cgroup_newpage_charge(page, mm, gfp_mask);
+}
+
 /* for swap handling */
 extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
 		struct page *page, gfp_t mask, struct mem_cgroup **memcgp);
@@ -62,8 +92,17 @@ extern void mem_cgroup_commit_charge_swapin(struct page *page,
 					struct mem_cgroup *memcg);
 extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg);
 
-extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
-					gfp_t gfp_mask);
+
+extern int __mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
+				     gfp_t gfp_mask);
+static inline int
+mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
+{
+	if (mem_cgroup_disabled())
+		return 0;
+
+	return __mem_cgroup_cache_charge(page, mm, gfp_mask);
+}
 
 struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
 struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
@@ -72,8 +111,24 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
 extern void mem_cgroup_uncharge_start(void);
 extern void mem_cgroup_uncharge_end(void);
 
-extern void mem_cgroup_uncharge_page(struct page *page);
-extern void mem_cgroup_uncharge_cache_page(struct page *page);
+extern void __mem_cgroup_uncharge_page(struct page *page);
+extern void __mem_cgroup_uncharge_cache_page(struct page *page);
+
+static inline void mem_cgroup_uncharge_page(struct page *page)
+{
+	if (mem_cgroup_disabled())
+		return;
+
+	__mem_cgroup_uncharge_page(page);
+}
+
+static inline void mem_cgroup_uncharge_cache_page(struct page *page)
+{
+	if (mem_cgroup_disabled())
+		return;
+
+	__mem_cgroup_uncharge_cache_page(page);
+}
 
 bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
 				  struct mem_cgroup *memcg);
@@ -128,13 +183,6 @@ extern void mem_cgroup_replace_page_cache(struct page *oldpage,
 extern int do_swap_account;
 #endif
 
-static inline bool mem_cgroup_disabled(void)
-{
-	if (mem_cgroup_subsys.disabled)
-		return true;
-	return false;
-}
-
 void __mem_cgroup_begin_update_page_stat(struct page *page, bool *locked,
 					 unsigned long *flags);
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bfbf1c2..45c1886 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -575,6 +575,9 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 	return (memcg == root_mem_cgroup);
 }
 
+static bool memcg_charges_allowed = false;
+struct static_key memcg_in_use_key;
+
 /* Writing them here to avoid exposing memcg's inner layout */
 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
 
@@ -710,6 +713,7 @@ static void disarm_static_keys(struct mem_cgroup *memcg)
 {
 	disarm_sock_keys(memcg);
 	disarm_kmem_keys(memcg);
+	static_key_slow_dec(&memcg_in_use_key);
 }
 
 static void drain_all_stock_async(struct mem_cgroup *memcg);
@@ -1109,6 +1113,9 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 	if (unlikely(!p))
 		return NULL;
 
+	if (mem_cgroup_disabled())
+		return root_mem_cgroup;
+
 	return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
 }
 
@@ -1157,9 +1164,12 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 	struct mem_cgroup *memcg = NULL;
 	int id = 0;
 
-	if (mem_cgroup_disabled())
+	if (mem_cgroup_subsys_disabled())
 		return NULL;
 
+	if (mem_cgroup_disabled())
+		return root_mem_cgroup;
+
 	if (!root)
 		root = root_mem_cgroup;
 
@@ -1335,6 +1345,20 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
 	memcg = pc->mem_cgroup;
 
 	/*
+	 * Because we lazily enable memcg only after the first child group is
+	 * created, we can have memcg == 0 here. Since page cgroups are created
+	 * with GFP_ZERO, and all page cgroups get a non-zero cgroup attached
+	 * once charged (even if root), we can be sure that this is a
+	 * used-but-not-yet-accounted page (due to laziness). We could get
+	 * around that by scanning all pages on cgroup init, but that is too
+	 * expensive. We may ultimately pay that price, but prefer to just
+	 * defer the update until we get here. We could set PageCgroupUsed
+	 * here, but it won't be that important for the root cgroup.
+	 */
+	if (!memcg && PageLRU(page))
+		pc->mem_cgroup = memcg = root_mem_cgroup;
+
+	/*
 	 * Surreptitiously switch any uncharged offlist page to root:
 	 * an uncharged page off lru does nothing to secure
 	 * its former mem_cgroup from sudden removal.
@@ -3845,11 +3869,18 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 	return 0;
 }
 
-int mem_cgroup_newpage_charge(struct page *page,
+int __mem_cgroup_newpage_charge(struct page *page,
 			      struct mm_struct *mm, gfp_t gfp_mask)
 {
-	if (mem_cgroup_disabled())
+	/*
+	 * This branch is very likely to be taken before the first memcg
+	 * comes in, but the code is patched out then, so we never reach it.
+	 * It only becomes reachable once the code is patched in, and then it
+	 * is unlikely: it only happens during the initial charge move.
+	 */
+	if (unlikely(!memcg_charges_allowed))
 		return 0;
+
 	VM_BUG_ON(page_mapped(page));
 	VM_BUG_ON(page->mapping && !PageAnon(page));
 	VM_BUG_ON(!mm);
@@ -3962,15 +3993,13 @@ void mem_cgroup_commit_charge_swapin(struct page *page,
 					  MEM_CGROUP_CHARGE_TYPE_ANON);
 }
 
-int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
-				gfp_t gfp_mask)
+int __mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
+			      gfp_t gfp_mask)
 {
 	struct mem_cgroup *memcg = NULL;
 	enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
 	int ret;
 
-	if (mem_cgroup_disabled())
-		return 0;
 	if (PageCompound(page))
 		return 0;
 
@@ -4050,9 +4079,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
 	struct page_cgroup *pc;
 	bool anon;
 
-	if (mem_cgroup_disabled())
-		return NULL;
-
 	VM_BUG_ON(PageSwapCache(page));
 
 	if (PageTransHuge(page)) {
@@ -4144,7 +4170,7 @@ unlock_out:
 	return NULL;
 }
 
-void mem_cgroup_uncharge_page(struct page *page)
+void __mem_cgroup_uncharge_page(struct page *page)
 {
 	/* early check. */
 	if (page_mapped(page))
@@ -4155,7 +4181,7 @@ void mem_cgroup_uncharge_page(struct page *page)
 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
 }
 
-void mem_cgroup_uncharge_cache_page(struct page *page)
+void __mem_cgroup_uncharge_cache_page(struct page *page)
 {
 	VM_BUG_ON(page_mapped(page));
 	VM_BUG_ON(page->mapping);
@@ -4220,6 +4246,9 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
 	struct mem_cgroup *memcg;
 	int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
 
+	if (mem_cgroup_disabled())
+		return;
+
 	if (!swapout) /* this was a swap cache but the swap is unused ! */
 		ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
 
@@ -6364,6 +6393,59 @@ free_out:
 	return ERR_PTR(error);
 }
 
+static void memcg_update_root_statistics(void)
+{
+	int cpu;
+	u64 pgin, pgout, faults, mjfaults;
+
+	pgin = pgout = faults = mjfaults = 0;
+	for_each_online_cpu(cpu) {
+		struct vm_event_state *ev = &per_cpu(vm_event_states, cpu);
+		struct mem_cgroup_stat_cpu *memcg_stat;
+
+		memcg_stat = per_cpu_ptr(root_mem_cgroup->stat, cpu);
+
+		memcg_stat->events[MEM_CGROUP_EVENTS_PGPGIN] =
+							ev->event[PGPGIN];
+		memcg_stat->events[MEM_CGROUP_EVENTS_PGPGOUT] =
+							ev->event[PGPGOUT];
+		memcg_stat->events[MEM_CGROUP_EVENTS_PGFAULT] =
+							ev->event[PGFAULT];
+		memcg_stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT] =
+							ev->event[PGMAJFAULT];
+
+		memcg_stat->nr_page_events = ev->event[PGPGIN] +
+					     ev->event[PGPGOUT];
+	}
+
+	root_mem_cgroup->nocpu_base.count[MEM_CGROUP_STAT_RSS] =
+				memcg_read_root_rss();
+	root_mem_cgroup->nocpu_base.count[MEM_CGROUP_STAT_CACHE] =
+				atomic_long_read(&vm_stat[NR_FILE_PAGES]);
+	root_mem_cgroup->nocpu_base.count[MEM_CGROUP_STAT_FILE_MAPPED] =
+				atomic_long_read(&vm_stat[NR_FILE_MAPPED]);
+}
+
+static void memcg_update_root_lru(void)
+{
+	struct zone *zone;
+	struct lruvec *lruvec;
+	struct mem_cgroup_per_zone *mz;
+	enum lru_list lru;
+
+	for_each_populated_zone(zone) {
+		spin_lock_irq(&zone->lru_lock);
+		lruvec = &zone->lruvec;
+		mz = mem_cgroup_zoneinfo(root_mem_cgroup,
+				zone_to_nid(zone), zone_idx(zone));
+
+		for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
+			mz->lru_size[lru] =
+				zone_page_state(zone, NR_LRU_BASE + lru);
+		spin_unlock_irq(&zone->lru_lock);
+	}
+}
+
 static int
 mem_cgroup_css_online(struct cgroup *cont)
 {
@@ -6407,6 +6489,66 @@ mem_cgroup_css_online(struct cgroup *cont)
 	}
 
 	error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
+
+	if (!error) {
+		static_key_slow_inc(&memcg_in_use_key);
+		/*
+		 * The strategy to avoid races here is to let the charges just
+		 * be globally made until we lock the res counter. Since we are
+		 * copying charges from global statistics, it doesn't really
+		 * matter when we do it, as long as we are consistent. So even
+		 * after the code is patched in, they will continue being
+		 * globally charged due to memcg_charges_allowed being set to
+		 * false.
+		 *
+		 * Once we hold the res counter lock, though, we can already
+		 * safely flip it: charging will proceed toward the root
+		 * memcg, but nothing can actually be charged while we hold
+		 * the lock.
+		 *
+		 * This works because the mm stats are only updated after the
+		 * memcg charging succeeds. If we block the charge by holding
+		 * the res_counter lock, no other charges will happen in the
+		 * system until we release it.
+		 *
+		 * The memcg_charges_allowed manipulation is always safe,
+		 * because the write side runs under the memcg_create_mutex.
+		 */
+		if (!memcg_charges_allowed) {
+			struct zone *zone;
+
+			get_online_cpus();
+			spin_lock(&root_mem_cgroup->res.lock);
+
+			memcg_charges_allowed = true;
+
+			root_mem_cgroup->res.usage =
+				mem_cgroup_read_root(RES_USAGE, _MEM);
+			root_mem_cgroup->memsw.usage =
+				mem_cgroup_read_root(RES_USAGE, _MEMSWAP);
+			/*
+			 * The max usage figure is not entirely accurate. The
+			 * memory may have been higher in the past. But since
+			 * we don't track that globally, this is the best we
+			 * can do.
+			 */
+			root_mem_cgroup->res.max_usage =
+					root_mem_cgroup->res.usage;
+			root_mem_cgroup->memsw.max_usage =
+					root_mem_cgroup->memsw.usage;
+
+			memcg_update_root_statistics();
+			memcg_update_root_lru();
+			/*
+			 * We are now 100% consistent and all charges are
+			 * transferred. New charges should reach the
+			 * res_counter directly.
+			 */
+			spin_unlock(&root_mem_cgroup->res.lock);
+			put_online_cpus();
+		}
+	}
+
 	mutex_unlock(&memcg_create_mutex);
 	if (error) {
 		/*
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 6d757e3..a5bd322 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -68,7 +68,7 @@ void __init page_cgroup_init_flatmem(void)
 
 	int nid, fail;
 
-	if (mem_cgroup_disabled())
+	if (mem_cgroup_subsys_disabled())
 		return;
 
 	for_each_online_node(nid)  {
@@ -271,7 +271,7 @@ void __init page_cgroup_init(void)
 	unsigned long pfn;
 	int nid;
 
-	if (mem_cgroup_disabled())
+	if (mem_cgroup_subsys_disabled())
 		return;
 
 	for_each_node_state(nid, N_MEMORY) {
-- 
1.8.1.2
