On Thu 04-09-14 11:08:46, Johannes Weiner wrote:
[...]
> From 6fa7599054868cd0df940d7b0973dd64f8acb0b5 Mon Sep 17 00:00:00 2001
> From: Johannes Weiner <hannes@xxxxxxxxxxx>
> Date: Thu, 4 Sep 2014 10:04:34 -0400
> Subject: [patch] mm: memcontrol: revert use of root_mem_cgroup res_counter
> 
> Dave Hansen reports a massive scalability regression in an uncontained
> page fault benchmark with more than 30 concurrent threads, which he
> bisected down to 05b843012335 ("mm: memcontrol: use root_mem_cgroup
> res_counter") and pin-pointed on res_counter spinlock contention.
> 
> That change relied on the per-cpu charge caches to mostly swallow the
> res_counter costs, but it's apparent that the caches don't scale yet.
> 
> Revert memcg back to bypassing res_counters on the root level in order
> to restore performance for uncontained workloads.
> 
> Reported-by: Dave Hansen <dave@xxxxxxxx>
> Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>

The revert looks good to me.

Acked-by: Michal Hocko <mhocko@xxxxxxx>
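
For the record, the mechanism is easy to see from the code structure:
once a charge misses the per-cpu stock it falls through to
res_counter_charge() and its spinlock, which 05b843012335 started
taking for root-level (uncontained) charges as well. Below is a
minimal userspace model of the two paths - not kernel code, names are
hypothetical; a mutex stands in for the res_counter spinlock and a
per-thread batch for the per-cpu stock:

/* toy_charge.c - build with: gcc -std=c11 -pthread toy_charge.c */
#include <pthread.h>
#include <stdio.h>

#define CHARGE_BATCH	32	/* pages precharged into the local stock */

struct counter {
	pthread_mutex_t lock;	/* stands in for the res_counter spinlock */
	long usage;
	long limit;
};

static struct counter root = { PTHREAD_MUTEX_INITIALIZER, 0, 1L << 20 };
static _Thread_local long stock;	/* stands in for the per-cpu stock */
static int is_root = 1;			/* stands in for mem_cgroup_is_root() */

static int try_charge_model(long nr_pages)
{
	if (is_root)			/* the bypass this patch restores */
		return 0;

	if (stock >= nr_pages) {	/* fast path: no shared state touched */
		stock -= nr_pages;
		return 0;
	}

	/* slow path: every thread serializes on the same lock */
	pthread_mutex_lock(&root.lock);
	if (root.usage + nr_pages + CHARGE_BATCH > root.limit) {
		pthread_mutex_unlock(&root.lock);
		return -1;
	}
	root.usage += nr_pages + CHARGE_BATCH;	/* overcharge to refill stock */
	pthread_mutex_unlock(&root.lock);
	stock += CHARGE_BATCH;
	return 0;
}

int main(void)
{
	printf("charge: %d\n", try_charge_model(1));
	return 0;
}

With is_root set the shared lock is never touched, which is the
behavior the hunks below restore for root_mem_cgroup.
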
> ---
>  mm/memcontrol.c | 103 ++++++++++++++++++++++++++++++++++++++++++--------------
>  1 file changed, 78 insertions(+), 25 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index ec4dcf1b9562..085dc6d2f876 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2534,6 +2534,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
>  	unsigned long long size;
>  	int ret = 0;
>  
> +	if (mem_cgroup_is_root(memcg))
> +		goto done;
>  retry:
>  	if (consume_stock(memcg, nr_pages))
>  		goto done;
> @@ -2611,9 +2613,7 @@ nomem:
>  	if (!(gfp_mask & __GFP_NOFAIL))
>  		return -ENOMEM;
>  bypass:
> -	memcg = root_mem_cgroup;
> -	ret = -EINTR;
> -	goto retry;
> +	return -EINTR;
>  
>  done_restock:
>  	if (batch > nr_pages)
> @@ -2626,6 +2626,9 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
>  {
>  	unsigned long bytes = nr_pages * PAGE_SIZE;
>  
> +	if (mem_cgroup_is_root(memcg))
> +		return;
> +
>  	res_counter_uncharge(&memcg->res, bytes);
>  	if (do_swap_account)
>  		res_counter_uncharge(&memcg->memsw, bytes);
> @@ -2640,6 +2643,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
>  {
>  	unsigned long bytes = nr_pages * PAGE_SIZE;
>  
> +	if (mem_cgroup_is_root(memcg))
> +		return;
> +
>  	res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
>  	if (do_swap_account)
>  		res_counter_uncharge_until(&memcg->memsw,
> @@ -4093,6 +4099,46 @@ out:
>  	return retval;
>  }
>  
> +static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
> +					       enum mem_cgroup_stat_index idx)
> +{
> +	struct mem_cgroup *iter;
> +	long val = 0;
> +
> +	/* Per-cpu values can be negative, use a signed accumulator */
> +	for_each_mem_cgroup_tree(iter, memcg)
> +		val += mem_cgroup_read_stat(iter, idx);
> +
> +	if (val < 0) /* race ? */
> +		val = 0;
> +	return val;
> +}
> +
> +static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
> +{
> +	u64 val;
> +
> +	if (!mem_cgroup_is_root(memcg)) {
> +		if (!swap)
> +			return res_counter_read_u64(&memcg->res, RES_USAGE);
> +		else
> +			return res_counter_read_u64(&memcg->memsw, RES_USAGE);
> +	}
> +
> +	/*
> +	 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
> +	 * as well as in MEM_CGROUP_STAT_RSS_HUGE.
> +	 */
> +	val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
> +	val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
> +
> +	if (swap)
> +		val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
> +
> +	return val << PAGE_SHIFT;
> +}
> +
> +
>  static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
>  			       struct cftype *cft)
>  {
> @@ -4102,8 +4148,12 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
>  
>  	switch (type) {
>  	case _MEM:
> +		if (name == RES_USAGE)
> +			return mem_cgroup_usage(memcg, false);
>  		return res_counter_read_u64(&memcg->res, name);
>  	case _MEMSWAP:
> +		if (name == RES_USAGE)
> +			return mem_cgroup_usage(memcg, true);
>  		return res_counter_read_u64(&memcg->memsw, name);
>  	case _KMEM:
>  		return res_counter_read_u64(&memcg->kmem, name);
> @@ -4572,10 +4622,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
>  	if (!t)
>  		goto unlock;
>  
> -	if (!swap)
> -		usage = res_counter_read_u64(&memcg->res, RES_USAGE);
> -	else
> -		usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
> +	usage = mem_cgroup_usage(memcg, swap);
>  
>  	/*
>  	 * current_threshold points to threshold just below or equal to usage.
> @@ -4673,10 +4720,10 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
>  
>  	if (type == _MEM) {
>  		thresholds = &memcg->thresholds;
> -		usage = res_counter_read_u64(&memcg->res, RES_USAGE);
> +		usage = mem_cgroup_usage(memcg, false);
>  	} else if (type == _MEMSWAP) {
>  		thresholds = &memcg->memsw_thresholds;
> -		usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
> +		usage = mem_cgroup_usage(memcg, true);
>  	} else
>  		BUG();
>  
> @@ -4762,10 +4809,10 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
>  
>  	if (type == _MEM) {
>  		thresholds = &memcg->thresholds;
> -		usage = res_counter_read_u64(&memcg->res, RES_USAGE);
> +		usage = mem_cgroup_usage(memcg, false);
>  	} else if (type == _MEMSWAP) {
>  		thresholds = &memcg->memsw_thresholds;
> -		usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
> +		usage = mem_cgroup_usage(memcg, true);
>  	} else
>  		BUG();
>  
> @@ -5525,9 +5572,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
>  	 * core guarantees its existence.
>  	 */
>  	} else {
> -		res_counter_init(&memcg->res, &root_mem_cgroup->res);
> -		res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw);
> -		res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
> +		res_counter_init(&memcg->res, NULL);
> +		res_counter_init(&memcg->memsw, NULL);
> +		res_counter_init(&memcg->kmem, NULL);
>  		/*
>  		 * Deeper hierachy with use_hierarchy == false doesn't make
>  		 * much sense so let cgroup subsystem know about this
> @@ -5969,8 +6016,9 @@ static void __mem_cgroup_clear_mc(void)
>  	/* we must fixup refcnts and charges */
>  	if (mc.moved_swap) {
>  		/* uncharge swap account from the old cgroup */
> -		res_counter_uncharge(&mc.from->memsw,
> -				     PAGE_SIZE * mc.moved_swap);
> +		if (!mem_cgroup_is_root(mc.from))
> +			res_counter_uncharge(&mc.from->memsw,
> +					     PAGE_SIZE * mc.moved_swap);
>  
>  		for (i = 0; i < mc.moved_swap; i++)
>  			css_put(&mc.from->css);
> @@ -5979,8 +6027,9 @@ static void __mem_cgroup_clear_mc(void)
>  		 * we charged both to->res and to->memsw, so we should
>  		 * uncharge to->res.
>  		 */
> -		res_counter_uncharge(&mc.to->res,
> -				     PAGE_SIZE * mc.moved_swap);
> +		if (!mem_cgroup_is_root(mc.to))
> +			res_counter_uncharge(&mc.to->res,
> +					     PAGE_SIZE * mc.moved_swap);
>  		/* we've already done css_get(mc.to) */
>  		mc.moved_swap = 0;
>  	}
> @@ -6345,7 +6394,8 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
>  	rcu_read_lock();
>  	memcg = mem_cgroup_lookup(id);
>  	if (memcg) {
> -		res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
> +		if (!mem_cgroup_is_root(memcg))
> +			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
>  		mem_cgroup_swap_statistics(memcg, false);
>  		css_put(&memcg->css);
>  	}
> @@ -6509,12 +6559,15 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
>  {
>  	unsigned long flags;
>  
> -	if (nr_mem)
> -		res_counter_uncharge(&memcg->res, nr_mem * PAGE_SIZE);
> -	if (nr_memsw)
> -		res_counter_uncharge(&memcg->memsw, nr_memsw * PAGE_SIZE);
> -
> -	memcg_oom_recover(memcg);
> +	if (!mem_cgroup_is_root(memcg)) {
> +		if (nr_mem)
> +			res_counter_uncharge(&memcg->res,
> +					     nr_mem * PAGE_SIZE);
> +		if (nr_memsw)
> +			res_counter_uncharge(&memcg->memsw,
> +					     nr_memsw * PAGE_SIZE);
> +		memcg_oom_recover(memcg);
> +	}
>  
>  	local_irq_save(flags);
>  	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
> -- 
> 2.0.4
> 
> 
-- 
Michal Hocko
SUSE Labs