On Thu 29-05-14 12:15:58, Johannes Weiner wrote:
> Due to an old optimization to keep expensive res_counter changes at a
> minimum, the root_mem_cgroup res_counter is never charged; there is no
> limit at that level anyway, and any statistics can be generated on
> demand by summing up the counters of all other cgroups.
> 
> However, with per-cpu charge caches, res_counter operations do not
> even show up in profiles anymore, so this optimization is no longer
> necessary.
> 
> Remove it to simplify the code.
> 
> Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>

OK, I like the resulting code much more.

Acked-by: Michal Hocko <mhocko@xxxxxxxx>

> ---
>  mm/memcontrol.c | 149 ++++++++++++++++----------------------------------------
>  1 file changed, 43 insertions(+), 106 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 184e67cce4e4..84195a80068c 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2573,9 +2573,8 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
>  	unsigned long nr_reclaimed;
>  	unsigned long flags = 0;
>  	unsigned long long size;
> +	int ret = 0;
>  
> -	if (mem_cgroup_is_root(memcg))
> -		goto done;
>  retry:
>  	if (consume_stock(memcg, nr_pages))
>  		goto done;
> @@ -2656,13 +2655,15 @@ nomem:
>  	if (!(gfp_mask & __GFP_NOFAIL))
>  		return -ENOMEM;
>  bypass:
> -	return -EINTR;
> +	memcg = root_mem_cgroup;
> +	ret = -EINTR;
> +	goto retry;
>  
>  done_restock:
>  	if (batch > nr_pages)
>  		refill_stock(memcg, batch - nr_pages);
>  done:
> -	return 0;
> +	return ret;
>  }
>  
>  /**
> @@ -2702,13 +2703,11 @@ static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm,
>  static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
>  				       unsigned int nr_pages)
>  {
> -	if (!mem_cgroup_is_root(memcg)) {
> -		unsigned long bytes = nr_pages * PAGE_SIZE;
> +	unsigned long bytes = nr_pages * PAGE_SIZE;
>  
> -		res_counter_uncharge(&memcg->res, bytes);
> -		if (do_swap_account)
> -			res_counter_uncharge(&memcg->memsw, bytes);
> -	}
> +	res_counter_uncharge(&memcg->res, bytes);
> +	if (do_swap_account)
> +		res_counter_uncharge(&memcg->memsw, bytes);
>  }
>  
>  /*
> @@ -2720,9 +2719,6 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
>  {
>  	unsigned long bytes = nr_pages * PAGE_SIZE;
>  
> -	if (mem_cgroup_is_root(memcg))
> -		return;
> -
>  	res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
>  	if (do_swap_account)
>  		res_counter_uncharge_until(&memcg->memsw,
> @@ -3997,7 +3993,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
>  	 * replacement page, so leave it alone when phasing out the
>  	 * page that is unused after the migration.
>  	 */
> -	if (!end_migration && !mem_cgroup_is_root(memcg))
> +	if (!end_migration)
>  		mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
>  
>  	return memcg;
> @@ -4130,8 +4126,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
>  	 * We uncharge this because swap is freed. This memcg can
>  	 * be obsolete one. We avoid calling css_tryget_online().
>  	 */
> -	if (!mem_cgroup_is_root(memcg))
> -		res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
> +	res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
>  	mem_cgroup_swap_statistics(memcg, false);
>  	css_put(&memcg->css);
>  }
> @@ -4825,78 +4820,24 @@ out:
>  	return retval;
>  }
>  
> -
> -static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
> -					       enum mem_cgroup_stat_index idx)
> -{
> -	struct mem_cgroup *iter;
> -	long val = 0;
> -
> -	/* Per-cpu values can be negative, use a signed accumulator */
> -	for_each_mem_cgroup_tree(iter, memcg)
> -		val += mem_cgroup_read_stat(iter, idx);
> -
> -	if (val < 0) /* race ? */
> -		val = 0;
> -	return val;
> -}
> -
> -static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
> -{
> -	u64 val;
> -
> -	if (!mem_cgroup_is_root(memcg)) {
> -		if (!swap)
> -			return res_counter_read_u64(&memcg->res, RES_USAGE);
> -		else
> -			return res_counter_read_u64(&memcg->memsw, RES_USAGE);
> -	}
> -
> -	/*
> -	 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
> -	 * as well as in MEM_CGROUP_STAT_RSS_HUGE.
> -	 */
> -	val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
> -	val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
> -
> -	if (swap)
> -		val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
> -
> -	return val << PAGE_SHIFT;
> -}
> -
>  static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
> -			       struct cftype *cft)
> +				struct cftype *cft)
>  {
>  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> -	u64 val;
> -	int name;
> -	enum res_type type;
> -
> -	type = MEMFILE_TYPE(cft->private);
> -	name = MEMFILE_ATTR(cft->private);
> +	enum res_type type = MEMFILE_TYPE(cft->private);
> +	int name = MEMFILE_ATTR(cft->private);
>  
>  	switch (type) {
>  	case _MEM:
> -		if (name == RES_USAGE)
> -			val = mem_cgroup_usage(memcg, false);
> -		else
> -			val = res_counter_read_u64(&memcg->res, name);
> -		break;
> +		return res_counter_read_u64(&memcg->res, name);
>  	case _MEMSWAP:
> -		if (name == RES_USAGE)
> -			val = mem_cgroup_usage(memcg, true);
> -		else
> -			val = res_counter_read_u64(&memcg->memsw, name);
> -		break;
> +		return res_counter_read_u64(&memcg->memsw, name);
>  	case _KMEM:
> -		val = res_counter_read_u64(&memcg->kmem, name);
> +		return res_counter_read_u64(&memcg->kmem, name);
>  		break;
>  	default:
>  		BUG();
>  	}
> -
> -	return val;
>  }
>  
>  #ifdef CONFIG_MEMCG_KMEM
> @@ -5376,7 +5317,10 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
>  	if (!t)
>  		goto unlock;
>  
> -	usage = mem_cgroup_usage(memcg, swap);
> +	if (!swap)
> +		usage = res_counter_read_u64(&memcg->res, RES_USAGE);
> +	else
> +		usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
>  
>  	/*
>  	 * current_threshold points to threshold just below or equal to usage.
> @@ -5468,15 +5412,15 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
>  
>  	mutex_lock(&memcg->thresholds_lock);
>  
> -	if (type == _MEM)
> +	if (type == _MEM) {
>  		thresholds = &memcg->thresholds;
> -	else if (type == _MEMSWAP)
> +		usage = res_counter_read_u64(&memcg->res, RES_USAGE);
> +	} else if (type == _MEMSWAP) {
>  		thresholds = &memcg->memsw_thresholds;
> -	else
> +		usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
> +	} else
>  		BUG();
>  
> -	usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
> -
>  	/* Check if a threshold crossed before adding a new one */
>  	if (thresholds->primary)
>  		__mem_cgroup_threshold(memcg, type == _MEMSWAP);
> @@ -5556,18 +5500,19 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
>  	int i, j, size;
>  
>  	mutex_lock(&memcg->thresholds_lock);
> -	if (type == _MEM)
> +
> +	if (type == _MEM) {
>  		thresholds = &memcg->thresholds;
> -	else if (type == _MEMSWAP)
> +		usage = res_counter_read_u64(&memcg->res, RES_USAGE);
> +	} else if (type == _MEMSWAP) {
>  		thresholds = &memcg->memsw_thresholds;
> -	else
> +		usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
> +	} else
>  		BUG();
>  
>  	if (!thresholds->primary)
>  		goto unlock;
>  
> -	usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
> -
>  	/* Check if a threshold crossed before removing */
>  	__mem_cgroup_threshold(memcg, type == _MEMSWAP);
>  
> @@ -6329,9 +6274,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
>  		 * core guarantees its existence.
>  		 */
>  	} else {
> -		res_counter_init(&memcg->res, NULL);
> -		res_counter_init(&memcg->memsw, NULL);
> -		res_counter_init(&memcg->kmem, NULL);
> +		res_counter_init(&memcg->res, &root_mem_cgroup->res);
> +		res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw);
> +		res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
>  		/*
>  		 * Deeper hierachy with use_hierarchy == false doesn't make
>  		 * much sense so let cgroup subsystem know about this
> @@ -6449,11 +6394,6 @@ static int mem_cgroup_do_precharge(unsigned long count)
>  	int batch_count = PRECHARGE_COUNT_AT_ONCE;
>  	struct mem_cgroup *memcg = mc.to;
>  
> -	if (mem_cgroup_is_root(memcg)) {
> -		mc.precharge += count;
> -		/* we don't need css_get for root */
> -		return ret;
> -	}
>  	/* try to charge at once */
>  	if (count > 1) {
>  		struct res_counter *dummy;
> @@ -6769,21 +6709,18 @@ static void __mem_cgroup_clear_mc(void)
>  	/* we must fixup refcnts and charges */
>  	if (mc.moved_swap) {
>  		/* uncharge swap account from the old cgroup */
> -		if (!mem_cgroup_is_root(mc.from))
> -			res_counter_uncharge(&mc.from->memsw,
> -					     PAGE_SIZE * mc.moved_swap);
> +		res_counter_uncharge(&mc.from->memsw,
> +				     PAGE_SIZE * mc.moved_swap);
>  
>  		for (i = 0; i < mc.moved_swap; i++)
>  			css_put(&mc.from->css);
>  
> -		if (!mem_cgroup_is_root(mc.to)) {
> -			/*
> -			 * we charged both to->res and to->memsw, so we should
> -			 * uncharge to->res.
> -			 */
> -			res_counter_uncharge(&mc.to->res,
> -					     PAGE_SIZE * mc.moved_swap);
> -		}
> +		/*
> +		 * we charged both to->res and to->memsw, so we should
> +		 * uncharge to->res.
> +		 */
> +		res_counter_uncharge(&mc.to->res,
> +				     PAGE_SIZE * mc.moved_swap);
>  		/* we've already done css_get(mc.to) */
>  		mc.moved_swap = 0;
>  	}
> -- 
> 1.9.3
> 

-- 
Michal Hocko
SUSE Labs
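As an aside for readers unfamiliar with res_counter hierarchies: the reason RES_USAGE can now be read straight from the root counters is the res_counter_init() hunk above, which gives every first-level group the root counter as its parent, so each charge propagates all the way up and root usage always equals the hierarchy-wide total that mem_cgroup_usage() used to compute by summing per-cpu statistics. What follows is only a minimal userspace sketch of that propagation, not the kernel code; the names merely echo the res_counter API, and limits, locking and failure unwinding are all left out.

#include <stdio.h>

struct res_counter {
	unsigned long long usage;	/* bytes currently charged */
	struct res_counter *parent;	/* NULL only for the root */
};

static void res_counter_init(struct res_counter *rc, struct res_counter *parent)
{
	rc->usage = 0;
	rc->parent = parent;
}

/* A charge walks up the parent chain, so the root sees every charge. */
static void res_counter_charge(struct res_counter *rc, unsigned long long bytes)
{
	for (; rc; rc = rc->parent)
		rc->usage += bytes;
}

int main(void)
{
	struct res_counter root, child_a, child_b;

	res_counter_init(&root, NULL);
	/* After the patch, first-level groups get the root counter as parent. */
	res_counter_init(&child_a, &root);
	res_counter_init(&child_b, &root);

	res_counter_charge(&child_a, 3ULL << 12);	/* 3 pages */
	res_counter_charge(&child_b, 5ULL << 12);	/* 5 pages */

	/* Root usage is read directly instead of summing per-cpu stats. */
	printf("root usage: %llu bytes\n", root.usage);	/* prints 32768 */
	return 0;
}

Built with any C compiler, this prints 32768 bytes (8 pages) for the root after the two child charges, which mirrors what res_counter_read_u64(&memcg->res, RES_USAGE) returns for the root group once charges are no longer bypassed.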