Re: [patch] mm: memcontrol: support transparent huge pages under pressure

Greg Thelen <gthelen@xxxxxxxxxx> · Mon, 22 Sep 2014 22:52:50 -0700

On Fri, Sep 19 2014, Johannes Weiner wrote:

> In a memcg with even just moderate cache pressure, success rates for
> transparent huge page allocations drop to zero, wasting a lot of
> effort that the allocator puts into assembling these pages.
>
> The reason for this is that the memcg reclaim code was never designed
> for higher-order charges.  It reclaims in small batches until there is
> room for at least one page.  Huge pages charges only succeed when
> these batches add up over a series of huge faults, which is unlikely
> under any significant load involving order-0 allocations in the group.
>
> Remove that loop on the memcg side in favor of passing the actual
> reclaim goal to direct reclaim, which is already set up and optimized
> to meet higher-order goals efficiently.
>
> This brings memcg's THP policy in line with the system policy: if the
> allocator painstakingly assembles a hugepage, memcg will at least make
> an honest effort to charge it.  As a result, transparent hugepage
> allocation rates amid cache activity are drastically improved:
>
>                                       vanilla                 patched
> pgalloc                 4717530.80 (  +0.00%)   4451376.40 (  -5.64%)
> pgfault                  491370.60 (  +0.00%)    225477.40 ( -54.11%)
> pgmajfault                    2.00 (  +0.00%)         1.80 (  -6.67%)
> thp_fault_alloc               0.00 (  +0.00%)       531.60 (+100.00%)
> thp_fault_fallback          749.00 (  +0.00%)       217.40 ( -70.88%)
>
> [ Note: this may in turn increase memory consumption from internal
>   fragmentation, which is an inherent risk of transparent hugepages.
>   Some setups may have to adjust the memcg limits accordingly to
>   accomodate this - or, if the machine is already packed to capacity,
>   disable the transparent huge page feature. ]

We're using an earlier version of this patch, so I approve of the
general direction.  But I have some feedback.

The memsw aspect of this change seems somewhat separate.  Can it be
split into a different patch?

The memsw aspect of this patch seems to change behavior.  Is this
intended?  If so, a mention of it in the commit log would assuage the
reader.  I'll explain...  Assume a machine with swap enabled and
res.limit==memsw.limit, thus memsw_is_minimum is true.  My understanding
is that memsw.usage represents sum(ram_usage, swap_usage).  So when
memsw_is_minimum=true, then both swap_usage=0 and
memsw.usage==res.usage.  In this condition, if res usage is at limit
then there's no point in swapping because memsw.usage is already
maximal.  Prior to this patch I think the kernel did the right thing,
but not afterwards.

Before this patch:
  if res.usage == res.limit, try_charge() indirectly calls
  try_to_free_mem_cgroup_pages(noswap=true)

After this patch:
  if res.usage == res.limit, try_charge() calls
  try_to_free_mem_cgroup_pages(may_swap=true)

Notice the inverted swap-is-allowed value.

I haven't had time to look at your other outstanding memcg patches.
These comments were made with this patch in isolation.

> Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>
> ---
>  include/linux/swap.h |  6 ++--
>  mm/memcontrol.c      | 86 +++++++++++-----------------------------------------
>  mm/vmscan.c          |  7 +++--
>  3 files changed, 25 insertions(+), 74 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index ea4f926e6b9b..37a585beef5c 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -327,8 +327,10 @@ extern void lru_cache_add_active_or_unevictable(struct page *page,
>  extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
>  					gfp_t gfp_mask, nodemask_t *mask);
>  extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
> -extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
> -						  gfp_t gfp_mask, bool noswap);
> +extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
> +						  unsigned long nr_pages,
> +						  gfp_t gfp_mask,
> +						  bool may_swap);
>  extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
>  						gfp_t gfp_mask, bool noswap,
>  						struct zone *zone,
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 9431024e490c..e2def11f1ec1 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -315,9 +315,6 @@ struct mem_cgroup {
>  	/* OOM-Killer disable */
>  	int		oom_kill_disable;
>  
> -	/* set when res.limit == memsw.limit */
> -	bool		memsw_is_minimum;
> -
>  	/* protect arrays of thresholds */
>  	struct mutex thresholds_lock;
>  
> @@ -481,14 +478,6 @@ enum res_type {
>  #define OOM_CONTROL		(0)
>  
>  /*
> - * Reclaim flags for mem_cgroup_hierarchical_reclaim
> - */
> -#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
> -#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
> -#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
> -#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
> -
> -/*
>   * The memcg_create_mutex will be held whenever a new cgroup is created.
>   * As a consequence, any change that needs to protect against new child cgroups
>   * appearing has to hold it as well.
> @@ -1794,42 +1783,6 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
>  			 NULL, "Memory cgroup out of memory");
>  }
>  
> -static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
> -					gfp_t gfp_mask,
> -					unsigned long flags)
> -{
> -	unsigned long total = 0;
> -	bool noswap = false;
> -	int loop;
> -
> -	if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
> -		noswap = true;
> -	if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
> -		noswap = true;
> -
> -	for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
> -		if (loop)
> -			drain_all_stock_async(memcg);
> -		total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
> -		/*
> -		 * Allow limit shrinkers, which are triggered directly
> -		 * by userspace, to catch signals and stop reclaim
> -		 * after minimal progress, regardless of the margin.
> -		 */
> -		if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
> -			break;
> -		if (mem_cgroup_margin(memcg))
> -			break;
> -		/*
> -		 * If nothing was reclaimed after two attempts, there
> -		 * may be no reclaimable pages in this hierarchy.
> -		 */
> -		if (loop && !total)
> -			break;
> -	}
> -	return total;
> -}
> -
>  /**
>   * test_mem_cgroup_node_reclaimable
>   * @memcg: the target memcg
> @@ -2532,8 +2485,9 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
>  	struct mem_cgroup *mem_over_limit;
>  	struct res_counter *fail_res;
>  	unsigned long nr_reclaimed;
> -	unsigned long flags = 0;
>  	unsigned long long size;
> +	bool may_swap = true;
> +	bool drained = false;
>  	int ret = 0;
>  
>  	if (mem_cgroup_is_root(memcg))
> @@ -2550,7 +2504,7 @@ retry:
>  			goto done_restock;
>  		res_counter_uncharge(&memcg->res, size);
>  		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
> -		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
> +		may_swap = false;
>  	} else
>  		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
>  
> @@ -2576,11 +2530,18 @@ retry:
>  	if (!(gfp_mask & __GFP_WAIT))
>  		goto nomem;
>  
> -	nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
> +	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
> +						    gfp_mask, may_swap);
>  
>  	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
>  		goto retry;
>  
> +	if (!drained) {
> +		drain_all_stock_async(mem_over_limit);
> +		drained = true;
> +		goto retry;
> +	}
> +
>  	if (gfp_mask & __GFP_NORETRY)
>  		goto nomem;
>  	/*
> @@ -3655,19 +3616,13 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
>  			enlarge = 1;
>  
>  		ret = res_counter_set_limit(&memcg->res, val);
> -		if (!ret) {
> -			if (memswlimit == val)
> -				memcg->memsw_is_minimum = true;
> -			else
> -				memcg->memsw_is_minimum = false;
> -		}
>  		mutex_unlock(&set_limit_mutex);
>  
>  		if (!ret)
>  			break;
>  
> -		mem_cgroup_reclaim(memcg, GFP_KERNEL,
> -				   MEM_CGROUP_RECLAIM_SHRINK);
> +		try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
> +
>  		curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
>  		/* Usage is reduced ? */
>  		if (curusage >= oldusage)
> @@ -3714,20 +3669,13 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
>  		if (memswlimit < val)
>  			enlarge = 1;
>  		ret = res_counter_set_limit(&memcg->memsw, val);
> -		if (!ret) {
> -			if (memlimit == val)
> -				memcg->memsw_is_minimum = true;
> -			else
> -				memcg->memsw_is_minimum = false;
> -		}
>  		mutex_unlock(&set_limit_mutex);
>  
>  		if (!ret)
>  			break;
>  
> -		mem_cgroup_reclaim(memcg, GFP_KERNEL,
> -				   MEM_CGROUP_RECLAIM_NOSWAP |
> -				   MEM_CGROUP_RECLAIM_SHRINK);
> +		try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
> +
>  		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
>  		/* Usage is reduced ? */
>  		if (curusage >= oldusage)
> @@ -3976,8 +3924,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
>  		if (signal_pending(current))
>  			return -EINTR;
>  
> -		progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
> -						false);
> +		progress = try_to_free_mem_cgroup_pages(memcg, 1,
> +							GFP_KERNEL, true);
>  		if (!progress) {
>  			nr_retries--;
>  			/* maybe some writeback is necessary */
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index b672e2c6becc..97d31ec17d06 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2759,21 +2759,22 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
>  }
>  
>  unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
> +					   unsigned long nr_pages,
>  					   gfp_t gfp_mask,
> -					   bool noswap)
> +					   bool may_swap)
>  {
>  	struct zonelist *zonelist;
>  	unsigned long nr_reclaimed;
>  	int nid;
>  	struct scan_control sc = {
> -		.nr_to_reclaim = SWAP_CLUSTER_MAX,
> +		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
>  		.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
>  				(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
>  		.target_mem_cgroup = memcg,
>  		.priority = DEF_PRIORITY,
>  		.may_writepage = !laptop_mode,
>  		.may_unmap = 1,
> -		.may_swap = !noswap,
> +		.may_swap = may_swap,
>  	};
>  
>  	/*

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>