Re: [PATCH] mm: thp: move deferred split queue to memcg's nodeinfo

Michal Hocko <mhocko@xxxxxxxxxx> · Wed, 2 Oct 2019 10:43:04 +0200

On Wed 02-10-19 06:16:43, Yang Shi wrote:
> The commit 87eaceb3faa59b9b4d940ec9554ce251325d83fe ("mm: thp: make
> deferred split shrinker memcg aware") makes deferred split queue per
> memcg to resolve memcg pre-mature OOM problem.  But, all nodes end up
> sharing the same queue instead of one queue per-node before the commit.
> It is not a big deal for memcg limit reclaim, but it may cause global
> kswapd shrink THPs from a different node.
> 
> And, 0-day testing reported -19.6% regression of stress-ng's madvise
> test [1].  I didn't see that much regression on my test box (24 threads,
> 48GB memory, 2 nodes), with the same test (stress-ng --timeout 1
> --metrics-brief --sequential 72  --class vm --exclude spawn,exec), I saw
> average -3% (run the same test 10 times then calculate the average since
> the test itself may have most 15% variation according to my test)
> regression sometimes (not every time, sometimes I didn't see regression
> at all).
> 
> This might be caused by deferred split queue lock contention.  With some
> configuration (i.e. just one root memcg) the lock contention my be worse
> than before (given 2 nodes, two locks are reduced to one lock).
> 
> So, moving deferred split queue to memcg's nodeinfo to make it NUMA
> aware again.
> 
> With this change stress-ng's madvise test shows average 4% improvement
> sometimes and I didn't see degradation anymore.

My concern about this getting more and more complex
(http://lkml.kernel.org/r/20191002084014.GH15624@xxxxxxxxxxxxxx) holds
here even more. Can we step back and reconsider the whole thing please?

> [1]: https://lore.kernel.org/lkml/20190930084604.GC17687@shao2-debian/
> 
> Cc: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx>
> Cc: Kirill Tkhai <ktkhai@xxxxxxxxxxxxx>
> Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
> Cc: Michal Hocko <mhocko@xxxxxxxx>
> Cc: Hugh Dickins <hughd@xxxxxxxxxx>
> Cc: Shakeel Butt <shakeelb@xxxxxxxxxx>
> Cc: David Rientjes <rientjes@xxxxxxxxxx>
> Signed-off-by: Yang Shi <yang.shi@xxxxxxxxxxxxxxxxx>
> ---
>  include/linux/memcontrol.h |  8 ++++----
>  mm/huge_memory.c           | 15 +++++++++------
>  mm/memcontrol.c            | 29 +++++++++++++++++------------
>  3 files changed, 30 insertions(+), 22 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 9b60863..4b5c791 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -137,6 +137,10 @@ struct mem_cgroup_per_node {
>  	bool			congested;	/* memcg has many dirty pages */
>  						/* backed by a congested BDI */
>  
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +	struct deferred_split deferred_split_queue;
> +#endif
> +
>  	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
>  						/* use container_of	   */
>  };
> @@ -330,10 +334,6 @@ struct mem_cgroup {
>  	struct list_head event_list;
>  	spinlock_t event_list_lock;
>  
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -	struct deferred_split deferred_split_queue;
> -#endif
> -
>  	struct mem_cgroup_per_node *nodeinfo[0];
>  	/* WARNING: nodeinfo must be the last member here */
>  };
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index c5cb6dc..3b78910 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -500,10 +500,11 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
>  static inline struct deferred_split *get_deferred_split_queue(struct page *page)
>  {
>  	struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
> -	struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
> +	int nid = page_to_nid(page);
> +	struct pglist_data *pgdat = NODE_DATA(nid);
>  
>  	if (memcg)
> -		return &memcg->deferred_split_queue;
> +		return &memcg->nodeinfo[nid]->deferred_split_queue;
>  	else
>  		return &pgdat->deferred_split_queue;
>  }
> @@ -2882,12 +2883,13 @@ void deferred_split_huge_page(struct page *page)
>  static unsigned long deferred_split_count(struct shrinker *shrink,
>  		struct shrink_control *sc)
>  {
> -	struct pglist_data *pgdata = NODE_DATA(sc->nid);
> +	int nid = sc->nid;
> +	struct pglist_data *pgdata = NODE_DATA(nid);
>  	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
>  
>  #ifdef CONFIG_MEMCG
>  	if (sc->memcg)
> -		ds_queue = &sc->memcg->deferred_split_queue;
> +		ds_queue = &sc->memcg->nodeinfo[nid]->deferred_split_queue;
>  #endif
>  	return READ_ONCE(ds_queue->split_queue_len);
>  }
> @@ -2895,7 +2897,8 @@ static unsigned long deferred_split_count(struct shrinker *shrink,
>  static unsigned long deferred_split_scan(struct shrinker *shrink,
>  		struct shrink_control *sc)
>  {
> -	struct pglist_data *pgdata = NODE_DATA(sc->nid);
> +	int nid = sc->nid;
> +	struct pglist_data *pgdata = NODE_DATA(nid);
>  	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
>  	unsigned long flags;
>  	LIST_HEAD(list), *pos, *next;
> @@ -2904,7 +2907,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>  
>  #ifdef CONFIG_MEMCG
>  	if (sc->memcg)
> -		ds_queue = &sc->memcg->deferred_split_queue;
> +		ds_queue = &sc->memcg->nodeinfo[nid]->deferred_split_queue;
>  #endif
>  
>  	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index c313c49..19d4295 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4989,6 +4989,12 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
>  	pn->on_tree = false;
>  	pn->memcg = memcg;
>  
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +	spin_lock_init(&pn->deferred_split_queue.split_queue_lock);
> +	INIT_LIST_HEAD(&pn->deferred_split_queue.split_queue);
> +	pn->deferred_split_queue.split_queue_len = 0;
> +#endif
> +
>  	memcg->nodeinfo[node] = pn;
>  	return 0;
>  }
> @@ -5081,11 +5087,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
>  		memcg->cgwb_frn[i].done =
>  			__WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
>  #endif
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -	spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
> -	INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
> -	memcg->deferred_split_queue.split_queue_len = 0;
> -#endif
>  	idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
>  	return memcg;
>  fail:
> @@ -5419,6 +5420,8 @@ static int mem_cgroup_move_account(struct page *page,
>  	unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
>  	int ret;
>  	bool anon;
> +	struct deferred_split *ds_queue;
> +	int nid = page_to_nid(page);
>  
>  	VM_BUG_ON(from == to);
>  	VM_BUG_ON_PAGE(PageLRU(page), page);
> @@ -5466,10 +5469,11 @@ static int mem_cgroup_move_account(struct page *page,
>  
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>  	if (compound && !list_empty(page_deferred_list(page))) {
> -		spin_lock(&from->deferred_split_queue.split_queue_lock);
> +		ds_queue = &from->nodeinfo[nid]->deferred_split_queue;
> +		spin_lock(&ds_queue->split_queue_lock);
>  		list_del_init(page_deferred_list(page));
> -		from->deferred_split_queue.split_queue_len--;
> -		spin_unlock(&from->deferred_split_queue.split_queue_lock);
> +		ds_queue->split_queue_len--;
> +		spin_unlock(&ds_queue->split_queue_lock);
>  	}
>  #endif
>  	/*
> @@ -5483,11 +5487,12 @@ static int mem_cgroup_move_account(struct page *page,
>  
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>  	if (compound && list_empty(page_deferred_list(page))) {
> -		spin_lock(&to->deferred_split_queue.split_queue_lock);
> +		ds_queue = &to->nodeinfo[nid]->deferred_split_queue;
> +		spin_lock(&ds_queue->split_queue_lock);
>  		list_add_tail(page_deferred_list(page),
> -			      &to->deferred_split_queue.split_queue);
> -		to->deferred_split_queue.split_queue_len++;
> -		spin_unlock(&to->deferred_split_queue.split_queue_lock);
> +			      &ds_queue->split_queue);
> +		ds_queue->split_queue_len++;
> +		spin_unlock(&ds_queue->split_queue_lock);
>  	}
>  #endif
>  
> -- 
> 1.8.3.1

-- 
Michal Hocko
SUSE Labs