Re: [PATCH 12/27] mm, vmscan: Make shrink_node decisions more node-centric

Michal Hocko <mhocko@xxxxxxxxxx> · Wed, 22 Jun 2016 15:20:51 +0200



On Tue 21-06-16 15:15:51, Mel Gorman wrote:
> Earlier patches focused on having direct reclaim and kswapd use data that
> is node-centric for reclaiming but shrink_node() itself still uses too much
> zone information. This patch removes unnecessary zone-based information
> with the most important decision being whether to continue reclaim or
> not. Some memcg APIs are adjusted as a result even though memcg itself
> still uses some zone information.
> 
> Signed-off-by: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>

Acked-by: Michal Hocko <mhocko@xxxxxxxx>

> ---
>  include/linux/memcontrol.h |  9 +++----
>  include/linux/mmzone.h     |  4 ++--
>  include/linux/swap.h       |  2 +-
>  mm/memcontrol.c            | 17 +++++++-------
>  mm/page_alloc.c            |  2 +-
>  mm/vmscan.c                | 58 ++++++++++++++++++++++++++--------------------
>  mm/workingset.c            |  6 ++---
>  7 files changed, 54 insertions(+), 44 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index cda436c79d8c..a13328851fea 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -306,7 +306,8 @@ void mem_cgroup_uncharge_list(struct list_head *page_list);
>  
>  void mem_cgroup_migrate(struct page *oldpage, struct page *newpage);
>  
> -struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
> +struct lruvec *mem_cgroup_lruvec(struct pglist_data *, struct zone *zone,
> +				 struct mem_cgroup *);
>  struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);
>  
>  bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
> @@ -573,10 +574,10 @@ static inline void mem_cgroup_migrate(struct page *old, struct page *new)
>  {
>  }
>  
> -static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
> -						    struct mem_cgroup *memcg)
> +static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat,
> +				struct zone *zone, struct mem_cgroup *memcg)
>  {
> -	return zone_lruvec(zone);
> +	return node_lruvec(pgdat);
>  }
>  
>  static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 890d1858aa22..6991eded0ffd 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -737,9 +737,9 @@ static inline spinlock_t *zone_lru_lock(struct zone *zone)
>  	return &zone->zone_pgdat->lru_lock;
>  }
>  
> -static inline struct lruvec *zone_lruvec(struct zone *zone)
> +static inline struct lruvec *node_lruvec(struct pglist_data *pgdat)
>  {
> -	return &zone->zone_pgdat->lruvec;
> +	return &pgdat->lruvec;
>  }
>  
>  static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 916e2eddecd6..0ad616d7c381 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -316,7 +316,7 @@ extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
>  						  unsigned long nr_pages,
>  						  gfp_t gfp_mask,
>  						  bool may_swap);
> -extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
> +extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
>  						gfp_t gfp_mask, bool noswap,
>  						struct zone *zone,
>  						unsigned long *nr_scanned);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 864a4e3a82c1..aac5fae56ea4 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -944,22 +944,23 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
>  	     iter = mem_cgroup_iter(NULL, iter, NULL))
>  
>  /**
> - * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
> + * mem_cgroup_lruvec - get the lru list vector for a node or a memcg zone
> + * @pgdat: pglist_data (node) of the wanted lruvec
>   * @zone: zone of the wanted lruvec
>   * @memcg: memcg of the wanted lruvec
>   *
> - * Returns the lru list vector holding pages for the given @zone and
> - * @mem.  This can be the global zone lruvec, if the memory controller
> + * Returns the lru list vector holding pages for a given @pgdat or a given
> + * @memcg and @zone. This can be the node lruvec, if the memory controller
>   * is disabled.
>   */
> -struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
> -				      struct mem_cgroup *memcg)
> +struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat,
> +				 struct zone *zone, struct mem_cgroup *memcg)
>  {
>  	struct mem_cgroup_per_zone *mz;
>  	struct lruvec *lruvec;
>  
>  	if (mem_cgroup_disabled()) {
> -		lruvec = zone_lruvec(zone);
> +		lruvec = node_lruvec(pgdat);
>  		goto out;
>  	}
>  
> @@ -1474,8 +1475,8 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
>  			}
>  			continue;
>  		}
> -		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
> -						     zone, &nr_scanned);
> +		total += mem_cgroup_shrink_node(victim, gfp_mask, false,
> +					zone, &nr_scanned);
>  		*total_scanned += nr_scanned;
>  		if (!soft_limit_excess(root_memcg))
>  			break;
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index e128af8de05f..d62b147fd426 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5897,6 +5897,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
>  #endif
>  	pgdat_page_ext_init(pgdat);
>  	spin_lock_init(&pgdat->lru_lock);
> +	lruvec_init(node_lruvec(pgdat));
>  
>  	for (j = 0; j < MAX_NR_ZONES; j++) {
>  		struct zone *zone = pgdat->node_zones + j;
> @@ -5959,7 +5960,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
>  		/* For bootup, initialized properly in watermark setup */
>  		mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
>  
> -		lruvec_init(zone_lruvec(zone));
>  		if (!size)
>  			continue;
>  
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index d42a86e603e8..3774ebf19f63 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2220,10 +2220,11 @@ static inline void init_tlb_ubc(void)
>  /*
>   * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
>   */
> -static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg,
> +static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
>  			      struct scan_control *sc, unsigned long *lru_pages)
>  {
> -	struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
> +	struct zone *zone = &pgdat->node_zones[sc->reclaim_idx];
> +	struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, zone, memcg);
>  	unsigned long nr[NR_LRU_LISTS];
>  	unsigned long targets[NR_LRU_LISTS];
>  	unsigned long nr_to_scan;
> @@ -2356,13 +2357,14 @@ static bool in_reclaim_compaction(struct scan_control *sc)
>   * calls try_to_compact_zone() that it will have enough free pages to succeed.
>   * It will give up earlier than that if there is difficulty reclaiming pages.
>   */
> -static inline bool should_continue_reclaim(struct zone *zone,
> +static inline bool should_continue_reclaim(struct pglist_data *pgdat,
>  					unsigned long nr_reclaimed,
>  					unsigned long nr_scanned,
>  					struct scan_control *sc)
>  {
>  	unsigned long pages_for_compaction;
>  	unsigned long inactive_lru_pages;
> +	int z;
>  
>  	/* If not in reclaim/compaction mode, stop */
>  	if (!in_reclaim_compaction(sc))
> @@ -2396,21 +2398,27 @@ static inline bool should_continue_reclaim(struct zone *zone,
>  	 * inactive lists are large enough, continue reclaiming
>  	 */
>  	pages_for_compaction = (2UL << sc->order);
> -	inactive_lru_pages = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE);
> +	inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
>  	if (get_nr_swap_pages() > 0)
> -		inactive_lru_pages += node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON);
> +		inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
>  	if (sc->nr_reclaimed < pages_for_compaction &&
>  			inactive_lru_pages > pages_for_compaction)
>  		return true;
>  
>  	/* If compaction would go ahead or the allocation would succeed, stop */
> -	switch (compaction_suitable(zone, sc->order, 0, 0)) {
> -	case COMPACT_PARTIAL:
> -	case COMPACT_CONTINUE:
> -		return false;
> -	default:
> -		return true;
> +	for (z = 0; z <= sc->reclaim_idx; z++) {
> +		struct zone *zone = &pgdat->node_zones[z];
> +
> +		switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
> +		case COMPACT_PARTIAL:
> +		case COMPACT_CONTINUE:
> +			return false;
> +		default:
> +			/* check next zone */
> +			;
> +		}
>  	}
> +	return true;
>  }
>  
>  static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc,
> @@ -2419,15 +2427,14 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc,
>  	struct reclaim_state *reclaim_state = current->reclaim_state;
>  	unsigned long nr_reclaimed, nr_scanned;
>  	bool reclaimable = false;
> -	struct zone *zone = &pgdat->node_zones[classzone_idx];
>  
>  	do {
>  		struct mem_cgroup *root = sc->target_mem_cgroup;
>  		struct mem_cgroup_reclaim_cookie reclaim = {
> -			.zone = zone,
> +			.zone = &pgdat->node_zones[classzone_idx],
>  			.priority = sc->priority,
>  		};
> -		unsigned long zone_lru_pages = 0;
> +		unsigned long node_lru_pages = 0;
>  		struct mem_cgroup *memcg;
>  
>  		nr_reclaimed = sc->nr_reclaimed;
> @@ -2448,11 +2455,11 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc,
>  			reclaimed = sc->nr_reclaimed;
>  			scanned = sc->nr_scanned;
>  
> -			shrink_zone_memcg(zone, memcg, sc, &lru_pages);
> -			zone_lru_pages += lru_pages;
> +			shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
> +			node_lru_pages += lru_pages;
>  
>  			if (!global_reclaim(sc) && sc->reclaim_idx == classzone_idx)
> -				shrink_slab(sc->gfp_mask, zone_to_nid(zone),
> +				shrink_slab(sc->gfp_mask, pgdat->node_id,
>  					    memcg, sc->nr_scanned - scanned,
>  					    lru_pages);
>  
> @@ -2464,7 +2471,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc,
>  			/*
>  			 * Direct reclaim and kswapd have to scan all memory
>  			 * cgroups to fulfill the overall scan target for the
> -			 * zone.
> +			 * node.
>  			 *
>  			 * Limit reclaim, on the other hand, only cares about
>  			 * nr_to_reclaim pages to be reclaimed and it will
> @@ -2483,9 +2490,9 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc,
>  		 * the eligible LRU pages were scanned.
>  		 */
>  		if (global_reclaim(sc) && sc->reclaim_idx == classzone_idx)
> -			shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL,
> +			shrink_slab(sc->gfp_mask, pgdat->node_id, NULL,
>  				    sc->nr_scanned - nr_scanned,
> -				    zone_lru_pages);
> +				    node_lru_pages);
>  
>  		if (reclaim_state) {
>  			sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> @@ -2500,7 +2507,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc,
>  		if (sc->nr_reclaimed - nr_reclaimed)
>  			reclaimable = true;
>  
> -	} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
> +	} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
>  					 sc->nr_scanned - nr_scanned, sc));
>  
>  	return reclaimable;
> @@ -2896,7 +2903,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
>  
>  #ifdef CONFIG_MEMCG
>  
> -unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
> +unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
>  						gfp_t gfp_mask, bool noswap,
>  						struct zone *zone,
>  						unsigned long *nr_scanned)
> @@ -2906,6 +2913,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
>  		.target_mem_cgroup = memcg,
>  		.may_writepage = !laptop_mode,
>  		.may_unmap = 1,
> +		.reclaim_idx = zone_idx(zone),
>  		.may_swap = !noswap,
>  	};
>  	unsigned long lru_pages;
> @@ -2920,11 +2928,11 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
>  	/*
>  	 * NOTE: Although we can get the priority field, using it
>  	 * here is not a good idea, since it limits the pages we can scan.
> -	 * if we don't reclaim here, the shrink_zone from balance_pgdat
> +	 * if we don't reclaim here, the shrink_node from balance_pgdat
>  	 * will pick up pages from other mem cgroup's as well. We hack
>  	 * the priority and make it zero.
>  	 */
> -	shrink_zone_memcg(zone, memcg, &sc, &lru_pages);
> +	shrink_node_memcg(zone->zone_pgdat, memcg, &sc, &lru_pages);
>  
>  	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
>  
> @@ -2982,7 +2990,7 @@ static void age_active_anon(struct pglist_data *pgdat,
>  
>  	memcg = mem_cgroup_iter(NULL, NULL, NULL);
>  	do {
> -		struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
> +		struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, zone, memcg);
>  
>  		if (inactive_list_is_low(lruvec, false))
>  			shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
> diff --git a/mm/workingset.c b/mm/workingset.c
> index c0820e06aaff..2d81ca11317d 100644
> --- a/mm/workingset.c
> +++ b/mm/workingset.c
> @@ -218,7 +218,7 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
>  	VM_BUG_ON_PAGE(page_count(page), page);
>  	VM_BUG_ON_PAGE(!PageLocked(page), page);
>  
> -	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
> +	lruvec = mem_cgroup_lruvec(zone->zone_pgdat, zone, memcg);
>  	eviction = atomic_long_inc_return(&lruvec->inactive_age);
>  	return pack_shadow(memcgid, zone, eviction);
>  }
> @@ -267,7 +267,7 @@ bool workingset_refault(void *shadow)
>  		rcu_read_unlock();
>  		return false;
>  	}
> -	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
> +	lruvec = mem_cgroup_lruvec(zone->zone_pgdat, zone, memcg);
>  	refault = atomic_long_read(&lruvec->inactive_age);
>  	active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
>  	rcu_read_unlock();
> @@ -317,7 +317,7 @@ void workingset_activation(struct page *page)
>  	 */
>  	if (!mem_cgroup_disabled() && !page_memcg(page))
>  		goto out;
> -	lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page));
> +	lruvec = mem_cgroup_lruvec(page_pgdat(page), page_zone(page), page_memcg(page));
>  	atomic_long_inc(&lruvec->inactive_age);
>  out:
>  	unlock_page_memcg(page);
> -- 
> 2.6.4
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>

-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>