On Tue 21-06-16 15:15:51, Mel Gorman wrote: > Earlier patches focused on having direct reclaim and kswapd use data that > is node-centric for reclaiming but shrink_node() itself still uses too much > zone information. This patch removes unnecessary zone-based information > with the most important decision being whether to continue reclaim or > not. Some memcg APIs are adjusted as a result even though memcg itself > still uses some zone information. > > Signed-off-by: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx> Acked-by: Michal Hocko <mhocko@xxxxxxxx> > --- > include/linux/memcontrol.h | 9 +++---- > include/linux/mmzone.h | 4 ++-- > include/linux/swap.h | 2 +- > mm/memcontrol.c | 17 +++++++------- > mm/page_alloc.c | 2 +- > mm/vmscan.c | 58 ++++++++++++++++++++++++++-------------------- > mm/workingset.c | 6 ++--- > 7 files changed, 54 insertions(+), 44 deletions(-) > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > index cda436c79d8c..a13328851fea 100644 > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -306,7 +306,8 @@ void mem_cgroup_uncharge_list(struct list_head *page_list); > > void mem_cgroup_migrate(struct page *oldpage, struct page *newpage); > > -struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); > +struct lruvec *mem_cgroup_lruvec(struct pglist_data *, struct zone *zone, > + struct mem_cgroup *); > struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *); > > bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg); > @@ -573,10 +574,10 @@ static inline void mem_cgroup_migrate(struct page *old, struct page *new) > { > } > > -static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, > - struct mem_cgroup *memcg) > +static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat, > + struct zone *zone, struct mem_cgroup *memcg) > { > - return zone_lruvec(zone); > + return node_lruvec(pgdat); > } > > static inline struct lruvec 
*mem_cgroup_page_lruvec(struct page *page, > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > index 890d1858aa22..6991eded0ffd 100644 > --- a/include/linux/mmzone.h > +++ b/include/linux/mmzone.h > @@ -737,9 +737,9 @@ static inline spinlock_t *zone_lru_lock(struct zone *zone) > return &zone->zone_pgdat->lru_lock; > } > > -static inline struct lruvec *zone_lruvec(struct zone *zone) > +static inline struct lruvec *node_lruvec(struct pglist_data *pgdat) > { > - return &zone->zone_pgdat->lruvec; > + return &pgdat->lruvec; > } > > static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) > diff --git a/include/linux/swap.h b/include/linux/swap.h > index 916e2eddecd6..0ad616d7c381 100644 > --- a/include/linux/swap.h > +++ b/include/linux/swap.h > @@ -316,7 +316,7 @@ extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, > unsigned long nr_pages, > gfp_t gfp_mask, > bool may_swap); > -extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, > +extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, > gfp_t gfp_mask, bool noswap, > struct zone *zone, > unsigned long *nr_scanned); > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 864a4e3a82c1..aac5fae56ea4 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -944,22 +944,23 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) > iter = mem_cgroup_iter(NULL, iter, NULL)) > > /** > - * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg > + * mem_cgroup_lruvec - get the lru list vector for a node or a memcg zone > + * @node: node of the wanted lruvec > * @zone: zone of the wanted lruvec > * @memcg: memcg of the wanted lruvec > * > - * Returns the lru list vector holding pages for the given @zone and > - * @mem. This can be the global zone lruvec, if the memory controller > + * Returns the lru list vector holding pages for a given @node or a given > + * @memcg and @zone. 
This can be the node lruvec, if the memory controller > * is disabled. > */ > -struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, > - struct mem_cgroup *memcg) > +struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat, > + struct zone *zone, struct mem_cgroup *memcg) > { > struct mem_cgroup_per_zone *mz; > struct lruvec *lruvec; > > if (mem_cgroup_disabled()) { > - lruvec = zone_lruvec(zone); > + lruvec = node_lruvec(pgdat); > goto out; > } > > @@ -1474,8 +1475,8 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, > } > continue; > } > - total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, > - zone, &nr_scanned); > + total += mem_cgroup_shrink_node(victim, gfp_mask, false, > + zone, &nr_scanned); > *total_scanned += nr_scanned; > if (!soft_limit_excess(root_memcg)) > break; > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > index e128af8de05f..d62b147fd426 100644 > --- a/mm/page_alloc.c > +++ b/mm/page_alloc.c > @@ -5897,6 +5897,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) > #endif > pgdat_page_ext_init(pgdat); > spin_lock_init(&pgdat->lru_lock); > + lruvec_init(node_lruvec(pgdat)); > > for (j = 0; j < MAX_NR_ZONES; j++) { > struct zone *zone = pgdat->node_zones + j; > @@ -5959,7 +5960,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) > /* For bootup, initialized properly in watermark setup */ > mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); > > - lruvec_init(zone_lruvec(zone)); > if (!size) > continue; > > diff --git a/mm/vmscan.c b/mm/vmscan.c > index d42a86e603e8..3774ebf19f63 100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -2220,10 +2220,11 @@ static inline void init_tlb_ubc(void) > /* > * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 
> */ > -static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg, > +static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg, > struct scan_control *sc, unsigned long *lru_pages) > { > - struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); > + struct zone *zone = &pgdat->node_zones[sc->reclaim_idx]; > + struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, zone, memcg); > unsigned long nr[NR_LRU_LISTS]; > unsigned long targets[NR_LRU_LISTS]; > unsigned long nr_to_scan; > @@ -2356,13 +2357,14 @@ static bool in_reclaim_compaction(struct scan_control *sc) > * calls try_to_compact_zone() that it will have enough free pages to succeed. > * It will give up earlier than that if there is difficulty reclaiming pages. > */ > -static inline bool should_continue_reclaim(struct zone *zone, > +static inline bool should_continue_reclaim(struct pglist_data *pgdat, > unsigned long nr_reclaimed, > unsigned long nr_scanned, > struct scan_control *sc) > { > unsigned long pages_for_compaction; > unsigned long inactive_lru_pages; > + int z; > > /* If not in reclaim/compaction mode, stop */ > if (!in_reclaim_compaction(sc)) > @@ -2396,21 +2398,27 @@ static inline bool should_continue_reclaim(struct zone *zone, > * inactive lists are large enough, continue reclaiming > */ > pages_for_compaction = (2UL << sc->order); > - inactive_lru_pages = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE); > + inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); > if (get_nr_swap_pages() > 0) > - inactive_lru_pages += node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON); > + inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); > if (sc->nr_reclaimed < pages_for_compaction && > inactive_lru_pages > pages_for_compaction) > return true; > > /* If compaction would go ahead or the allocation would succeed, stop */ > - switch (compaction_suitable(zone, sc->order, 0, 0)) { > - case COMPACT_PARTIAL: > - case COMPACT_CONTINUE: > - return false; > 
- default: > - return true; > + for (z = 0; z <= sc->reclaim_idx; z++) { > + struct zone *zone = &pgdat->node_zones[z]; > + > + switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) { > + case COMPACT_PARTIAL: > + case COMPACT_CONTINUE: > + return false; > + default: > + /* check next zone */ > + ; > + } > } > + return true; > } > > static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc, > @@ -2419,15 +2427,14 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc, > struct reclaim_state *reclaim_state = current->reclaim_state; > unsigned long nr_reclaimed, nr_scanned; > bool reclaimable = false; > - struct zone *zone = &pgdat->node_zones[classzone_idx]; > > do { > struct mem_cgroup *root = sc->target_mem_cgroup; > struct mem_cgroup_reclaim_cookie reclaim = { > - .zone = zone, > + .zone = &pgdat->node_zones[classzone_idx], > .priority = sc->priority, > }; > - unsigned long zone_lru_pages = 0; > + unsigned long node_lru_pages = 0; > struct mem_cgroup *memcg; > > nr_reclaimed = sc->nr_reclaimed; > @@ -2448,11 +2455,11 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc, > reclaimed = sc->nr_reclaimed; > scanned = sc->nr_scanned; > > - shrink_zone_memcg(zone, memcg, sc, &lru_pages); > - zone_lru_pages += lru_pages; > + shrink_node_memcg(pgdat, memcg, sc, &lru_pages); > + node_lru_pages += lru_pages; > > if (!global_reclaim(sc) && sc->reclaim_idx == classzone_idx) > - shrink_slab(sc->gfp_mask, zone_to_nid(zone), > + shrink_slab(sc->gfp_mask, pgdat->node_id, > memcg, sc->nr_scanned - scanned, > lru_pages); > > @@ -2464,7 +2471,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc, > /* > * Direct reclaim and kswapd have to scan all memory > * cgroups to fulfill the overall scan target for the > - * zone. > + * node. 
> * > * Limit reclaim, on the other hand, only cares about > * nr_to_reclaim pages to be reclaimed and it will > @@ -2483,9 +2490,9 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc, > * the eligible LRU pages were scanned. > */ > if (global_reclaim(sc) && sc->reclaim_idx == classzone_idx) > - shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL, > + shrink_slab(sc->gfp_mask, pgdat->node_id, NULL, > sc->nr_scanned - nr_scanned, > - zone_lru_pages); > + node_lru_pages); > > if (reclaim_state) { > sc->nr_reclaimed += reclaim_state->reclaimed_slab; > @@ -2500,7 +2507,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc, > if (sc->nr_reclaimed - nr_reclaimed) > reclaimable = true; > > - } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, > + } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, > sc->nr_scanned - nr_scanned, sc)); > > return reclaimable; > @@ -2896,7 +2903,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, > > #ifdef CONFIG_MEMCG > > -unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, > +unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, > gfp_t gfp_mask, bool noswap, > struct zone *zone, > unsigned long *nr_scanned) > @@ -2906,6 +2913,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, > .target_mem_cgroup = memcg, > .may_writepage = !laptop_mode, > .may_unmap = 1, > + .reclaim_idx = zone_idx(zone), > .may_swap = !noswap, > }; > unsigned long lru_pages; > @@ -2920,11 +2928,11 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, > /* > * NOTE: Although we can get the priority field, using it > * here is not a good idea, since it limits the pages we can scan. > - * if we don't reclaim here, the shrink_zone from balance_pgdat > + * if we don't reclaim here, the shrink_node from balance_pgdat > * will pick up pages from other mem cgroup's as well. We hack > * the priority and make it zero. 
> */ > - shrink_zone_memcg(zone, memcg, &sc, &lru_pages); > + shrink_node_memcg(zone->zone_pgdat, memcg, &sc, &lru_pages); > > trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); > > @@ -2982,7 +2990,7 @@ static void age_active_anon(struct pglist_data *pgdat, > > memcg = mem_cgroup_iter(NULL, NULL, NULL); > do { > - struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); > + struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, zone, memcg); > > if (inactive_list_is_low(lruvec, false)) > shrink_active_list(SWAP_CLUSTER_MAX, lruvec, > diff --git a/mm/workingset.c b/mm/workingset.c > index c0820e06aaff..2d81ca11317d 100644 > --- a/mm/workingset.c > +++ b/mm/workingset.c > @@ -218,7 +218,7 @@ void *workingset_eviction(struct address_space *mapping, struct page *page) > VM_BUG_ON_PAGE(page_count(page), page); > VM_BUG_ON_PAGE(!PageLocked(page), page); > > - lruvec = mem_cgroup_zone_lruvec(zone, memcg); > + lruvec = mem_cgroup_lruvec(zone->zone_pgdat, zone, memcg); > eviction = atomic_long_inc_return(&lruvec->inactive_age); > return pack_shadow(memcgid, zone, eviction); > } > @@ -267,7 +267,7 @@ bool workingset_refault(void *shadow) > rcu_read_unlock(); > return false; > } > - lruvec = mem_cgroup_zone_lruvec(zone, memcg); > + lruvec = mem_cgroup_lruvec(zone->zone_pgdat, zone, memcg); > refault = atomic_long_read(&lruvec->inactive_age); > active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); > rcu_read_unlock(); > @@ -317,7 +317,7 @@ void workingset_activation(struct page *page) > */ > if (!mem_cgroup_disabled() && !page_memcg(page)) > goto out; > - lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page)); > + lruvec = mem_cgroup_lruvec(page_pgdat(page), page_zone(page), page_memcg(page)); > atomic_long_inc(&lruvec->inactive_age); > out: > unlock_page_memcg(page); > -- > 2.6.4 > > -- > To unsubscribe, send a message with 'unsubscribe linux-mm' in > the body to majordomo@xxxxxxxxx. For more info on Linux MM, > see: http://www.linux-mm.org/ . 
> Don't email: <a href="mailto:dont@xxxxxxxxx"> email@xxxxxxxxx </a> -- Michal Hocko SUSE Labs -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href="mailto:dont@xxxxxxxxx"> email@xxxxxxxxx </a>