On Sat 14-01-17 11:12:36, Johannes Weiner wrote: > On Tue, Jan 10, 2017 at 01:55:51PM +0100, Michal Hocko wrote: > > From: Michal Hocko <mhocko@xxxxxxxx> > > > > get_scan_count considers the whole node LRU size when > > - doing SCAN_FILE due to many page cache inactive pages > > - calculating the number of pages to scan > > > > in both cases this might lead to unexpected behavior especially on 32b > > systems where we can expect lowmem memory pressure very often. > > The amount of retrofitting zones back into reclaim is disappointing :/ Agreed > > /* > > + * Return the number of pages on the given lru which are eligible for the > > + * given zone_idx > > + */ > > +static unsigned long lruvec_lru_size_eligibe_zones(struct lruvec *lruvec, > > + enum lru_list lru, int zone_idx) > > +{ > > + struct pglist_data *pgdat = lruvec_pgdat(lruvec); > > + unsigned long lru_size; > > + int zid; > > + > > + lru_size = lruvec_lru_size(lruvec, lru); > > + for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) { > > + struct zone *zone = &pgdat->node_zones[zid]; > > + unsigned long size; > > + > > + if (!managed_zone(zone)) > > + continue; > > + > > + size = lruvec_zone_lru_size(lruvec, lru, zid); > > + lru_size -= min(size, lru_size); > > + } > > + > > + return lru_size; > > The only other use of lruvec_lru_size() is also in get_scan_count(), > where it decays the LRU pressure balancing ratios. That caller wants > to operate on the entire lruvec. > > Can you instead add the filtering logic to lruvec_lru_size() directly, > and pass MAX_NR_ZONES when operating on the entire lruvec? That would > make the code quite a bit clearer than having 3 different lruvec size > querying functions. OK, fair point. What about this? 
--- >From 39824aac7504b38f943a80b7d98ec4e87a5607a7 Mon Sep 17 00:00:00 2001 From: Michal Hocko <mhocko@xxxxxxxx> Date: Tue, 27 Dec 2016 16:28:44 +0100 Subject: [PATCH] mm, vmscan: consider eligible zones in get_scan_count get_scan_count considers the whole node LRU size when - doing SCAN_FILE due to many page cache inactive pages - calculating the number of pages to scan in both cases this might lead to unexpected behavior especially on 32b systems where we can expect lowmem memory pressure very often. A large highmem zone can easily distort the SCAN_FILE heuristic because there might be only a few file pages from the eligible zones on the node lru and we would still enforce file lru scanning which can lead to thrashing while we could still scan anonymous pages. The later use of lruvec_lru_size can be problematic as well, especially when there are not many pages from the eligible zones. We would have to skip over many pages to find anything to reclaim but shrink_node_memcg would only reduce the remaining number to scan by SWAP_CLUSTER_MAX at maximum. Therefore we can end up going over a large LRU many times without actually having a chance to reclaim much if anything at all. The closer we are to running out of memory on the lowmem zone the worse the problem will be. Fix this by making lruvec_lru_size zone aware. zone_idx will tell the maximum eligible zone. 
Changes since v2 - move the zone filtering logic to lruvec_lru_size so that we do not have too many lruvec_lru_size* functions - Johannes Changes since v1 - s@lruvec_lru_size_zone_idx@lruvec_lru_size_eligibe_zones@ Acked-by: Minchan Kim <minchan@xxxxxxxxxx> Signed-off-by: Michal Hocko <mhocko@xxxxxxxx> --- include/linux/mmzone.h | 2 +- mm/vmscan.c | 54 +++++++++++++++++++++++++++++++++++--------------- mm/workingset.c | 2 +- 3 files changed, 40 insertions(+), 18 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index d1d440cff60e..91f69aa0d581 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -780,7 +780,7 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec) #endif } -extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru); +extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx); #ifdef CONFIG_HAVE_MEMORY_PRESENT void memory_present(int nid, unsigned long start, unsigned long end); diff --git a/mm/vmscan.c b/mm/vmscan.c index cf940af609fd..46e0d87b78e2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -234,22 +234,44 @@ bool pgdat_reclaimable(struct pglist_data *pgdat) pgdat_reclaimable_pages(pgdat) * 6; } -unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) +static unsigned long lruvec_zone_lru_size(struct lruvec *lruvec, + enum lru_list lru, int zone_idx) { if (!mem_cgroup_disabled()) - return mem_cgroup_get_lru_size(lruvec, lru); + return mem_cgroup_get_zone_lru_size(lruvec, lru, zone_idx); - return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); + return zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zone_idx], + NR_ZONE_LRU_BASE + lru); } -unsigned long lruvec_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, - int zone_idx) +/** lruvec_lru_size - Returns the number of pages on the given LRU list. 
+ * @lruvec: lru vector + * @lru: lru to use + * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list) + */ +unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { + unsigned long lru_size; + int zid; + if (!mem_cgroup_disabled()) - return mem_cgroup_get_zone_lru_size(lruvec, lru, zone_idx); + lru_size = mem_cgroup_get_lru_size(lruvec, lru); + else + lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); + + for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) { + struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; + unsigned long size; + + if (!managed_zone(zone)) + continue; + + size = lruvec_zone_lru_size(lruvec, lru, zid); + lru_size -= min(size, lru_size); + } + + return lru_size; - return zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zone_idx], - NR_ZONE_LRU_BASE + lru); } /* @@ -2064,8 +2086,8 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, if (!file && !total_swap_pages) return false; - total_inactive = inactive = lruvec_lru_size(lruvec, file * LRU_FILE); - total_active = active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE); + total_inactive = inactive = lruvec_lru_size(lruvec, file * LRU_FILE, MAX_NR_ZONES); + total_active = active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE, MAX_NR_ZONES); /* * For zone-constrained allocations, it is necessary to check if @@ -2236,7 +2258,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * system is under heavy pressure. 
*/ if (!inactive_list_is_low(lruvec, true, sc, false) && - lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) { scan_balance = SCAN_FILE; goto out; } @@ -2262,10 +2284,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * anon in [0], file in [1] */ - anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) + - lruvec_lru_size(lruvec, LRU_INACTIVE_ANON); - file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) + - lruvec_lru_size(lruvec, LRU_INACTIVE_FILE); + anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) + + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES); + file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) + + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES); spin_lock_irq(&pgdat->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { @@ -2303,7 +2325,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, unsigned long size; unsigned long scan; - size = lruvec_lru_size(lruvec, lru); + size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); scan = size >> sc->priority; if (!scan && pass && force_scan) diff --git a/mm/workingset.c b/mm/workingset.c index abb58ffa3c64..a67f5796b995 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -267,7 +267,7 @@ bool workingset_refault(void *shadow) } lruvec = mem_cgroup_lruvec(pgdat, memcg); refault = atomic_long_read(&lruvec->inactive_age); - active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); + active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); rcu_read_unlock(); /* -- 2.11.0 -- Michal Hocko SUSE Labs -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>