On Thu, Nov 14, 2019 at 4:29 PM Shakeel Butt <shakeelb@xxxxxxxxxx> wrote: > > On Thu, Nov 7, 2019 at 12:53 PM Johannes Weiner <hannes@xxxxxxxxxxx> wrote: > > > > We split the LRU lists into inactive and an active parts to maximize > > workingset protection while allowing just enough inactive cache space > > to faciltate readahead and writeback for one-off file accesses (e.g. a > > linear scan through a file, or logging); or just enough inactive anon > > to maintain recent reference information when reclaim needs to swap. > > > > With cgroups and their nested LRU lists, we currently don't do this > > correctly. While recursive cgroup reclaim establishes a relative LRU > > order among the pages of all involved cgroups, inactive:active size > > decisions are done on a per-cgroup level. As a result, we'll reclaim a > > cgroup's workingset when it doesn't have cold pages, even when one of > > its siblings has plenty of it that should be reclaimed first. > > > > For example: workload A has 50M worth of hot cache but doesn't do any > > one-off file accesses; meanwhile, parallel workload B scans files and > > rarely accesses the same page twice. > > > > If these workloads were to run in an uncgrouped system, A would be > > protected from the high rate of cache faults from B. But if they were > > put in parallel cgroups for memory accounting purposes, B's fast cache > > fault rate would push out the hot cache pages of A. This is unexpected > > and undesirable - the "scan resistance" of the page cache is broken. > > > > This patch moves inactive:active size balancing decisions to the root > > of reclaim - the same level where the LRU order is established. > > > > It does this by looking at the recursize size of the inactive and the > > active file sets of the cgroup subtree at the beginning of the reclaim > > cycle, and then making a decision - scan or skip active pages - that > > applies throughout the entire run and to every cgroup involved. > > Oh ok, this answer my question on previous patch. The reclaim root > looks at the full tree inactive and active count to make decisions and > thus active list of some descendant cgroup will be protected from the > inactive list of its sibling. > > > > > With that in place, in the test above, the VM will recognize that > > there are plenty of inactive pages in the combined cache set of > > workloads A and B and prefer the one-off cache in B over the hot pages > > in A. The scan resistance of the cache is restored. > > > > Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx> Forgot to add: Reviewed-by: Shakeel Butt <shakeelb@xxxxxxxxxx> > > BTW no love for the anon memory? The whole "workingset" mechanism only > works for file pages. Are there any plans to extend it for anon as > well? > > > --- > > include/linux/mmzone.h | 4 +- > > mm/vmscan.c | 185 ++++++++++++++++++++++++++--------------- > > 2 files changed, 118 insertions(+), 71 deletions(-) > > > > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > > index 7a09087e8c77..454a230ad417 100644 > > --- a/include/linux/mmzone.h > > +++ b/include/linux/mmzone.h > > @@ -229,12 +229,12 @@ enum lru_list { > > > > #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++) > > > > -static inline int is_file_lru(enum lru_list lru) > > +static inline bool is_file_lru(enum lru_list lru) > > { > > return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE); > > } > > > > -static inline int is_active_lru(enum lru_list lru) > > +static inline bool is_active_lru(enum lru_list lru) > > { > > return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); > > } > > diff --git a/mm/vmscan.c b/mm/vmscan.c > > index 527617ee9b73..df859b1d583c 100644 > > --- a/mm/vmscan.c > > +++ b/mm/vmscan.c > > @@ -79,6 +79,13 @@ struct scan_control { > > */ > > struct mem_cgroup *target_mem_cgroup; > > > > + /* Can active pages be deactivated as part of reclaim? */ > > +#define DEACTIVATE_ANON 1 > > +#define DEACTIVATE_FILE 2 > > + unsigned int may_deactivate:2; > > + unsigned int force_deactivate:1; > > + unsigned int skipped_deactivate:1; > > + > > /* Writepage batching in laptop mode; RECLAIM_WRITE */ > > unsigned int may_writepage:1; > > > > @@ -101,6 +108,9 @@ struct scan_control { > > /* One of the zones is ready for compaction */ > > unsigned int compaction_ready:1; > > > > + /* There is easily reclaimable cold cache in the current node */ > > + unsigned int cache_trim_mode:1; > > + > > /* The file pages on the current node are dangerously low */ > > unsigned int file_is_tiny:1; > > > > @@ -2154,6 +2164,20 @@ unsigned long reclaim_pages(struct list_head *page_list) > > return nr_reclaimed; > > } > > > > +static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, > > + struct lruvec *lruvec, struct scan_control *sc) > > +{ > > + if (is_active_lru(lru)) { > > + if (sc->may_deactivate & (1 << is_file_lru(lru))) > > + shrink_active_list(nr_to_scan, lruvec, sc, lru); > > + else > > + sc->skipped_deactivate = 1; > > + return 0; > > + } > > + > > + return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); > > +} > > + > > /* > > * The inactive anon list should be small enough that the VM never has > > * to do too much work. > > @@ -2182,59 +2206,25 @@ unsigned long reclaim_pages(struct list_head *page_list) > > * 1TB 101 10GB > > * 10TB 320 32GB > > */ > > -static bool inactive_list_is_low(struct lruvec *lruvec, bool file, > > - struct scan_control *sc, bool trace) > > +static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru) > > { > > - enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE; > > - struct pglist_data *pgdat = lruvec_pgdat(lruvec); > > - enum lru_list inactive_lru = file * LRU_FILE; > > + enum lru_list active_lru = inactive_lru + LRU_ACTIVE; > > unsigned long inactive, active; > > unsigned long inactive_ratio; > > - struct lruvec *target_lruvec; > > - unsigned long refaults; > > unsigned long gb; > > > > - inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx); > > - active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx); > > + inactive = lruvec_page_state(lruvec, inactive_lru); > > + active = lruvec_page_state(lruvec, active_lru); > > > > - /* > > - * When refaults are being observed, it means a new workingset > > - * is being established. Disable active list protection to get > > - * rid of the stale workingset quickly. > > - */ > > - target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); > > - refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE); > > - if (file && target_lruvec->refaults != refaults) { > > - inactive_ratio = 0; > > - } else { > > - gb = (inactive + active) >> (30 - PAGE_SHIFT); > > - if (gb) > > - inactive_ratio = int_sqrt(10 * gb); > > - else > > - inactive_ratio = 1; > > - } > > - > > - if (trace) > > - trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx, > > - lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive, > > - lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active, > > - inactive_ratio, file); > > + gb = (inactive + active) >> (30 - PAGE_SHIFT); > > + if (gb) > > + inactive_ratio = int_sqrt(10 * gb); > > + else > > + inactive_ratio = 1; > > > > return inactive * inactive_ratio < active; > > } > > > > -static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, > > - struct lruvec *lruvec, struct scan_control *sc) > > -{ > > - if (is_active_lru(lru)) { > > - if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true)) > > - shrink_active_list(nr_to_scan, lruvec, sc, lru); > > - return 0; > > - } > > - > > - return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); > > -} > > - > > enum scan_balance { > > SCAN_EQUAL, > > SCAN_FRACT, > > @@ -2296,28 +2286,17 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, > > > > /* > > * If the system is almost out of file pages, force-scan anon. > > - * But only if there are enough inactive anonymous pages on > > - * the LRU. Otherwise, the small LRU gets thrashed. > > */ > > - if (sc->file_is_tiny && > > - !inactive_list_is_low(lruvec, false, sc, false) && > > - lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, > > - sc->reclaim_idx) >> sc->priority) { > > + if (sc->file_is_tiny) { > > scan_balance = SCAN_ANON; > > goto out; > > } > > > > /* > > - * If there is enough inactive page cache, i.e. if the size of the > > - * inactive list is greater than that of the active list *and* the > > - * inactive list actually has some pages to scan on this priority, we > > - * do not reclaim anything from the anonymous working set right now. > > - * Without the second condition we could end up never scanning an > > - * lruvec even if it has plenty of old anonymous pages unless the > > - * system is under heavy pressure. > > + * If there is enough inactive page cache, we do not reclaim > > + * anything from the anonymous working right now. > > */ > > - if (!inactive_list_is_low(lruvec, true, sc, false) && > > - lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) { > > + if (sc->cache_trim_mode) { > > scan_balance = SCAN_FILE; > > goto out; > > } > > @@ -2582,7 +2561,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) > > * Even if we did not try to evict anon pages at all, we want to > > * rebalance the anon lru active/inactive ratio. > > */ > > - if (total_swap_pages && inactive_list_is_low(lruvec, false, sc, true)) > > + if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON)) > > shrink_active_list(SWAP_CLUSTER_MAX, lruvec, > > sc, LRU_ACTIVE_ANON); > > } > > @@ -2722,6 +2701,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) > > unsigned long nr_reclaimed, nr_scanned; > > struct lruvec *target_lruvec; > > bool reclaimable = false; > > + unsigned long file; > > > > target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); > > > > @@ -2731,6 +2711,44 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) > > nr_reclaimed = sc->nr_reclaimed; > > nr_scanned = sc->nr_scanned; > > > > + /* > > + * Target desirable inactive:active list ratios for the anon > > + * and file LRU lists. > > + */ > > + if (!sc->force_deactivate) { > > + unsigned long refaults; > > + > > + if (inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) > > + sc->may_deactivate |= DEACTIVATE_ANON; > > + else > > + sc->may_deactivate &= ~DEACTIVATE_ANON; > > + > > + /* > > + * When refaults are being observed, it means a new > > + * workingset is being established. Deactivate to get > > + * rid of any stale active pages quickly. > > + */ > > + refaults = lruvec_page_state(target_lruvec, > > + WORKINGSET_ACTIVATE); > > + if (refaults != target_lruvec->refaults || > > + inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) > > + sc->may_deactivate |= DEACTIVATE_FILE; > > + else > > + sc->may_deactivate &= ~DEACTIVATE_FILE; > > + } else > > + sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; > > + > > + /* > > + * If we have plenty of inactive file pages that aren't > > + * thrashing, try to reclaim those first before touching > > + * anonymous pages. > > + */ > > + file = lruvec_page_state(target_lruvec, LRU_INACTIVE_FILE); > > + if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) > > + sc->cache_trim_mode = 1; > > + else > > + sc->cache_trim_mode = 0; > > + > > /* > > * Prevent the reclaimer from falling into the cache trap: as > > * cache pages start out inactive, every cache fault will tip > > @@ -2741,10 +2759,9 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) > > * anon pages. Try to detect this based on file LRU size. > > */ > > if (!cgroup_reclaim(sc)) { > > - unsigned long file; > > - unsigned long free; > > - int z; > > unsigned long total_high_wmark = 0; > > + unsigned long free, anon; > > + int z; > > > > free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); > > file = node_page_state(pgdat, NR_ACTIVE_FILE) + > > @@ -2758,7 +2775,17 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) > > total_high_wmark += high_wmark_pages(zone); > > } > > > > - sc->file_is_tiny = file + free <= total_high_wmark; > > + /* > > + * Consider anon: if that's low too, this isn't a > > + * runaway file reclaim problem, but rather just > > + * extreme pressure. Reclaim as per usual then. > > + */ > > + anon = node_page_state(pgdat, NR_INACTIVE_ANON); > > + > > + sc->file_is_tiny = > > + file + free <= total_high_wmark && > > + !(sc->may_deactivate & DEACTIVATE_ANON) && > > + anon >> sc->priority; > > } > > > > shrink_node_memcgs(pgdat, sc); > > @@ -3062,9 +3089,27 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, > > if (sc->compaction_ready) > > return 1; > > > > + /* > > + * We make inactive:active ratio decisions based on the node's > > + * composition of memory, but a restrictive reclaim_idx or a > > + * memory.low cgroup setting can exempt large amounts of > > + * memory from reclaim. Neither of which are very common, so > > + * instead of doing costly eligibility calculations of the > > + * entire cgroup subtree up front, we assume the estimates are > > + * good, and retry with forcible deactivation if that fails. > > + */ > > + if (sc->skipped_deactivate) { > > + sc->priority = initial_priority; > > + sc->force_deactivate = 1; > > + sc->skipped_deactivate = 0; > > + goto retry; > > + } > > + > > Not really an objection but in the worst case this will double the > cost of direct reclaim. > > > /* Untapped cgroup reserves? Don't OOM, retry. */ > > if (sc->memcg_low_skipped) { > > sc->priority = initial_priority; > > + sc->force_deactivate = 0; > > + sc->skipped_deactivate = 0; > > sc->memcg_low_reclaim = 1; > > sc->memcg_low_skipped = 0; > > goto retry; > > @@ -3347,18 +3392,20 @@ static void age_active_anon(struct pglist_data *pgdat, > > struct scan_control *sc) > > { > > struct mem_cgroup *memcg; > > + struct lruvec *lruvec; > > > > if (!total_swap_pages) > > return; > > > > + lruvec = mem_cgroup_lruvec(NULL, pgdat); > > + if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON)) > > + return; > > + > > memcg = mem_cgroup_iter(NULL, NULL, NULL); > > do { > > - struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); > > - > > - if (inactive_list_is_low(lruvec, false, sc, true)) > > - shrink_active_list(SWAP_CLUSTER_MAX, lruvec, > > - sc, LRU_ACTIVE_ANON); > > - > > + lruvec = mem_cgroup_lruvec(memcg, pgdat); > > + shrink_active_list(SWAP_CLUSTER_MAX, lruvec, > > + sc, LRU_ACTIVE_ANON); > > memcg = mem_cgroup_iter(NULL, memcg, NULL); > > } while (memcg); > > } > > -- > > 2.24.0 > >