Long-tailed direct reclaim latency was seen on high-memory (TBs)
machines: MGLRU is better at the 99th percentile but worse at the
99.9th. It turned out the old direct reclaim backoff, which tries to
enforce a minimum fairness among all eligible memcgs, over-swapped by
about (total_mem>>DEF_PRIORITY)-nr_to_reclaim:

  /* adjust priority if memcg is offline or the target is met */
  if (!mem_cgroup_online(memcg))
          priority = 0;
  else if (sc->nr_reclaimed - reclaimed >= sc->nr_to_reclaim)
          priority = DEF_PRIORITY;
  else
          priority = sc->priority;

The new backoff, which pulls the plug on swapping once the target is
met, trades some fairness for curtailed latency. Specifically, in
should_abort_scan():

  /* over-swapping can increase allocation latency */
  if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
          return true;

The fundamental problem is that the backoff requires a sophisticated
model and the previous one was oversimplified. The new one may still
be, but at least it can handle a couple more corner cases on top of
the above:

1. /* age each memcg once to ensure fairness */
   if (max_seq - seq > 1)
           return true;

2. The NR_FREE_PAGES check at the bottom of should_abort_scan().

Signed-off-by: Yu Zhao <yuzhao@xxxxxxxxxx>
---
 mm/vmscan.c | 105 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 70 insertions(+), 35 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index f6eab73bdfb9..50764b2d462f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -135,10 +135,9 @@ struct scan_control {
 	unsigned int no_demotion:1;
 
 #ifdef CONFIG_LRU_GEN
-	/* help make better choices when multiple memcgs are available */
+	/* help kswapd make better choices among multiple memcgs */
 	unsigned int memcgs_need_aging:1;
-	unsigned int memcgs_need_swapping:1;
-	unsigned int memcgs_avoid_swapping:1;
+	unsigned long last_reclaimed;
 #endif
 
 	/* Allocation order */
@@ -4524,22 +4523,19 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 
 	VM_WARN_ON_ONCE(!current_is_kswapd());
 
+	sc->last_reclaimed = sc->nr_reclaimed;
+
 	/*
-	 * To reduce the chance of going into the aging path or swapping, which
-	 * can be costly, optimistically skip them unless their corresponding
-	 * flags were cleared in the eviction path. This improves the overall
-	 * performance when multiple memcgs are available.
+	 * To reduce the chance of going into the aging path, which can be
+	 * costly, optimistically skip it if the flag below was cleared in the
+	 * eviction path. This improves the overall performance when multiple
+	 * memcgs are available.
 	 */
 	if (!sc->memcgs_need_aging) {
 		sc->memcgs_need_aging = true;
-		sc->memcgs_avoid_swapping = !sc->memcgs_need_swapping;
-		sc->memcgs_need_swapping = true;
 		return;
 	}
 
-	sc->memcgs_need_swapping = true;
-	sc->memcgs_avoid_swapping = true;
-
 	set_mm_walk(pgdat);
 
 	memcg = mem_cgroup_iter(NULL, NULL, NULL);
@@ -5035,7 +5031,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
 
 	sc->nr_reclaimed += reclaimed;
 
-	if (type == LRU_GEN_ANON && need_swapping)
+	if (need_swapping && type == LRU_GEN_ANON)
 		*need_swapping = true;
 
 	return scanned;
@@ -5047,19 +5043,13 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
  *    reclaim.
  */
 static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
-				    bool can_swap, unsigned long reclaimed, bool *need_aging)
+				    bool can_swap, bool *need_aging)
 {
-	int priority;
 	unsigned long nr_to_scan;
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	DEFINE_MAX_SEQ(lruvec);
 	DEFINE_MIN_SEQ(lruvec);
 
-	if (fatal_signal_pending(current)) {
-		sc->nr_reclaimed += MIN_LRU_BATCH;
-		return 0;
-	}
-
 	if (mem_cgroup_below_min(memcg) ||
 	    (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
 		return 0;
@@ -5068,15 +5058,7 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
 	if (!nr_to_scan)
 		return 0;
 
-	/* adjust priority if memcg is offline or the target is met */
-	if (!mem_cgroup_online(memcg))
-		priority = 0;
-	else if (sc->nr_reclaimed - reclaimed >= sc->nr_to_reclaim)
-		priority = DEF_PRIORITY;
-	else
-		priority = sc->priority;
-
-	nr_to_scan >>= priority;
+	nr_to_scan >>= mem_cgroup_online(memcg) ? sc->priority : 0;
 
 	if (!nr_to_scan)
 		return 0;
@@ -5084,7 +5066,7 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
 		return nr_to_scan;
 
 	/* skip the aging path at the default priority */
-	if (priority == DEF_PRIORITY)
+	if (sc->priority == DEF_PRIORITY)
 		goto done;
 
 	/* leave the work to lru_gen_age_node() */
@@ -5097,6 +5079,60 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
 	return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
 }
 
+static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
+			      struct scan_control *sc, bool need_swapping)
+{
+	int i;
+	DEFINE_MAX_SEQ(lruvec);
+
+	if (!current_is_kswapd()) {
+		/* age each memcg once to ensure fairness */
+		if (max_seq - seq > 1)
+			return true;
+
+		/* over-swapping can increase allocation latency */
+		if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
+			return true;
+
+		/* give this thread a chance to exit and free its memory */
+		if (fatal_signal_pending(current)) {
+			sc->nr_reclaimed += MIN_LRU_BATCH;
+			return true;
+		}
+
+		if (cgroup_reclaim(sc))
+			return false;
+	} else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)
+		return false;
+
+	/* keep scanning at low priorities to ensure fairness */
+	if (sc->priority > DEF_PRIORITY - 2)
+		return false;
+
+	/*
+	 * A minimum amount of work was done under global memory pressure. For
+	 * kswapd, it may be overshooting. For direct reclaim, the target isn't
+	 * met, and yet the allocation may still succeed, since kswapd may have
+	 * caught up. In either case, it's better to stop now, and restart if
+	 * necessary.
+	 */
+	for (i = 0; i <= sc->reclaim_idx; i++) {
+		unsigned long wmark;
+		struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
+
+		if (!managed_zone(zone))
+			continue;
+
+		wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
+		if (wmark > zone_page_state(zone, NR_FREE_PAGES))
+			return false;
+	}
+
+	sc->nr_reclaimed += MIN_LRU_BATCH;
+
+	return true;
+}
+
 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 {
 	struct blk_plug plug;
@@ -5104,6 +5140,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
 	bool need_swapping = false;
 	unsigned long scanned = 0;
 	unsigned long reclaimed = sc->nr_reclaimed;
+	DEFINE_MAX_SEQ(lruvec);
 
 	lru_add_drain();
 
@@ -5123,7 +5160,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
 		else
 			swappiness = 0;
 
-		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, reclaimed, &need_aging);
+		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
 		if (!nr_to_scan)
 			goto done;
 
@@ -5135,17 +5172,15 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
 		if (scanned >= nr_to_scan)
 			break;
 
-		if (sc->memcgs_avoid_swapping && swappiness < 200 && need_swapping)
+		if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
 			break;
 
 		cond_resched();
 	}
 
 	/* see the comment in lru_gen_age_node() */
-	if (!need_aging)
+	if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
 		sc->memcgs_need_aging = false;
-	if (!need_swapping)
-		sc->memcgs_need_swapping = false;
 
 done:
 	clear_mm_walk();
-- 
2.37.3.968.ga6b4b080e4-goog