From: liuhailong <liuhailong@xxxxxxxx>

Since commit 5da226dbfce3 ("mm: skip CMA pages when they are not
available"), direct reclaim skips CMA folios when the allocation that
triggered it is not movable. This generally speeds up the reclamation of
non-movable folios. However, when system memory is tight and most of the
reclaimable folios on the LRU come from CMA, it can degenerate into long
idle loops in which the reclaimer burns CPU without reclaiming anything.

I traced a thread entering direct reclaim and printed the relevant
fields of its scan_control (sc):

__alloc_pages_direct_reclaim start
sc->priority:9 sc->nr_skipped_cma:32208  sc->nr_scanned:36  sc->nr_reclaimed:3
sc->priority:8 sc->nr_skipped_cma:32199  sc->nr_scanned:69  sc->nr_reclaimed:3
sc->priority:7 sc->nr_skipped_cma:198405 sc->nr_scanned:121 sc->nr_reclaimed:3
sc->priority:6 sc->nr_skipped_cma:236713 sc->nr_scanned:147 sc->nr_reclaimed:3
sc->priority:5 sc->nr_skipped_cma:708209 sc->nr_scanned:379 sc->nr_reclaimed:3
sc->priority:4 sc->nr_skipped_cma:785537 sc->nr_scanned:646 sc->nr_reclaimed:3
__alloc_pages_direct_reclaim end duration 3356ms

Continuously skipping CMA while the LRU is filled with CMA folios can
also keep lmkd from terminating processes. The psi_memstall period,
which spans the __alloc_pages_direct_reclaim call, becomes excessively
long (a couple of seconds, for example), so lmkd fails to wake up and
kill processes promptly.

This patch introduces no_skip_cma and sets it once reclaim is struggling
(priority below DEF_PRIORITY - 2) and the skipped CMA folios outnumber
the scanned folios by more than a factor of eight. It offers two
benefits:

- rather than spinning in an idle loop, the thread helps other threads
  by reclaiming some folios itself;
- the psi_memstall duration is shortened, so lmkd is woken within a few
  milliseconds.

Signed-off-by: liuhailong <liuhailong@xxxxxxxx>
---
 mm/vmscan.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index fa321c125099..2c74c1c94d88 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -114,6 +114,9 @@ struct scan_control {
 	/* Proactive reclaim invoked by userspace through memory.reclaim */
 	unsigned int proactive:1;
 
+	/* Do not skip CMA pages during direct reclaim */
+	unsigned int no_skip_cma:1;
+
 	/*
 	 * Cgroup memory below memory.low is protected as long as we
 	 * don't threaten to OOM.  If any cgroup is reclaimed at
@@ -157,6 +160,9 @@ struct scan_control {
 	/* Number of pages freed so far during a call to shrink_zones() */
 	unsigned long nr_reclaimed;
 
+	/* Number of CMA pages skipped so far during a call to shrink_zones() */
+	unsigned long nr_skipped_cma;
+
 	struct {
 		unsigned int dirty;
 		unsigned int unqueued_dirty;
@@ -1572,9 +1578,13 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
  */
 static bool skip_cma(struct folio *folio, struct scan_control *sc)
 {
-	return !current_is_kswapd() &&
+	bool ret = !current_is_kswapd() && !sc->no_skip_cma &&
 		gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE &&
 		folio_migratetype(folio) == MIGRATE_CMA;
+
+	if (ret)
+		sc->nr_skipped_cma += folio_nr_pages(folio);
+	return ret;
 }
 #else
 static bool skip_cma(struct folio *folio, struct scan_control *sc)
@@ -6188,6 +6198,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 			vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
 					sc->priority);
 		sc->nr_scanned = 0;
+		sc->nr_skipped_cma = 0;
 		shrink_zones(zonelist, sc);
 
 		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
@@ -6202,6 +6213,16 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		 */
 		if (sc->priority < DEF_PRIORITY - 2)
 			sc->may_writepage = 1;
+
+		/*
+		 * If we are having trouble reclaiming non-CMA pages while a
+		 * substantial number of CMA pages sit on the LRU, start
+		 * reclaiming CMA pages as well to help other threads and to
+		 * shrink the LRU.
+		 */
+		if (sc->priority < DEF_PRIORITY - 2 &&
+		    sc->nr_scanned < (sc->nr_skipped_cma >> 3))
+			sc->no_skip_cma = 1;
 	} while (--sc->priority >= 0);
 
 	last_pgdat = NULL;
-- 
2.36.1
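A note on the new threshold, for reviewers. The sketch below is not part
of the patch: it is a minimal userspace program that replays the traced
scan_control values through the same check added at the end of the
do_try_to_free_pages() priority loop. It assumes DEF_PRIORITY == 12 (its
value in include/linux/mmzone.h), and the helper name
would_set_no_skip_cma() is invented purely for illustration. With the
traced numbers the condition is already met at the priority-9 pass
(32208 >> 3 = 4026, well above the 36 folios scanned), so no_skip_cma
would be set there and CMA folios would stop being skipped from the next
pass on, instead of the thread looping down to priority 4.

#include <stdio.h>

#define DEF_PRIORITY	12	/* matches the kernel's definition */

/* Same condition as the check added to do_try_to_free_pages() */
static int would_set_no_skip_cma(int priority, unsigned long nr_scanned,
				 unsigned long nr_skipped_cma)
{
	return priority < DEF_PRIORITY - 2 &&
	       nr_scanned < (nr_skipped_cma >> 3);
}

int main(void)
{
	/* priority, nr_scanned, nr_skipped_cma taken from the trace above */
	static const struct {
		int prio;
		unsigned long scanned;
		unsigned long skipped;
	} trace[] = {
		{ 9,  36, 32208  },
		{ 8,  69, 32199  },
		{ 7, 121, 198405 },
		{ 6, 147, 236713 },
		{ 5, 379, 708209 },
		{ 4, 646, 785537 },
	};

	for (unsigned int i = 0; i < sizeof(trace) / sizeof(trace[0]); i++)
		printf("priority %d: skipped>>3 = %lu, scanned = %lu -> no_skip_cma %s\n",
		       trace[i].prio, trace[i].skipped >> 3, trace[i].scanned,
		       would_set_no_skip_cma(trace[i].prio, trace[i].scanned,
					     trace[i].skipped) ? "set" : "not set");
	return 0;
}

The ">> 3" keeps the heuristic conservative: CMA skipping is only disabled
once the skipped CMA folios outnumber the scanned folios by more than a
factor of eight; otherwise the behaviour introduced by commit 5da226dbfce3
is unchanged.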