From: Chen Ridong <chenridong@xxxxxxxxxx>

As commit 359a5e1416ca ("mm: multi-gen LRU: retry folios written back
while isolated") mentioned:

  The page reclaim isolates a batch of folios from the tail of one of
  the LRU lists and works on those folios one by one. For a suitable
  swap-backed folio, if the swap device is async, it queues that folio
  for writeback. After the page reclaim finishes an entire batch, it
  puts back the folios it queued for writeback to the head of the
  original LRU list.

  In the meantime, the page writeback flushes the queued folios also by
  batches. Its batching logic is independent from that of the page
  reclaim. For each of the folios it writes back, the page writeback
  calls folio_rotate_reclaimable() which tries to rotate a folio to the
  tail.

  folio_rotate_reclaimable() only works for a folio after the page
  reclaim has put it back. If an async swap device is fast enough, the
  page writeback can finish with that folio while the page reclaim is
  still working on the rest of the batch containing it. In this case,
  that folio will remain at the head and the page reclaim will not
  retry it before reaching there.

Commit 359a5e1416ca only fixed the issue for MGLRU. However, the same
issue also exists in the traditional active/inactive LRU; it was
reported at [1] and can be reproduced with the following steps:

1. Compile with CONFIG_TRANSPARENT_HUGEPAGE=y.
2. Mount memcg v1, create a memcg named test_memcg, and set
   limit_in_bytes=1G and memsw.limit_in_bytes=2G.
3. Create a 1G swap file, and allocate 1.05G of anon memory in
   test_memcg (a reproducer sketch for this step follows the changelog
   below).

It was found that:

  cat memory.limit_in_bytes
  1073741824
  cat memory.memsw.limit_in_bytes
  2147483648
  cat memory.usage_in_bytes
  1073664000
  cat memory.memsw.usage_in_bytes
  1129840640

  free -h
               total        used        free
  Mem:          31Gi       1.2Gi        28Gi
  Swap:        1.0Gi       1.0Gi       2.0Mi

As shown above, test_memcg used only about 50M of swap
(memsw.usage_in_bytes minus usage_in_bytes), but almost the entire 1G
of swap space was consumed, which means that 900M+ of swap space may be
wasted because other memcgs cannot use it.

Fix this in the same way as MGLRU: first extract the common logic into
a new helper, find_folios_written_back(), then reuse it in
shrink_inactive_list(), and finally retry reclaiming those folios that
may have missed the rotation for the traditional LRU.

With the same test case after this change, only 54M of swap was used:

  cat memory.usage_in_bytes
  1073463296
  cat memory.memsw.usage_in_bytes
  1129828352

  free -h
               total        used        free
  Mem:          31Gi       1.2Gi        28Gi
  Swap:        1.0Gi        54Mi       969Mi

[1] https://lore.kernel.org/linux-kernel/20241010081802.290893-1-chenridong@xxxxxxxxxxxxxxx/
[2] https://lore.kernel.org/linux-kernel/CAGsJ_4zqL8ZHNRZ44o_CC69kE7DBVXvbZfvmQxMGiFqRxqHQdA@xxxxxxxxxxxxxx/

Signed-off-by: Chen Ridong <chenridong@xxxxxxxxxx>
---
v6 -> v7:
- fix a conflict based on mm-unstable.
- update the commit message (quote from Yu's commit message, and add
  the improvement numbers after the change).
- restore 'is_retrying' to 'skip_retry' to keep the original semantics.
v6: https://lore.kernel.org/linux-kernel/20241223082004.3759152-1-chenridong@xxxxxxxxxxxxxxx/
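
Not part of the patch itself: below is a minimal userspace sketch of
step 3 of the reproducer above (allocate and touch ~1.05G of anon
memory). The exact size constant and the way the process is attached to
test_memcg (for example, writing its PID to the memcg's cgroup.procs
before running it) are assumptions and may need adjusting; the cgroup
and swap-file setup from steps 2 and 3 is not performed here.

/* Hypothetical reproducer for step 3: keep ~1.05G of anon memory resident. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t size = (size_t)1075 * 1024 * 1024;	/* ~1.05G */
	char *buf;

	/* Anonymous private mapping; charged to the caller's memcg. */
	buf = mmap(NULL, size, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Touch every byte so the pages are actually allocated. */
	memset(buf, 0x5a, size);

	printf("pid %d: allocated %zu bytes, check memory.usage_in_bytes\n",
	       (int)getpid(), size);
	pause();	/* keep the memory resident while inspecting the counters */

	return 0;
}

Because the memcg memory limit is 1G, touching the full 1.05G forces
reclaim inside test_memcg and pushes part of the anon memory out to the
swap file, which is what exposes the folios left at the head of the LRU
as described above.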
 mm/vmscan.c | 114 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 76 insertions(+), 38 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 01dce6f26..6861b6937 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -183,6 +183,9 @@ struct scan_control {
 	struct reclaim_state reclaim_state;
 };
 
+static inline void find_folios_written_back(struct list_head *list,
+		struct list_head *clean, struct lruvec *lruvec, int type, bool skip_retry);
+
 #ifdef ARCH_HAS_PREFETCHW
 #define prefetchw_prev_lru_folio(_folio, _base, _field)			\
 	do {								\
@@ -1960,14 +1963,18 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
 				     enum lru_list lru)
 {
 	LIST_HEAD(folio_list);
+	LIST_HEAD(clean_list);
 	unsigned long nr_scanned;
-	unsigned int nr_reclaimed = 0;
+	unsigned int nr_reclaimed, total_reclaimed = 0;
+	unsigned int nr_pageout = 0;
+	unsigned int nr_unqueued_dirty = 0;
 	unsigned long nr_taken;
 	struct reclaim_stat stat;
 	bool file = is_file_lru(lru);
 	enum vm_event_item item;
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 	bool stalled = false;
+	bool skip_retry = false;
 
 	while (unlikely(too_many_isolated(pgdat, file, sc))) {
 		if (stalled)
@@ -2001,22 +2008,47 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
 	if (nr_taken == 0)
 		return 0;
 
+retry:
 	nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
 
+	sc->nr.dirty += stat.nr_dirty;
+	sc->nr.congested += stat.nr_congested;
+	sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
+	sc->nr.writeback += stat.nr_writeback;
+	sc->nr.immediate += stat.nr_immediate;
+	total_reclaimed += nr_reclaimed;
+	nr_pageout += stat.nr_pageout;
+	nr_unqueued_dirty += stat.nr_unqueued_dirty;
+
+	trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
+			nr_scanned, nr_reclaimed, &stat, sc->priority, file);
+
+	find_folios_written_back(&folio_list, &clean_list, lruvec, 0, skip_retry);
+
 	spin_lock_irq(&lruvec->lru_lock);
 	move_folios_to_lru(lruvec, &folio_list);
 
 	__mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(),
 					stat.nr_demoted);
-	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
 	item = PGSTEAL_KSWAPD + reclaimer_offset();
 	if (!cgroup_reclaim(sc))
 		__count_vm_events(item, nr_reclaimed);
 	__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
 	__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
+
+	if (!list_empty(&clean_list)) {
+		list_splice_init(&clean_list, &folio_list);
+		skip_retry = true;
+		spin_unlock_irq(&lruvec->lru_lock);
+		goto retry;
+	}
+	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
 	spin_unlock_irq(&lruvec->lru_lock);
+	sc->nr.taken += nr_taken;
+	if (file)
+		sc->nr.file_taken += nr_taken;
 
-	lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
+	lru_note_cost(lruvec, file, nr_pageout, nr_scanned - total_reclaimed);
 
 	/*
 	 * If dirty folios are scanned that are not queued for IO, it
@@ -2029,7 +2061,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
 	 * the flushers simply cannot keep up with the allocation
 	 * rate. Nudge the flusher threads in case they are asleep.
 	 */
-	if (stat.nr_unqueued_dirty == nr_taken) {
+	if (nr_unqueued_dirty == nr_taken) {
 		wakeup_flusher_threads(WB_REASON_VMSCAN);
 		/*
 		 * For cgroupv1 dirty throttling is achieved by waking up
@@ -2044,18 +2076,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
 			reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
 	}
 
-	sc->nr.dirty += stat.nr_dirty;
-	sc->nr.congested += stat.nr_congested;
-	sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
-	sc->nr.writeback += stat.nr_writeback;
-	sc->nr.immediate += stat.nr_immediate;
-	sc->nr.taken += nr_taken;
-	if (file)
-		sc->nr.file_taken += nr_taken;
-
-	trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
-			nr_scanned, nr_reclaimed, &stat, sc->priority, file);
-	return nr_reclaimed;
+	return total_reclaimed;
 }
 
 /*
@@ -4637,8 +4658,6 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
 	int reclaimed;
 	LIST_HEAD(list);
 	LIST_HEAD(clean);
-	struct folio *folio;
-	struct folio *next;
 	enum vm_event_item item;
 	struct reclaim_stat stat;
 	struct lru_gen_mm_walk *walk;
@@ -4668,26 +4687,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
 			scanned, reclaimed, &stat, sc->priority,
 			type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
 
-	list_for_each_entry_safe_reverse(folio, next, &list, lru) {
-		DEFINE_MIN_SEQ(lruvec);
-
-		if (!folio_evictable(folio)) {
-			list_del(&folio->lru);
-			folio_putback_lru(folio);
-			continue;
-		}
-
-		/* retry folios that may have missed folio_rotate_reclaimable() */
-		if (!skip_retry && !folio_test_active(folio) && !folio_mapped(folio) &&
-		    !folio_test_dirty(folio) && !folio_test_writeback(folio)) {
-			list_move(&folio->lru, &clean);
-			continue;
-		}
-
-		/* don't add rejected folios to the oldest generation */
-		if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type])
-			set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active));
-	}
+	find_folios_written_back(&list, &clean, lruvec, type, skip_retry);
 
 	spin_lock_irq(&lruvec->lru_lock);
 
@@ -5706,6 +5706,44 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
 
 #endif /* CONFIG_LRU_GEN */
 
+/**
+ * find_folios_written_back - Find and move the written-back folios to a new list.
+ * @list: the folios list
+ * @clean: the list to hold the written-back folios
+ * @lruvec: the lruvec
+ * @type: LRU_GEN_ANON/LRU_GEN_FILE, only used by the multi-gen LRU
+ * @skip_retry: whether to skip the retry
+ */
+static inline void find_folios_written_back(struct list_head *list,
+		struct list_head *clean, struct lruvec *lruvec, int type, bool skip_retry)
+{
+	struct folio *folio;
+	struct folio *next;
+
+	list_for_each_entry_safe_reverse(folio, next, list, lru) {
+#ifdef CONFIG_LRU_GEN
+		DEFINE_MIN_SEQ(lruvec);
+#endif
+		if (!folio_evictable(folio)) {
+			list_del(&folio->lru);
+			folio_putback_lru(folio);
+			continue;
+		}
+
+		/* retry folios that may have missed folio_rotate_reclaimable() */
+		if (!skip_retry && !folio_test_active(folio) && !folio_mapped(folio) &&
+		    !folio_test_dirty(folio) && !folio_test_writeback(folio)) {
+			list_move(&folio->lru, clean);
+			continue;
+		}
+#ifdef CONFIG_LRU_GEN
+		/* don't add rejected folios to the oldest generation */
+		if (lru_gen_enabled() && lru_gen_folio_seq(lruvec, folio, false) == min_seq[type])
+			set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active));
+#endif
+	}
+}
+
 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 {
 	unsigned long nr[NR_LRU_LISTS];
-- 
2.34.1