From: Chen Ridong <chenridong@xxxxxxxxxx>

As commit 359a5e1416ca ("mm: multi-gen LRU: retry folios written back
while isolated") mentioned:

  The page reclaim isolates a batch of folios from the tail of one of
  the LRU lists and works on those folios one by one. For a suitable
  swap-backed folio, if the swap device is async, it queues that folio
  for writeback. After the page reclaim finishes an entire batch, it
  puts back the folios it queued for writeback to the head of the
  original LRU list.

  In the meantime, the page writeback flushes the queued folios also by
  batches. Its batching logic is independent from that of the page
  reclaim. For each of the folios it writes back, the page writeback
  calls folio_rotate_reclaimable() which tries to rotate a folio to the
  tail.

  folio_rotate_reclaimable() only works for a folio after the page
  reclaim has put it back. If an async swap device is fast enough, the
  page writeback can finish with that folio while the page reclaim is
  still working on the rest of the batch containing it. In this case,
  that folio will remain at the head and the page reclaim will not
  retry it before reaching there.

Commit 359a5e1416ca only fixed the issue for MGLRU. However, the same
issue also exists in the traditional active/inactive LRU; it was
reported at [1] and can be reproduced with the following steps:

1. Compile with CONFIG_TRANSPARENT_HUGEPAGE=y.
2. Mount memcg v1, create a memcg named test_memcg, and set
   limit_in_bytes=1G and memsw.limit_in_bytes=2G.
3. Create a 1G swap file, and allocate 1.05G of anon memory in
   test_memcg (a reproducer sketch for this step follows the changelog
   below).

It was found that:

  cat memory.limit_in_bytes
  1073741824
  cat memory.memsw.limit_in_bytes
  2147483648
  cat memory.usage_in_bytes
  1073664000
  cat memory.memsw.usage_in_bytes
  1129840640

  free -h
               total        used        free
  Mem:          31Gi       1.2Gi        28Gi
  Swap:        1.0Gi       1.0Gi       2.0Mi

As shown above, test_memcg used only about 50M of swap
(memsw.usage_in_bytes minus usage_in_bytes), but almost the entire 1G
of swap space was consumed, which means that 900M+ of swap space may be
wasted because other memcgs cannot use it.

Fix this in the same way as MGLRU: first extract the common logic into
a new helper, find_folios_written_back(), then reuse it in
shrink_inactive_list(), and finally retry reclaiming those folios that
may have missed the rotation for the traditional LRU.

With the same test case after this change, only 54M of swap was used:

  cat memory.usage_in_bytes
  1073463296
  cat memory.memsw.usage_in_bytes
  1129828352

  free -h
               total        used        free
  Mem:          31Gi       1.2Gi        28Gi
  Swap:        1.0Gi        54Mi       969Mi

[1] https://lore.kernel.org/linux-kernel/20241010081802.290893-1-chenridong@xxxxxxxxxxxxxxx/
[2] https://lore.kernel.org/linux-kernel/CAGsJ_4zqL8ZHNRZ44o_CC69kE7DBVXvbZfvmQxMGiFqRxqHQdA@xxxxxxxxxxxxxx/

Signed-off-by: Chen Ridong <chenridong@xxxxxxxxxx>
---
v6 -> v7:
- fix a conflict based on mm-unstable.
- update the commit message (quote from Yu's commit message, and add
  the improvement numbers after the change).
- restore 'is_retrying' to 'skip_retry' to keep the original semantics.
v6: https://lore.kernel.org/linux-kernel/20241223082004.3759152-1-chenridong@xxxxxxxxxxxxxxx/
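
Not part of the patch itself: below is a minimal userspace sketch of
step 3 of the reproducer above (allocate and touch ~1.05G of anon
memory). The exact size constant and the way the process is attached to
test_memcg (for example, writing its PID to the memcg's cgroup.procs
before running it) are assumptions and may need adjusting; the cgroup
and swap-file setup from steps 2 and 3 is not performed here.

/* Hypothetical reproducer for step 3: keep ~1.05G of anon memory resident. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t size = (size_t)1075 * 1024 * 1024;	/* ~1.05G */
	char *buf;

	/* Anonymous private mapping; charged to the caller's memcg. */
	buf = mmap(NULL, size, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Touch every byte so the pages are actually allocated. */
	memset(buf, 0x5a, size);

	printf("pid %d: allocated %zu bytes, check memory.usage_in_bytes\n",
	       (int)getpid(), size);
	pause();	/* keep the memory resident while inspecting the counters */

	return 0;
}

Because the memcg memory limit is 1G, touching the full 1.05G forces
reclaim inside test_memcg and pushes part of the anon memory out to the
swap file, which is what exposes the folios left at the head of the LRU
as described above.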
 mm/vmscan.c | 114 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 76 insertions(+), 38 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 01dce6f26..6861b6937 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -183,6 +183,9 @@ struct scan_control {
 	struct reclaim_state reclaim_state;
 };
 
+static inline void find_folios_written_back(struct list_head *list,
+		struct list_head *clean, struct lruvec *lruvec, int type, bool skip_retry);
+
 #ifdef ARCH_HAS_PREFETCHW
 #define prefetchw_prev_lru_folio(_folio, _base, _field)			\
 	do {								\
@@ -1960,14 +1963,18 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
 				     enum lru_list lru)
 {
 	LIST_HEAD(folio_list);
+	LIST_HEAD(clean_list);
 	unsigned long nr_scanned;
-	unsigned int nr_reclaimed = 0;
+	unsigned int nr_reclaimed, total_reclaimed = 0;
+	unsigned int nr_pageout = 0;
+	unsigned int nr_unqueued_dirty = 0;
 	unsigned long nr_taken;
 	struct reclaim_stat stat;
 	bool file = is_file_lru(lru);
 	enum vm_event_item item;
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 	bool stalled = false;
+	bool skip_retry = false;
 
 	while (unlikely(too_many_isolated(pgdat, file, sc))) {
 		if (stalled)
@@ -2001,22 +2008,47 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
 	if (nr_taken == 0)
 		return 0;
 
+retry:
 	nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
 
+	sc->nr.dirty += stat.nr_dirty;
+	sc->nr.congested += stat.nr_congested;
+	sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
+	sc->nr.writeback += stat.nr_writeback;
+	sc->nr.immediate += stat.nr_immediate;
+	total_reclaimed += nr_reclaimed;
+	nr_pageout += stat.nr_pageout;
+	nr_unqueued_dirty += stat.nr_unqueued_dirty;
+
+	trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
+			nr_scanned, nr_reclaimed, &stat, sc->priority, file);
+
+	find_folios_written_back(&folio_list, &clean_list, lruvec, 0, skip_retry);
+
 	spin_lock_irq(&lruvec->lru_lock);
 	move_folios_to_lru(lruvec, &folio_list);
 
 	__mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(),
 					stat.nr_demoted);
-	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
 	item = PGSTEAL_KSWAPD + reclaimer_offset();
 	if (!cgroup_reclaim(sc))
 		__count_vm_events(item, nr_reclaimed);
 	__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
 	__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
+
+	if (!list_empty(&clean_list)) {
+		list_splice_init(&clean_list, &folio_list);
+		skip_retry = true;
+		spin_unlock_irq(&lruvec->lru_lock);
+		goto retry;
+	}
+	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
 	spin_unlock_irq(&lruvec->lru_lock);
+	sc->nr.taken += nr_taken;
+	if (file)
+		sc->nr.file_taken += nr_taken;
 
-	lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
+	lru_note_cost(lruvec, file, nr_pageout, nr_scanned - total_reclaimed);
 
 	/*
 	 * If dirty folios are scanned that are not queued for IO, it
@@ -2029,7 +2061,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
 	 * the flushers simply cannot keep up with the allocation
 	 * rate. Nudge the flusher threads in case they are asleep.
 	 */
-	if (stat.nr_unqueued_dirty == nr_taken) {
+	if (nr_unqueued_dirty == nr_taken) {
 		wakeup_flusher_threads(WB_REASON_VMSCAN);
 		/*
 		 * For cgroupv1 dirty throttling is achieved by waking up
@@ -2044,18 +2076,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
 			reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
 	}
 
-	sc->nr.dirty += stat.nr_dirty;
-	sc->nr.congested += stat.nr_congested;
-	sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
-	sc->nr.writeback += stat.nr_writeback;
-	sc->nr.immediate += stat.nr_immediate;
-	sc->nr.taken += nr_taken;
-	if (file)
-		sc->nr.file_taken += nr_taken;
-
-	trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
-			nr_scanned, nr_reclaimed, &stat, sc->priority, file);
-	return nr_reclaimed;
+	return total_reclaimed;
 }
 
 /*
@@ -4637,8 +4658,6 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
 	int reclaimed;
 	LIST_HEAD(list);
 	LIST_HEAD(clean);
-	struct folio *folio;
-	struct folio *next;
 	enum vm_event_item item;
 	struct reclaim_stat stat;
 	struct lru_gen_mm_walk *walk;
@@ -4668,26 +4687,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
 			scanned, reclaimed, &stat, sc->priority,
 			type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
 
-	list_for_each_entry_safe_reverse(folio, next, &list, lru) {
-		DEFINE_MIN_SEQ(lruvec);
-
-		if (!folio_evictable(folio)) {
-			list_del(&folio->lru);
-			folio_putback_lru(folio);
-			continue;
-		}
-
-		/* retry folios that may have missed folio_rotate_reclaimable() */
-		if (!skip_retry && !folio_test_active(folio) && !folio_mapped(folio) &&
-		    !folio_test_dirty(folio) && !folio_test_writeback(folio)) {
-			list_move(&folio->lru, &clean);
-			continue;
-		}
-
-		/* don't add rejected folios to the oldest generation */
-		if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type])
-			set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active));
-	}
+	find_folios_written_back(&list, &clean, lruvec, type, skip_retry);
 
 	spin_lock_irq(&lruvec->lru_lock);
 
@@ -5706,6 +5706,44 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
 
 #endif /* CONFIG_LRU_GEN */
 
+/**
+ * find_folios_written_back - Find and move the written-back folios to a new list.
+ * @list: the folios list
+ * @clean: the list to hold the written-back folios
+ * @lruvec: the lruvec
+ * @type: LRU_GEN_ANON/LRU_GEN_FILE, only used by the multi-gen LRU
+ * @skip_retry: whether to skip the retry
+ */
+static inline void find_folios_written_back(struct list_head *list,
+		struct list_head *clean, struct lruvec *lruvec, int type, bool skip_retry)
+{
+	struct folio *folio;
+	struct folio *next;
+
+	list_for_each_entry_safe_reverse(folio, next, list, lru) {
+#ifdef CONFIG_LRU_GEN
+		DEFINE_MIN_SEQ(lruvec);
+#endif
+		if (!folio_evictable(folio)) {
+			list_del(&folio->lru);
+			folio_putback_lru(folio);
+			continue;
+		}
+
+		/* retry folios that may have missed folio_rotate_reclaimable() */
+		if (!skip_retry && !folio_test_active(folio) && !folio_mapped(folio) &&
+		    !folio_test_dirty(folio) && !folio_test_writeback(folio)) {
+			list_move(&folio->lru, clean);
+			continue;
+		}
+#ifdef CONFIG_LRU_GEN
+		/* don't add rejected folios to the oldest generation */
+		if (lru_gen_enabled() && lru_gen_folio_seq(lruvec, folio, false) == min_seq[type])
+			set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active));
+#endif
+	}
+}
+
 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 {
 	unsigned long nr[NR_LRU_LISTS];
-- 
2.34.1