From: Kairui Song <kasong@xxxxxxxxxxx>

Instead of doing multiple tree walks, do one optimistic range check
with the lock held, and exit if raced with another insertion. If a
shadow entry exists, check it with the new xas_get_order helper
before releasing the lock, to avoid a redundant tree walk just for
retrieving its order.

Drop the lock and do the allocation only if a split is needed.

In the best case, the tree is walked only once. If an allocation and
split are needed, three walks are issued (one for the first ranged
conflict check and order retrieval, one for the re-check after the
allocation, and one for the insert after the split).

Testing with 4K pages, in an 8G cgroup, with 20G brd as block device:

  fio -name=cached --numjobs=16 --filename=/mnt/test.img \
    --buffered=1 --ioengine=mmap --rw=randread --time_based \
    --ramp_time=30s --runtime=5m --group_reporting

Before:
bw (  MiB/s): min=  790, max= 3665, per=100.00%, avg=2499.17, stdev=20.64, samples=8698
iops        : min=202295, max=938417, avg=639785.81, stdev=5284.08, samples=8698

After (+4%):
bw (  MiB/s): min=  451, max= 3868, per=100.00%, avg=2599.83, stdev=23.39, samples=8653
iops        : min=115596, max=990364, avg=665556.34, stdev=5988.20, samples=8653

Test result with THP (do a THP randread first, then switch to 4K page
reads, in the hope that this triggers a lot of splitting):

  fio -name=cached --numjobs=16 --filename=/mnt/test.img \
    --buffered=1 --ioengine mmap -thp=1 --readonly \
    --rw=randread --random_distribution=random \
    --time_based --runtime=5m --group_reporting

  fio -name=cached --numjobs=16 --filename=/mnt/test.img \
    --buffered=1 --ioengine mmap --readonly \
    --rw=randread --random_distribution=random \
    --time_based --runtime=5s --group_reporting

Before:
bw (  KiB/s): min=28071, max=62359, per=100.00%, avg=53542.44, stdev=179.77, samples=9520
iops        : min= 7012, max=15586, avg=13379.39, stdev=44.94, samples=9520
bw (  MiB/s): min= 2457, max= 6193, per=100.00%, avg=3923.21, stdev=82.48, samples=144
iops        : min=629220, max=1585642, avg=1004340.78, stdev=21116.07, samples=144

After (+-0.0%):
bw (  KiB/s): min=30561, max=63064, per=100.00%, avg=53635.82, stdev=177.21, samples=9520
iops        : min= 7636, max=15762, avg=13402.82, stdev=44.29, samples=9520
bw (  MiB/s): min= 2449, max= 6145, per=100.00%, avg=3914.68, stdev=81.15, samples=144
iops        : min=627106, max=1573156, avg=1002158.11, stdev=20774.77, samples=144

Performance is better (+4%) for 4K cached reads and unchanged for THP.
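For illustration only (not part of the patch): the retry scheme above
reduces to the shape of the user-space sketch below, where struct
tree, try_insert_locked() and the malloc()'d prealloc buffer are
hypothetical stand-ins for the XArray, the conflict check and
xas_split_alloc(). The point is that the lock is held across the
common fast path and dropped only when an allocation is unavoidable,
after which the state must be re-checked, since another insertion may
have raced in while the lock was not held:

  #include <errno.h>
  #include <pthread.h>
  #include <stdlib.h>

  struct tree {
      pthread_mutex_t lock;
      void *slot;     /* stand-in for the xarray slot(s) */
  };

  /*
   * Stand-in for the locked fast path: 0 on success, -EEXIST on a
   * raced insertion, -ENOMEM when a split needs memory first.
   */
  static int try_insert_locked(struct tree *t, void *item, void *prealloc)
  {
      if (t->slot)
          return -EEXIST;
      t->slot = item;
      return 0;
  }

  static int insert(struct tree *t, void *item)
  {
      void *prealloc = NULL;
      int ret;

      pthread_mutex_lock(&t->lock);
      while ((ret = try_insert_locked(t, item, prealloc)) == -ENOMEM) {
          /* Drop the lock only when an allocation is needed. */
          pthread_mutex_unlock(&t->lock);
          free(prealloc);
          prealloc = malloc(64);
          if (!prealloc)
              return -ENOMEM;
          /* Retake the lock; the loop re-checks for races. */
          pthread_mutex_lock(&t->lock);
      }
      pthread_mutex_unlock(&t->lock);
      free(prealloc);
      return ret;
  }

  int main(void)
  {
      struct tree t = { .lock = PTHREAD_MUTEX_INITIALIZER };
      int x;
      return insert(&t, &x) ? 1 : 0;
  }

The real __split_add_folio_locked() below additionally re-validates
the shadow entry and its order after relocking (the alloced_shadow /
alloced_order comparison), since the entry may have been split by a
racing thread while the lock was dropped.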
Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx>
---
 mm/filemap.c | 127 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 76 insertions(+), 51 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 6bbec8783793..c1484bcdbddb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -848,12 +848,77 @@ void replace_page_cache_folio(struct folio *old, struct folio *new)
 }
 EXPORT_SYMBOL_GPL(replace_page_cache_folio);
 
+static int __split_add_folio_locked(struct xa_state *xas, struct folio *folio,
+				    pgoff_t index, gfp_t gfp, void **shadowp)
+{
+	void *entry, *shadow, *alloced_shadow = NULL;
+	int order, alloced_order = 0;
+
+	gfp &= GFP_RECLAIM_MASK;
+	for (;;) {
+		shadow = NULL;
+		order = 0;
+
+		xas_for_each_conflict(xas, entry) {
+			if (!xa_is_value(entry))
+				return -EEXIST;
+			shadow = entry;
+		}
+
+		if (shadow) {
+			if (shadow == xas_reload(xas)) {
+				order = xas_get_order(xas);
+				if (order && order > folio_order(folio)) {
+					/* entry may have been split before we acquired lock */
+					if (shadow != alloced_shadow || order != alloced_order)
+						goto unlock;
+					xas_split(xas, shadow, order);
+					xas_reset(xas);
+				}
+				order = 0;
+			}
+			if (shadowp)
+				*shadowp = shadow;
+		}
+
+		xas_store(xas, folio);
+		/* Success, return with mapping locked */
+		if (!xas_error(xas))
+			return 0;
+unlock:
+		/*
+		 * Unlock path, if errored, return unlocked.
+		 * If allocation needed, alloc and retry.
+		 */
+		xas_unlock_irq(xas);
+		if (order) {
+			if (unlikely(alloced_order))
+				xas_destroy(xas);
+			xas_split_alloc(xas, shadow, order, gfp);
+			if (!xas_error(xas)) {
+				alloced_shadow = shadow;
+				alloced_order = order;
+			}
+			goto next;
+		}
+		/* xas_nomem result checked by xas_error below */
+		xas_nomem(xas, gfp);
+next:
+		xas_lock_irq(xas);
+		if (xas_error(xas))
+			return xas_error(xas);
+
+		xas_reset(xas);
+	}
+}
+
 noinline int __filemap_add_folio(struct address_space *mapping,
 		struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
 {
 	XA_STATE(xas, &mapping->i_pages, index);
 	bool huge = folio_test_hugetlb(folio);
 	long nr;
+	int ret;
 
 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
 	VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
@@ -863,70 +928,30 @@ noinline int __filemap_add_folio(struct address_space *mapping,
 	xas_set_order(&xas, index, folio_order(folio));
 	nr = folio_nr_pages(folio);
 
-	gfp &= GFP_RECLAIM_MASK;
 	folio_ref_add(folio, nr);
 	folio->mapping = mapping;
 	folio->index = xas.xa_index;
 
-	do {
-		unsigned int order = xa_get_order(xas.xa, xas.xa_index);
-		void *entry, *old = NULL;
-
-		if (order > folio_order(folio)) {
-			xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
-					order, gfp);
-			if (xas_error(&xas))
-				goto error;
-		}
-		xas_lock_irq(&xas);
-		xas_for_each_conflict(&xas, entry) {
-			old = entry;
-			if (!xa_is_value(entry)) {
-				xas_set_err(&xas, -EEXIST);
-				goto unlock;
-			}
-		}
-
-		if (old) {
-			if (shadowp)
-				*shadowp = old;
-			/* entry may have been split before we acquired lock */
-			order = xa_get_order(xas.xa, xas.xa_index);
-			if (order > folio_order(folio)) {
-				/* How to handle large swap entries? */
-				BUG_ON(shmem_mapping(mapping));
-				xas_split(&xas, old, order);
-				xas_reset(&xas);
-			}
-		}
-
-		xas_store(&xas, folio);
-		if (xas_error(&xas))
-			goto unlock;
-
+	xas_lock_irq(&xas);
+	ret = __split_add_folio_locked(&xas, folio, index, gfp, shadowp);
+	if (likely(!ret)) {
 		mapping->nrpages += nr;
-
-		/* hugetlb pages do not participate in page cache accounting */
 		if (!huge) {
 			__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
 			if (folio_test_pmd_mappable(folio))
 				__lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr);
 		}
-unlock:
 		xas_unlock_irq(&xas);
-	} while (xas_nomem(&xas, gfp));
-
-	if (xas_error(&xas))
-		goto error;
+		trace_mm_filemap_add_to_page_cache(folio);
+	} else {
+		xas_unlock_irq(&xas);
+		folio->mapping = NULL;
+		/* Leave page->index set: truncation relies upon it */
+		folio_put_refs(folio, nr);
+	}
 
-	trace_mm_filemap_add_to_page_cache(folio);
-	return 0;
-error:
-	folio->mapping = NULL;
-	/* Leave page->index set: truncation relies upon it */
-	folio_put_refs(folio, nr);
-	return xas_error(&xas);
+	return ret;
 }
 ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);
-- 
2.43.0