[PATCH 8/9] xfs: use vmalloc for multi-folio buffers

From: Christoph Hellwig <hch@xxxxxx>

Instead of allocating the folios manually using the bulk page
allocator and then mapping them with vm_map_ram(), just use vmalloc()
to allocate the entire buffer - vmalloc will use the bulk allocator
internally if it fits.
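
The shape of the change to the allocation path is roughly the
following (a minimal sketch rather than the exact code in the diff
below; retry and error handling elided):

	/* Before: bulk allocate single page folios, then map them. */
	filled = alloc_pages_bulk_array(gfp_mask, bp->b_folio_count,
			(struct page **)bp->b_folios);
	bp->b_addr = vm_map_ram((struct page **)bp->b_folios,
			bp->b_folio_count, -1);

	/* After: one call that both allocates and maps, using the bulk
	 * page allocator internally when it can. */
	bp->b_addr = __vmalloc(BBTOB(bp->b_length), gfp_mask);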

With this, the b_folios array can go away as well, as nothing uses it
any more.
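
The buffer address alone is enough to tell how to free the buffer, so
the free path can drop the array too (a sketch of the logic in the
diff, assuming kmem_to_folio() resolves a kernel address back to its
backing folio):

	if (bp->b_flags & _XBF_FOLIOS)
		__folio_put(kmem_to_folio(bp->b_addr)); /* high order folio */
	else
		kvfree(bp->b_addr);	/* kmalloc()ed or vmalloc()ed */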

[dchinner: port to folio-based buffers.]

Signed-off-by: Christoph Hellwig <hch@xxxxxx>
Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>
---
 fs/xfs/xfs_buf.c     | 164 ++++++++++++-------------------------------
 fs/xfs/xfs_buf.h     |   2 -
 fs/xfs/xfs_buf_mem.c |   9 +--
 3 files changed, 45 insertions(+), 130 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 303945554415..6d6bad80722e 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -282,29 +282,6 @@ _xfs_buf_alloc(
 	return 0;
 }
 
-static void
-xfs_buf_free_folios(
-	struct xfs_buf	*bp)
-{
-	uint		i;
-
-	ASSERT(bp->b_flags & _XBF_FOLIOS);
-
-	if (xfs_buf_is_vmapped(bp))
-		vm_unmap_ram(bp->b_addr, bp->b_folio_count);
-
-	for (i = 0; i < bp->b_folio_count; i++) {
-		if (bp->b_folios[i])
-			__folio_put(bp->b_folios[i]);
-	}
-	mm_account_reclaimed_pages(bp->b_folio_count);
-
-	if (bp->b_folios != bp->b_folio_array)
-		kfree(bp->b_folios);
-	bp->b_folios = NULL;
-	bp->b_flags &= ~_XBF_FOLIOS;
-}
-
 static void
 xfs_buf_free_callback(
 	struct callback_head	*cb)
@@ -323,13 +300,22 @@ xfs_buf_free(
 
 	ASSERT(list_empty(&bp->b_lru));
 
-	if (xfs_buftarg_is_mem(bp->b_target))
+	if (xfs_buftarg_is_mem(bp->b_target)) {
 		xmbuf_unmap_folio(bp);
-	else if (bp->b_flags & _XBF_FOLIOS)
-		xfs_buf_free_folios(bp);
-	else if (bp->b_flags & _XBF_KMEM)
-		kfree(bp->b_addr);
+		goto free;
+	}
 
+	if (!(bp->b_flags & _XBF_KMEM))
+		mm_account_reclaimed_pages(bp->b_folio_count);
+
+	if (bp->b_flags & _XBF_FOLIOS)
+		__folio_put(kmem_to_folio(bp->b_addr));
+	else
+		kvfree(bp->b_addr);
+
+	bp->b_flags &= ~(_XBF_KMEM | _XBF_FOLIOS);
+
+free:
 	call_rcu(&bp->b_rcu, xfs_buf_free_callback);
 }
 
@@ -356,8 +342,6 @@ xfs_buf_alloc_kmem(
 		bp->b_addr = NULL;
 		return -ENOMEM;
 	}
-	bp->b_folios = bp->b_folio_array;
-	bp->b_folios[0] = kmem_to_folio(bp->b_addr);
 	bp->b_folio_count = 1;
 	bp->b_flags |= _XBF_KMEM;
 	return 0;
@@ -377,14 +361,15 @@ xfs_buf_alloc_folio(
 	struct xfs_buf	*bp,
 	gfp_t		gfp_mask)
 {
+	struct folio	*folio;
 	int		length = BBTOB(bp->b_length);
 	int		order = get_order(length);
 
-	bp->b_folio_array[0] = folio_alloc(gfp_mask, order);
-	if (!bp->b_folio_array[0])
+	folio = folio_alloc(gfp_mask, order);
+	if (!folio)
 		return false;
 
-	bp->b_folios = bp->b_folio_array;
+	bp->b_addr = folio_address(folio);
 	bp->b_folio_count = 1;
 	bp->b_flags |= _XBF_FOLIOS;
 	return true;
@@ -400,15 +385,11 @@ xfs_buf_alloc_folio(
  * contiguous memory region that we don't have to map and unmap to access the
  * data directly.
  *
- * The second type of buffer is the multi-folio buffer. These are *always* made
- * up of single page folios so that they can be fed to vmap_ram() to return a
- * contiguous memory region we can access the data through.
- *
- * We don't use high order folios for this second type of buffer (yet) because
- * having variable size folios makes offset-to-folio indexing and iteration of
- * the data range more complex than if they are fixed size. This case should now
- * be the slow path, though, so unless we regularly fail to allocate high order
- * folios, there should be little need to optimise this path.
+ * The second type of buffer is the vmalloc()d buffer. This provides the buffer
+ * with the required contiguous virtual memory region, but it is backed by
+ * discontiguous physical pages. vmalloc() typically doesn't fail, but it can,
+ * so we may need to wrap the allocation in a retry loop to prevent spurious
+ * low memory failures and shutdowns.
  */
 static int
 xfs_buf_alloc_folios(
@@ -416,7 +397,7 @@ xfs_buf_alloc_folios(
 	xfs_buf_flags_t	flags)
 {
 	gfp_t		gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN;
-	long		filled = 0;
+	unsigned	nofs_flag;
 
 	if (flags & XBF_READ_AHEAD)
 		gfp_mask |= __GFP_NORETRY;
@@ -425,89 +406,32 @@ xfs_buf_alloc_folios(
 	if (!(flags & XBF_READ))
 		gfp_mask |= __GFP_ZERO;
 
-	/* Optimistically attempt a single high order folio allocation. */
-	if (xfs_buf_alloc_folio(bp, gfp_mask))
-		return 0;
-
-	/* Fall back to allocating an array of single page folios. */
+	/* Compute the worst case folio count for the vmalloc fallback. */
 	bp->b_folio_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE);
-	if (bp->b_folio_count <= XB_FOLIOS) {
-		bp->b_folios = bp->b_folio_array;
-	} else {
-		bp->b_folios = kzalloc(sizeof(struct folio *) * bp->b_folio_count,
-					gfp_mask);
-		if (!bp->b_folios)
-			return -ENOMEM;
-	}
-	bp->b_flags |= _XBF_FOLIOS;
 
+	/* Optimistically attempt a single high order folio allocation. */
+	if (xfs_buf_alloc_folio(bp, gfp_mask))
+		return 0;
+
+	/* We are done if an order-0 allocation has already failed. */
+	if (bp->b_folio_count == 1)
+		return -ENOMEM;
 
 	/*
-	 * Bulk filling of pages can take multiple calls. Not filling the entire
-	 * array is not an allocation failure, so don't back off if we get at
-	 * least one extra page.
+	 * XXX(dgc): I think dquot reclaim is the only place we can get
+	 * to this function from memory reclaim context now. If we fix
+	 * that like we've fixed inode reclaim to avoid writeback from
+	 * reclaim, this nofs wrapping can go away.
 	 */
-	for (;;) {
-		long	last = filled;
-
-		filled = alloc_pages_bulk_array(gfp_mask, bp->b_folio_count,
-						(struct page **)bp->b_folios);
-		if (filled == bp->b_folio_count) {
-			XFS_STATS_INC(bp->b_mount, xb_page_found);
-			break;
-		}
-
-		if (filled != last)
-			continue;
-
-		if (flags & XBF_READ_AHEAD) {
-			xfs_buf_free_folios(bp);
-			return -ENOMEM;
-		}
-
-		XFS_STATS_INC(bp->b_mount, xb_page_retries);
-		memalloc_retry_wait(gfp_mask);
-	}
-
-	if (bp->b_folio_count == 1) {
-		/* A single folio buffer is always mappable */
-		bp->b_addr = folio_address(bp->b_folios[0]);
-	} else {
-		int retried = 0;
-		unsigned nofs_flag;
-
-		/*
-		 * vm_map_ram() will allocate auxiliary structures (e.g.
-		 * pagetables) with GFP_KERNEL, yet we often under a scoped nofs
-		 * context here. Mixing GFP_KERNEL with GFP_NOFS allocations
-		 * from the same call site that can be run from both above and
-		 * below memory reclaim causes lockdep false positives. Hence we
-		 * always need to force this allocation to nofs context because
-		 * we can't pass __GFP_NOLOCKDEP down to auxillary structures to
-		 * prevent false positive lockdep reports.
-		 *
-		 * XXX(dgc): I think dquot reclaim is the only place we can get
-		 * to this function from memory reclaim context now. If we fix
-		 * that like we've fixed inode reclaim to avoid writeback from
-		 * reclaim, this nofs wrapping can go away.
-		 */
-		nofs_flag = memalloc_nofs_save();
-		do {
-			bp->b_addr = vm_map_ram((struct page **)bp->b_folios,
-					bp->b_folio_count, -1);
-			if (bp->b_addr)
-				break;
-			vm_unmap_aliases();
-		} while (retried++ <= 1);
-		memalloc_nofs_restore(nofs_flag);
-
-		if (!bp->b_addr) {
-			xfs_warn_ratelimited(bp->b_mount,
-				"%s: failed to map %u folios", __func__,
-				bp->b_folio_count);
-			xfs_buf_free_folios(bp);
-			return -ENOMEM;
-		}
+	nofs_flag = memalloc_nofs_save();
+	bp->b_addr = __vmalloc(BBTOB(bp->b_length), gfp_mask);
+	memalloc_nofs_restore(nofs_flag);
+
+	if (!bp->b_addr) {
+		xfs_warn_ratelimited(bp->b_mount,
+			"%s: failed to allocate %u folios", __func__,
+			bp->b_folio_count);
+		return -ENOMEM;
 	}
 
 	return 0;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 4d515407713b..68c24947ca1a 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -190,8 +190,6 @@ struct xfs_buf {
 	struct xfs_buf_log_item	*b_log_item;
 	struct list_head	b_li_list;	/* Log items list head */
 	struct xfs_trans	*b_transp;
-	struct folio		**b_folios;	/* array of folio pointers */
-	struct folio		*b_folio_array[XB_FOLIOS]; /* inline folios */
 	struct xfs_buf_map	*b_maps;	/* compound buffer map */
 	struct xfs_buf_map	__b_map;	/* inline compound buffer map */
 	int			b_map_count;
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index 26734c64c10e..336e7c8effb7 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -169,8 +169,6 @@ xmbuf_map_folio(
 	unlock_page(page);
 
 	bp->b_addr = page_address(page);
-	bp->b_folios = bp->b_folio_array;
-	bp->b_folios[0] = folio;
 	bp->b_folio_count = 1;
 	return 0;
 }
@@ -180,15 +178,10 @@ void
 xmbuf_unmap_folio(
 	struct xfs_buf		*bp)
 {
-	struct folio		*folio = bp->b_folios[0];
-
 	ASSERT(xfs_buftarg_is_mem(bp->b_target));
 
-	folio_put(folio);
-
+	folio_put(kmem_to_folio(bp->b_addr));
 	bp->b_addr = NULL;
-	bp->b_folios[0] = NULL;
-	bp->b_folios = NULL;
 	bp->b_folio_count = 0;
 }
 
-- 
2.43.0