On Wed, Mar 05, 2025 at 07:05:27AM -0700, Christoph Hellwig wrote:
> The fallback buffer allocation path currently open codes a suboptimal
> version of vmalloc to allocate pages that are then mapped into
> vmalloc space.  Switch to using vmalloc instead, which uses all the
> optimizations in the common vmalloc code, and removes the need to
> track the backing pages in the xfs_buf structure.
> 
> Signed-off-by: Christoph Hellwig <hch@xxxxxx>

Looks good now,
Reviewed-by: "Darrick J. Wong" <djwong@xxxxxxxxxx>

--D

> ---
>  fs/xfs/xfs_buf.c     | 207 ++++++++++---------------------------------
>  fs/xfs/xfs_buf.h     |   7 --
>  fs/xfs/xfs_buf_mem.c |  11 +--
>  3 files changed, 48 insertions(+), 177 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index 2b4b8c104b0c..f28ca5cb5bd8 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -55,13 +55,6 @@ static inline bool xfs_buf_is_uncached(struct xfs_buf *bp)
>  	return bp->b_rhash_key == XFS_BUF_DADDR_NULL;
>  }
>  
> -static inline int
> -xfs_buf_vmap_len(
> -	struct xfs_buf	*bp)
> -{
> -	return (bp->b_page_count * PAGE_SIZE);
> -}
> -
>  /*
>   * When we mark a buffer stale, we remove the buffer from the LRU and clear the
>   * b_lru_ref count so that the buffer is freed immediately when the buffer
> @@ -190,29 +183,6 @@ _xfs_buf_alloc(
>  	return 0;
>  }
>  
> -static void
> -xfs_buf_free_pages(
> -	struct xfs_buf	*bp)
> -{
> -	uint		i;
> -
> -	ASSERT(bp->b_flags & _XBF_PAGES);
> -
> -	if (is_vmalloc_addr(bp->b_addr))
> -		vm_unmap_ram(bp->b_addr, bp->b_page_count);
> -
> -	for (i = 0; i < bp->b_page_count; i++) {
> -		if (bp->b_pages[i])
> -			folio_put(page_folio(bp->b_pages[i]));
> -	}
> -	mm_account_reclaimed_pages(howmany(BBTOB(bp->b_length), PAGE_SIZE));
> -
> -	if (bp->b_pages != bp->b_page_array)
> -		kfree(bp->b_pages);
> -	bp->b_pages = NULL;
> -	bp->b_flags &= ~_XBF_PAGES;
> -}
> -
>  static void
>  xfs_buf_free_callback(
>  	struct callback_head	*cb)
> @@ -227,16 +197,23 @@ static void
>  xfs_buf_free(
>  	struct xfs_buf	*bp)
>  {
> +	unsigned int	size = BBTOB(bp->b_length);
> +
>  	trace_xfs_buf_free(bp, _RET_IP_);
>  
>  	ASSERT(list_empty(&bp->b_lru));
>  
> +	if (!xfs_buftarg_is_mem(bp->b_target) && size >= PAGE_SIZE)
> +		mm_account_reclaimed_pages(howmany(size, PAGE_SIZE));
> +
>  	if (xfs_buftarg_is_mem(bp->b_target))
>  		xmbuf_unmap_page(bp);
> -	else if (bp->b_flags & _XBF_PAGES)
> -		xfs_buf_free_pages(bp);
> +	else if (is_vmalloc_addr(bp->b_addr))
> +		vfree(bp->b_addr);
>  	else if (bp->b_flags & _XBF_KMEM)
>  		kfree(bp->b_addr);
> +	else
> +		folio_put(virt_to_folio(bp->b_addr));
>  
>  	call_rcu(&bp->b_rcu, xfs_buf_free_callback);
>  }
> @@ -264,9 +241,6 @@ xfs_buf_alloc_kmem(
>  		bp->b_addr = NULL;
>  		return -ENOMEM;
>  	}
> -	bp->b_pages = bp->b_page_array;
> -	bp->b_pages[0] = kmem_to_page(bp->b_addr);
> -	bp->b_page_count = 1;
>  	bp->b_flags |= _XBF_KMEM;
>  	return 0;
>  }
> @@ -287,9 +261,9 @@ xfs_buf_alloc_kmem(
>   * by the rest of the code - the buffer memory spans a single contiguous memory
>   * region that we don't have to map and unmap to access the data directly.
>   *
> - * The third type of buffer is the multi-page buffer. These are always made
> - * up of single pages so that they can be fed to vmap_ram() to return a
> - * contiguous memory region we can access the data through.
> + * The third type of buffer is the vmalloc()d buffer. This provides the buffer
> + * with the required contiguous memory region but backed by discontiguous
> + * physical pages.
>   */
>  static int
>  xfs_buf_alloc_backing_mem(
> @@ -299,7 +273,6 @@ xfs_buf_alloc_backing_mem(
>  	size_t		size = BBTOB(bp->b_length);
>  	gfp_t		gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN;
>  	struct folio	*folio;
> -	long		filled = 0;
>  
>  	if (xfs_buftarg_is_mem(bp->b_target))
>  		return xmbuf_map_page(bp);
> @@ -347,98 +320,18 @@ xfs_buf_alloc_backing_mem(
>  		goto fallback;
>  	}
>  	bp->b_addr = folio_address(folio);
> -	bp->b_page_array[0] = &folio->page;
> -	bp->b_pages = bp->b_page_array;
> -	bp->b_page_count = 1;
> -	bp->b_flags |= _XBF_PAGES;
>  	return 0;
>  
>  fallback:
> -	/* Fall back to allocating an array of single page folios. */
> -	bp->b_page_count = DIV_ROUND_UP(size, PAGE_SIZE);
> -	if (bp->b_page_count <= XB_PAGES) {
> -		bp->b_pages = bp->b_page_array;
> -	} else {
> -		bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count,
> -				gfp_mask);
> -		if (!bp->b_pages)
> -			return -ENOMEM;
> -	}
> -	bp->b_flags |= _XBF_PAGES;
> -
> -	/*
> -	 * Bulk filling of pages can take multiple calls. Not filling the entire
> -	 * array is not an allocation failure, so don't back off if we get at
> -	 * least one extra page.
> -	 */
>  	for (;;) {
> -		long	last = filled;
> -
> -		filled = alloc_pages_bulk(gfp_mask, bp->b_page_count,
> -				bp->b_pages);
> -		if (filled == bp->b_page_count) {
> -			XFS_STATS_INC(bp->b_mount, xb_page_found);
> +		bp->b_addr = __vmalloc(size, gfp_mask);
> +		if (bp->b_addr)
>  			break;
> -		}
> -
> -		if (filled != last)
> -			continue;
> -
> -		if (flags & XBF_READ_AHEAD) {
> -			xfs_buf_free_pages(bp);
> +		if (flags & XBF_READ_AHEAD)
>  			return -ENOMEM;
> -		}
> -
>  		XFS_STATS_INC(bp->b_mount, xb_page_retries);
>  		memalloc_retry_wait(gfp_mask);
>  	}
> -	return 0;
> -}
> -
> -/*
> - * Map buffer into kernel address-space if necessary.
> - */
> -STATIC int
> -_xfs_buf_map_pages(
> -	struct xfs_buf		*bp,
> -	xfs_buf_flags_t		flags)
> -{
> -	ASSERT(bp->b_flags & _XBF_PAGES);
> -	if (bp->b_page_count == 1) {
> -		/* A single page buffer is always mappable */
> -		bp->b_addr = page_address(bp->b_pages[0]);
> -	} else {
> -		int retried = 0;
> -		unsigned nofs_flag;
> -
> -		/*
> -		 * vm_map_ram() will allocate auxiliary structures (e.g.
> -		 * pagetables) with GFP_KERNEL, yet we often under a scoped nofs
> -		 * context here. Mixing GFP_KERNEL with GFP_NOFS allocations
> -		 * from the same call site that can be run from both above and
> -		 * below memory reclaim causes lockdep false positives. Hence we
> -		 * always need to force this allocation to nofs context because
> -		 * we can't pass __GFP_NOLOCKDEP down to auxillary structures to
> -		 * prevent false positive lockdep reports.
> -		 *
> -		 * XXX(dgc): I think dquot reclaim is the only place we can get
> -		 * to this function from memory reclaim context now. If we fix
> -		 * that like we've fixed inode reclaim to avoid writeback from
> -		 * reclaim, this nofs wrapping can go away.
> -		 */
> -		nofs_flag = memalloc_nofs_save();
> -		do {
> -			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
> -						-1);
> -			if (bp->b_addr)
> -				break;
> -			vm_unmap_aliases();
> -		} while (retried++ <= 1);
> -		memalloc_nofs_restore(nofs_flag);
> -
> -		if (!bp->b_addr)
> -			return -ENOMEM;
> -	}
>  
>  	return 0;
>  }
> @@ -558,7 +451,7 @@ xfs_buf_find_lock(
>  			return -ENOENT;
>  		}
>  		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
> -		bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
> +		bp->b_flags &= _XBF_KMEM;
>  		bp->b_ops = NULL;
>  	}
>  	return 0;
> @@ -744,18 +637,6 @@ xfs_buf_get_map(
>  		xfs_perag_put(pag);
>  	}
>  
> -	/* We do not hold a perag reference anymore. */
> -	if (!bp->b_addr) {
> -		error = _xfs_buf_map_pages(bp, flags);
> -		if (unlikely(error)) {
> -			xfs_warn_ratelimited(btp->bt_mount,
> -				"%s: failed to map %u pages", __func__,
> -				bp->b_page_count);
> -			xfs_buf_relse(bp);
> -			return error;
> -		}
> -	}
> -
>  	/*
>  	 * Clear b_error if this is a lookup from a caller that doesn't expect
>  	 * valid data to be found in the buffer.
> @@ -998,14 +879,6 @@ xfs_buf_get_uncached(
>  	if (error)
>  		goto fail_free_buf;
>  
> -	if (!bp->b_addr)
> -		error = _xfs_buf_map_pages(bp, 0);
> -	if (unlikely(error)) {
> -		xfs_warn(target->bt_mount,
> -			"%s: failed to map pages", __func__);
> -		goto fail_free_buf;
> -	}
> -
>  	trace_xfs_buf_get_uncached(bp, _RET_IP_);
>  	*bpp = bp;
>  	return 0;
> @@ -1339,7 +1212,7 @@ __xfs_buf_ioend(
>  	if (bp->b_flags & XBF_READ) {
>  		if (!bp->b_error && is_vmalloc_addr(bp->b_addr))
>  			invalidate_kernel_vmap_range(bp->b_addr,
> -					xfs_buf_vmap_len(bp));
> +					roundup(BBTOB(bp->b_length), PAGE_SIZE));
>  		if (!bp->b_error && bp->b_ops)
>  			bp->b_ops->verify_read(bp);
>  		if (!bp->b_error)
> @@ -1500,29 +1373,43 @@ static void
>  xfs_buf_submit_bio(
>  	struct xfs_buf	*bp)
>  {
> -	unsigned int	size = BBTOB(bp->b_length);
> -	unsigned int	map = 0, p;
> +	unsigned int	map = 0;
>  	struct blk_plug	plug;
>  	struct bio	*bio;
>  
> -	bio = bio_alloc(bp->b_target->bt_bdev, bp->b_page_count,
> -			xfs_buf_bio_op(bp), GFP_NOIO);
> -	bio->bi_private = bp;
> -	bio->bi_end_io = xfs_buf_bio_end_io;
> +	if (is_vmalloc_addr(bp->b_addr)) {
> +		unsigned int	size = BBTOB(bp->b_length);
> +		unsigned int	alloc_size = roundup(size, PAGE_SIZE);
> +		void		*data = bp->b_addr;
>  
> -	if (bp->b_page_count == 1) {
> -		__bio_add_page(bio, virt_to_page(bp->b_addr), size,
> -				offset_in_page(bp->b_addr));
> -	} else {
> -		for (p = 0; p < bp->b_page_count; p++)
> -			__bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0);
> -		bio->bi_iter.bi_size = size; /* limit to the actual size used */
> +		bio = bio_alloc(bp->b_target->bt_bdev, alloc_size >> PAGE_SHIFT,
> +				xfs_buf_bio_op(bp), GFP_NOIO);
> +
> +		do {
> +			unsigned int	len = min(size, PAGE_SIZE);
>  
> -		if (is_vmalloc_addr(bp->b_addr))
> -			flush_kernel_vmap_range(bp->b_addr,
> -					xfs_buf_vmap_len(bp));
> +			ASSERT(offset_in_page(data) == 0);
> +			__bio_add_page(bio, vmalloc_to_page(data), len, 0);
> +			data += len;
> +			size -= len;
> +		} while (size);
> +
> +		flush_kernel_vmap_range(bp->b_addr, alloc_size);
> +	} else {
> +		/*
> +		 * Single folio or slab allocation. Must be contiguous and thus
> +		 * only a single bvec is needed.
> +		 */
> +		bio = bio_alloc(bp->b_target->bt_bdev, 1, xfs_buf_bio_op(bp),
> +				GFP_NOIO);
> +		__bio_add_page(bio, virt_to_page(bp->b_addr),
> +				BBTOB(bp->b_length),
> +				offset_in_page(bp->b_addr));
>  	}
>  
> +	bio->bi_private = bp;
> +	bio->bi_end_io = xfs_buf_bio_end_io;
> +
>  	/*
>  	 * If there is more than one map segment, split out a new bio for each
>  	 * map except of the last one.  The last map is handled by the
> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> index 8db522f19b0c..db43bdc17f55 100644
> --- a/fs/xfs/xfs_buf.h
> +++ b/fs/xfs/xfs_buf.h
> @@ -36,7 +36,6 @@ struct xfs_buf;
>  #define _XBF_LOGRECOVERY	(1u << 18)/* log recovery buffer */
>  
>  /* flags used only internally */
> -#define _XBF_PAGES		(1u << 20)/* backed by refcounted pages */
>  #define _XBF_KMEM		(1u << 21)/* backed by heap memory */
>  #define _XBF_DELWRI_Q		(1u << 22)/* buffer on a delwri queue */
>  
> @@ -61,7 +60,6 @@ typedef unsigned int xfs_buf_flags_t;
>  	{ XBF_STALE,		"STALE" }, \
>  	{ XBF_WRITE_FAIL,	"WRITE_FAIL" }, \
>  	{ _XBF_LOGRECOVERY,	"LOG_RECOVERY" }, \
> -	{ _XBF_PAGES,		"PAGES" }, \
>  	{ _XBF_KMEM,		"KMEM" }, \
>  	{ _XBF_DELWRI_Q,	"DELWRI_Q" }, \
>  	/* The following interface flags should never be set */ \
> @@ -122,8 +120,6 @@ struct xfs_buftarg {
>  	struct xfs_buf_cache	bt_cache[];
>  };
>  
> -#define XB_PAGES	2
> -
>  struct xfs_buf_map {
>  	xfs_daddr_t		bm_bn;	/* block number for I/O */
>  	int			bm_len;	/* size of I/O */
> @@ -185,13 +181,10 @@ struct xfs_buf {
>  	struct xfs_buf_log_item	*b_log_item;
>  	struct list_head	b_li_list;	/* Log items list head */
>  	struct xfs_trans	*b_transp;
> -	struct page		**b_pages;	/* array of page pointers */
> -	struct page		*b_page_array[XB_PAGES]; /* inline pages */
>  	struct xfs_buf_map	*b_maps;	/* compound buffer map */
>  	struct xfs_buf_map	__b_map;	/* inline compound buffer map */
>  	int			b_map_count;
>  	atomic_t		b_pin_count;	/* pin count */
> -	unsigned int		b_page_count;	/* size of page array */
>  	int			b_error;	/* error code on I/O */
>  	void			(*b_iodone)(struct xfs_buf *bp);
>  
> diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
> index 5b64a2b3b113..b207754d2ee0 100644
> --- a/fs/xfs/xfs_buf_mem.c
> +++ b/fs/xfs/xfs_buf_mem.c
> @@ -169,9 +169,6 @@ xmbuf_map_page(
>  	unlock_page(page);
>  
>  	bp->b_addr = page_address(page);
> -	bp->b_pages = bp->b_page_array;
> -	bp->b_pages[0] = page;
> -	bp->b_page_count = 1;
>  	return 0;
>  }
>  
> @@ -180,16 +177,10 @@ void
>  xmbuf_unmap_page(
>  	struct xfs_buf	*bp)
>  {
> -	struct page	*page = bp->b_pages[0];
> -
>  	ASSERT(xfs_buftarg_is_mem(bp->b_target));
>  
> -	put_page(page);
> -
> +	put_page(virt_to_page(bp->b_addr));
>  	bp->b_addr = NULL;
> -	bp->b_pages[0] = NULL;
> -	bp->b_pages = NULL;
> -	bp->b_page_count = 0;
>  }
>  
>  /* Is this a valid daddr within the buftarg? */
> -- 
> 2.45.2
> 
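
(Summary for readers skimming the archive: after this patch the backing memory
comes from one of three sources, and the free path derives the type from
b_addr instead of a tracked page array. The following is condensed,
illustrative pseudocode distilled from the hunks above; the exact slab-path
condition and the folio allocation arguments are not shown in the quoted
hunks and are elided here, so do not read this as the literal code:)

	/* Allocation: xfs_buf_alloc_backing_mem(), condensed. */
	if (xfs_buftarg_is_mem(bp->b_target))
		return xmbuf_map_page(bp);	/* in-memory buftarg */
	if (/* small buffer, per xfs_buf_alloc_kmem() */)
		return xfs_buf_alloc_kmem(...);	/* slab, sets _XBF_KMEM */
	folio = folio_alloc(...);		/* one high-order folio */
	if (folio) {
		bp->b_addr = folio_address(folio);
		return 0;
	}
	for (;;) {				/* vmalloc fallback */
		bp->b_addr = __vmalloc(size, gfp_mask);
		if (bp->b_addr)
			return 0;
		if (flags & XBF_READ_AHEAD)
			return -ENOMEM;
		XFS_STATS_INC(bp->b_mount, xb_page_retries);
		memalloc_retry_wait(gfp_mask);
	}

	/* Freeing: xfs_buf_free(), condensed; type inferred from b_addr. */
	if (xfs_buftarg_is_mem(bp->b_target))
		xmbuf_unmap_page(bp);
	else if (is_vmalloc_addr(bp->b_addr))
		vfree(bp->b_addr);
	else if (bp->b_flags & _XBF_KMEM)
		kfree(bp->b_addr);
	else
		folio_put(virt_to_folio(bp->b_addr));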