On Thu, Nov 17, 2022 at 04:58:05PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@xxxxxxxxxx>
> 
> Because that's what Christoph wants for this error handling path
> only XFS uses.
> 
> It requires a new iomap export for handling errors over delalloc
> ranges. This is basically the XFS code as it stands, but even though
> Christoph wants this as iomap functionality, we still have
> to call it from the filesystem-specific ->iomap_end callback, and
> call into the iomap code with yet another filesystem-specific
> callback to punch the delalloc extent within the defined ranges.
> 
> Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>
> ---
>  fs/iomap/buffered-io.c | 60 ++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_iomap.c     | 47 ++++++---------------------
>  include/linux/iomap.h  |  6 +++++
>  3 files changed, 74 insertions(+), 39 deletions(-)
> 
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 91ee0b308e13..77f391fd90ca 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -832,6 +832,66 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
>  }
>  EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
>  
> +/*
> + * When a short write occurs, the filesystem may need to remove reserved space
> + * that was allocated in ->iomap_begin from its ->iomap_end method. For
> + * filesystems that use delayed allocation, we need to punch out delalloc
> + * extents from the range that are not dirty in the page cache. As the write can
> + * race with page faults, there can be dirty pages over the delalloc extent
> + * outside the range of a short write but still within the delalloc extent
> + * allocated for this iomap.
> + *
> + * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
> + * simplify range iterations, but converts them back to {offset,len} tuples for
> + * the punch callback.
> + */
> +int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
> +		struct iomap *iomap, loff_t offset, loff_t length,

Nit: loff_t pos, not 'offset', to (try to) be consistent with the rest
of the codebase.

I'll fix this on commit if you don't beat me to the punch, so

Reviewed-by: Darrick J. Wong <djwong@xxxxxxxxxx>

--D
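For reference, the prototype with that rename applied would look
something like this (a sketch of the suggested rename only; everything
else left unchanged):

int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
		struct iomap *iomap, loff_t pos, loff_t length,
		ssize_t written,
		int (*punch)(struct inode *inode, loff_t pos, loff_t length));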
> +		ssize_t written,
> +		int (*punch)(struct inode *inode, loff_t offset, loff_t length))
> +{
> +	loff_t			start_byte;
> +	loff_t			end_byte;
> +	int			blocksize = 1 << inode->i_blkbits;
> +	int			error = 0;
> +
> +	if (iomap->type != IOMAP_DELALLOC)
> +		return 0;
> +
> +	/* If we didn't reserve the blocks, we're not allowed to punch them. */
> +	if (!(iomap->flags & IOMAP_F_NEW))
> +		return 0;
> +
> +	/*
> +	 * start_byte refers to the first unused block after a short write. If
> +	 * nothing was written, round offset down to point at the first block in
> +	 * the range.
> +	 */
> +	if (unlikely(!written))
> +		start_byte = round_down(offset, blocksize);
> +	else
> +		start_byte = round_up(offset + written, blocksize);
> +	end_byte = round_up(offset + length, blocksize);
> +
> +	/* Nothing to do if we've written the entire delalloc extent */
> +	if (start_byte >= end_byte)
> +		return 0;
> +
> +	/*
> +	 * Lock the mapping to avoid races with page faults re-instantiating
> +	 * folios and dirtying them via ->page_mkwrite between the page cache
> +	 * truncation and the delalloc extent removal. Failing to do this can
> +	 * leave dirty pages with no space reservation in the cache.
> +	 */
> +	filemap_invalidate_lock(inode->i_mapping);
> +	truncate_pagecache_range(inode, start_byte, end_byte - 1);
> +	error = punch(inode, start_byte, end_byte - start_byte);
> +	filemap_invalidate_unlock(inode->i_mapping);
> +
> +	return error;
> +}
> +EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc);
> +
>  static loff_t iomap_unshare_iter(struct iomap_iter *iter)
>  {
>  	struct iomap *iomap = &iter->iomap;
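To make the rounding behaviour above concrete, here is a tiny
userspace sketch of the same start_byte/end_byte arithmetic
(hypothetical numbers; rnd_up()/rnd_down() are open-coded stand-ins
for the kernel's round_up()/round_down() macros):

#include <stdio.h>

/* Open-coded stand-ins for the kernel's round_up()/round_down(). */
static long long rnd_up(long long x, long long a)
{
	return (x + a - 1) / a * a;
}

static long long rnd_down(long long x, long long a)
{
	return x / a * a;
}

int main(void)
{
	long long pos = 5000, length = 10000, written = 100, bs = 4096;

	/* First block not covered by the short write... */
	long long start_byte = written ? rnd_up(pos + written, bs)
				       : rnd_down(pos, bs);
	/* ...to the end of the delalloc reservation. */
	long long end_byte = rnd_up(pos + length, bs);

	/* Prints "punch [8192, 16384)". */
	printf("punch [%lld, %lld)\n", start_byte, end_byte);
	return 0;
}

With written == 0 the start would instead round down to 4096, so the
whole reserved range gets truncated and punched.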
> 
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 7bb55dbc19d3..ea96e8a34868 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -1123,12 +1123,12 @@ xfs_buffered_write_iomap_begin(
>  static int
>  xfs_buffered_write_delalloc_punch(
>  	struct inode		*inode,
> -	loff_t			start_byte,
> -	loff_t			end_byte)
> +	loff_t			offset,
> +	loff_t			length)
>  {
>  	struct xfs_mount	*mp = XFS_M(inode->i_sb);
> -	xfs_fileoff_t		start_fsb = XFS_B_TO_FSBT(mp, start_byte);
> -	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, end_byte);
> +	xfs_fileoff_t		start_fsb = XFS_B_TO_FSBT(mp, offset);
> +	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + length);
>  
>  	return xfs_bmap_punch_delalloc_range(XFS_I(inode), start_fsb,
>  			end_fsb - start_fsb);
> @@ -1143,13 +1143,9 @@ xfs_buffered_write_iomap_end(
>  	unsigned		flags,
>  	struct iomap		*iomap)
>  {
> -	struct xfs_mount	*mp = XFS_M(inode->i_sb);
> -	loff_t			start_byte;
> -	loff_t			end_byte;
> -	int			error = 0;
>  
> -	if (iomap->type != IOMAP_DELALLOC)
> -		return 0;
> +	struct xfs_mount	*mp = XFS_M(inode->i_sb);
> +	int			error;
>  
>  	/*
>  	 * Behave as if the write failed if drop writes is enabled. Set the NEW
> @@ -1160,35 +1156,8 @@ xfs_buffered_write_iomap_end(
>  		written = 0;
>  	}
>  
> -	/* If we didn't reserve the blocks, we're not allowed to punch them. */
> -	if (!(iomap->flags & IOMAP_F_NEW))
> -		return 0;
> -
> -	/*
> -	 * start_fsb refers to the first unused block after a short write. If
> -	 * nothing was written, round offset down to point at the first block in
> -	 * the range.
> -	 */
> -	if (unlikely(!written))
> -		start_byte = round_down(offset, mp->m_sb.sb_blocksize);
> -	else
> -		start_byte = round_up(offset + written, mp->m_sb.sb_blocksize);
> -	end_byte = round_up(offset + length, mp->m_sb.sb_blocksize);
> -
> -	/* Nothing to do if we've written the entire delalloc extent */
> -	if (start_byte >= end_byte)
> -		return 0;
> -
> -	/*
> -	 * Lock the mapping to avoid races with page faults re-instantiating
> -	 * folios and dirtying them via ->page_mkwrite between the page cache
> -	 * truncation and the delalloc extent removal. Failing to do this can
> -	 * leave dirty pages with no space reservation in the cache.
> -	 */
> -	filemap_invalidate_lock(inode->i_mapping);
> -	truncate_pagecache_range(inode, start_byte, end_byte - 1);
> -	error = xfs_buffered_write_delalloc_punch(inode, start_byte, end_byte);
> -	filemap_invalidate_unlock(inode->i_mapping);
> +	error = iomap_file_buffered_write_punch_delalloc(inode, iomap, offset,
> +			length, written, &xfs_buffered_write_delalloc_punch);
>  	if (error && !xfs_is_shutdown(mp)) {
>  		xfs_alert(mp, "%s: unable to clean up ino 0x%llx",
>  				__func__, XFS_I(inode)->i_ino);
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 238a03087e17..6bbed915c83a 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -226,6 +226,12 @@ static inline const struct iomap *iomap_iter_srcmap(const struct iomap_iter *i)
>  
>  ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
>  		const struct iomap_ops *ops);
> +int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
> +		struct iomap *iomap, loff_t offset, loff_t length,
> +		ssize_t written,
> +		int (*punch)(struct inode *inode,
> +			loff_t offset, loff_t length));
> +
>  int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
>  void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
>  bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
> -- 
> 2.37.2
> 
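As a usage illustration, here is a sketch of how a delalloc filesystem
other than XFS could wire the new export into its ->iomap_end method.
The foofs_* names, including the foofs_drop_delalloc_blocks() helper,
are hypothetical:

/*
 * Convert the byte range into whatever units the filesystem needs and
 * drop the delalloc reservation backing it (hypothetical helper).
 */
static int
foofs_punch_delalloc(struct inode *inode, loff_t offset, loff_t length)
{
	return foofs_drop_delalloc_blocks(inode, offset, length);
}

static int
foofs_buffered_write_iomap_end(struct inode *inode, loff_t offset,
		loff_t length, ssize_t written, unsigned flags,
		struct iomap *iomap)
{
	/*
	 * Safe to call unconditionally: the helper returns 0 early
	 * unless this is a short write over a freshly reserved
	 * (IOMAP_F_NEW) delalloc mapping.
	 */
	return iomap_file_buffered_write_punch_delalloc(inode, iomap,
			offset, length, written, foofs_punch_delalloc);
}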