On Wed, Apr 10, 2024 at 10:29:38PM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@xxxxxxxxxx>
> 
> Implement the buffered write iomap path; use ext4_da_map_blocks() to
> map delalloc extents and add ext4_iomap_get_blocks() to allocate blocks
> if delalloc is disabled or free space is about to run out.
> 
> Note that we always allocate unwritten extents for new blocks in the
> iomap write path; this means the allocation type is no longer
> controlled by the dioread_nolock mount option. After that, we can
> postpone the i_disksize update to the writeback path and drop the
> journal handle in the buffered delalloc write path completely.
> 
> Signed-off-by: Zhang Yi <yi.zhang@xxxxxxxxxx>
> ---
>  fs/ext4/ext4.h  |   3 +
>  fs/ext4/file.c  |  19 +++++-
>  fs/ext4/inode.c | 168 ++++++++++++++++++++++++++++++++++++++++++++++--
>  3 files changed, 183 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 05949a8136ae..2bd543c43341 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -2970,6 +2970,7 @@ int ext4_walk_page_buffers(handle_t *handle,
>  				struct buffer_head *bh));
>  int do_journal_get_write_access(handle_t *handle, struct inode *inode,
>  				struct buffer_head *bh);
> +int ext4_nonda_switch(struct super_block *sb);
>  #define FALL_BACK_TO_NONDELALLOC 1
>  #define CONVERT_INLINE_DATA	 2
> 
> @@ -3827,6 +3828,8 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
>  extern const struct iomap_ops ext4_iomap_ops;
>  extern const struct iomap_ops ext4_iomap_overwrite_ops;
>  extern const struct iomap_ops ext4_iomap_report_ops;
> +extern const struct iomap_ops ext4_iomap_buffered_write_ops;
> +extern const struct iomap_ops ext4_iomap_buffered_da_write_ops;
> 
>  static inline int ext4_buffer_uptodate(struct buffer_head *bh)
>  {
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 54d6ff22585c..52f37c49572a 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -282,6 +282,20 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
>  	return count;
>  }
> 
> +static ssize_t ext4_iomap_buffered_write(struct kiocb *iocb,
> +					 struct iov_iter *from)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	const struct iomap_ops *iomap_ops;
> +
> +	if (test_opt(inode->i_sb, DELALLOC) && !ext4_nonda_switch(inode->i_sb))
> +		iomap_ops = &ext4_iomap_buffered_da_write_ops;
> +	else
> +		iomap_ops = &ext4_iomap_buffered_write_ops;
> +
> +	return iomap_file_buffered_write(iocb, from, iomap_ops);
> +}
> +
>  static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
>  					struct iov_iter *from)
>  {
> @@ -296,7 +310,10 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
>  	if (ret <= 0)
>  		goto out;
> 
> -	ret = generic_perform_write(iocb, from);
> +	if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))
> +		ret = ext4_iomap_buffered_write(iocb, from);
> +	else
> +		ret = generic_perform_write(iocb, from);
> 
>  out:
>  	inode_unlock(inode);
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 20eb772f4f62..e825ed16fd60 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -2857,7 +2857,7 @@ static int ext4_dax_writepages(struct address_space *mapping,
>  	return ret;
>  }
> 
> -static int ext4_nonda_switch(struct super_block *sb)
> +int ext4_nonda_switch(struct super_block *sb)
>  {
>  	s64 free_clusters, dirty_clusters;
>  	struct ext4_sb_info *sbi = EXT4_SB(sb);
> @@ -3254,6 +3254,15 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
>  	return inode->i_state & I_DIRTY_DATASYNC;
>  }
> 
> +static bool ext4_iomap_valid(struct inode *inode, const struct iomap *iomap)
> +{
> +	return iomap->validity_cookie == READ_ONCE(EXT4_I(inode)->i_es_seq);
> +}
> +
> +static const struct iomap_folio_ops ext4_iomap_folio_ops = {
> +	.iomap_valid = ext4_iomap_valid,
> +};
> +
>  static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
>  			   struct ext4_map_blocks *map, loff_t offset,
>  			   loff_t length, unsigned int flags)
> @@ -3284,6 +3293,9 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
>  	    !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
>  		iomap->flags |= IOMAP_F_MERGED;
> 
> +	iomap->validity_cookie = READ_ONCE(EXT4_I(inode)->i_es_seq);
> +	iomap->folio_ops = &ext4_iomap_folio_ops;
> +
>  	/*
>  	 * Flags passed to ext4_map_blocks() for direct I/O writes can result
>  	 * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
> @@ -3523,11 +3535,42 @@ const struct iomap_ops ext4_iomap_report_ops = {
>  	.iomap_begin = ext4_iomap_begin_report,
>  };
> 
> -static int ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
> +static int ext4_iomap_get_blocks(struct inode *inode,
> +				 struct ext4_map_blocks *map)
> +{
> +	handle_t *handle;
> +	int ret, needed_blocks;
> +
> +	/*
> +	 * Reserve one extra block for adding the inode to the orphan
> +	 * list in case we allocate blocks but the write fails for some
> +	 * reason.
> +	 */
> +	needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
> +	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
> +	if (IS_ERR(handle))
> +		return PTR_ERR(handle);
> +
> +	ret = ext4_map_blocks(handle, inode, map,
> +			      EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
> +	/*
> +	 * We have to stop the journal here because of a potential
> +	 * deadlock with the later balance_dirty_pages(): it may wait
> +	 * for dirty pages to be written back, which may start another
> +	 * handle and wait for this handle to stop.
> +	 */
> +	ext4_journal_stop(handle);
> +
> +	return ret;
> +}
> +
> +#define IOMAP_F_EXT4_DELALLOC		IOMAP_F_PRIVATE
> +
> +static int __ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
>  			loff_t length, unsigned int iomap_flags,
> -			struct iomap *iomap, struct iomap *srcmap)
> +			struct iomap *iomap, struct iomap *srcmap,
> +			bool delalloc)
>  {
> -	int ret;
> +	int ret, retries = 0;
>  	struct ext4_map_blocks map;
>  	u8 blkbits = inode->i_blkbits;
> 
> @@ -3537,20 +3580,133 @@ static int ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
>  		return -EINVAL;
>  	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
>  		return -ERANGE;
> -
> +retry:
>  	/* Calculate the first and last logical blocks respectively. */
>  	map.m_lblk = offset >> blkbits;
>  	map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
>  			  EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
> +	if (iomap_flags & IOMAP_WRITE) {
> +		if (delalloc)
> +			ret = ext4_da_map_blocks(inode, &map);
> +		else
> +			ret = ext4_iomap_get_blocks(inode, &map);
> 
> -	ret = ext4_map_blocks(NULL, inode, &map, 0);
> +		if (ret == -ENOSPC &&
> +		    ext4_should_retry_alloc(inode->i_sb, &retries))
> +			goto retry;
> +	} else {
> +		ret = ext4_map_blocks(NULL, inode, &map, 0);
> +	}
>  	if (ret < 0)
>  		return ret;
> 
>  	ext4_set_iomap(inode, iomap, &map, offset, length, iomap_flags);
> +	if (delalloc)
> +		iomap->flags |= IOMAP_F_EXT4_DELALLOC;
> +
> +	return 0;
> +}

Why are you implementing both read and write mapping paths in the one
function? The whole point of having separate ops vectors for read and
write is that it allows a clean separation of the read and write mapping
operations. i.e. there is no need to use "if (write) else {do read}"
code constructs at all. You can even have a different delalloc mapping
function so you don't need "if (delalloc) else {do nonda}" branches
everywhere...
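Something like this completely untested sketch is what I mean - the
split itself is invented here, the helper names come from the patch,
and the overflow/inline-data checks are omitted for brevity:

static int ext4_iomap_buffered_read_begin(struct inode *inode,
		loff_t offset, loff_t length, unsigned int flags,
		struct iomap *iomap, struct iomap *srcmap)
{
	struct ext4_map_blocks map;
	u8 blkbits = inode->i_blkbits;
	int ret;

	/* Read mappings never allocate - no journal, no retry loop. */
	map.m_lblk = offset >> blkbits;
	map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
			  EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
	ret = ext4_map_blocks(NULL, inode, &map, 0);
	if (ret < 0)
		return ret;
	ext4_set_iomap(inode, iomap, &map, offset, length, flags);
	return 0;
}

static int ext4_iomap_buffered_da_write_begin(struct inode *inode,
		loff_t offset, loff_t length, unsigned int flags,
		struct iomap *iomap, struct iomap *srcmap)
{
	struct ext4_map_blocks map;
	u8 blkbits = inode->i_blkbits;
	int ret, retries = 0;

retry:
	map.m_lblk = offset >> blkbits;
	map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
			  EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
	/* Delalloc write mapping - reserve space, don't allocate. */
	ret = ext4_da_map_blocks(inode, &map);
	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry;
	if (ret < 0)
		return ret;
	ext4_set_iomap(inode, iomap, &map, offset, length, flags);
	return 0;
}

with a third nonda write variant that calls ext4_iomap_get_blocks()
instead of ext4_da_map_blocks(). Each ops vector then points at exactly
one of these, and all the branching goes away.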
> +
> +static inline int ext4_iomap_buffered_io_begin(struct inode *inode,
> +		loff_t offset, loff_t length, unsigned int flags,
> +		struct iomap *iomap, struct iomap *srcmap)
> +{
> +	return __ext4_iomap_buffered_io_begin(inode, offset, length, flags,
> +					      iomap, srcmap, false);
> +}
> +
> +static inline int ext4_iomap_buffered_da_write_begin(struct inode *inode,
> +		loff_t offset, loff_t length, unsigned int flags,
> +		struct iomap *iomap, struct iomap *srcmap)
> +{
> +	return __ext4_iomap_buffered_io_begin(inode, offset, length, flags,
> +					      iomap, srcmap, true);
> +}
> +
> +/*
> + * Drop the stale delayed allocation range on write failure, including
> + * both start and end blocks. Otherwise we could leave a range of
> + * delayed extents covered by a clean folio, which could lead to
> + * inaccurate space reservation.
> + */
> +static int ext4_iomap_punch_delalloc(struct inode *inode, loff_t offset,
> +				     loff_t length)
> +{
> +	ext4_es_remove_extent(inode, offset >> inode->i_blkbits,
> +			DIV_ROUND_UP_ULL(length, EXT4_BLOCK_SIZE(inode->i_sb)));
> +	return 0;
> +}
> +
> +static int ext4_iomap_buffered_write_end(struct inode *inode, loff_t offset,
> +					 loff_t length, ssize_t written,
> +					 unsigned int flags,
> +					 struct iomap *iomap)
> +{
> +	handle_t *handle;
> +	loff_t end;
> +	int ret = 0, ret2;
> +
> +	/* delalloc */
> +	if (iomap->flags & IOMAP_F_EXT4_DELALLOC) {
> +		ret = iomap_file_buffered_write_punch_delalloc(inode, iomap,
> +			offset, length, written, ext4_iomap_punch_delalloc);
> +		if (ret)
> +			ext4_warning(inode->i_sb,
> +				"Failed to clean up delalloc for inode %lu, %d",
> +				inode->i_ino, ret);
> +		return ret;
> +	}

Why are you creating a delalloc extent for the write operation and then
immediately deleting it from the extent tree once the write operation is
done?

Delayed allocation is a state that persists for that file range until
the data is written back, right? But this appears to return the range to
a hole in the extent state tree. What happens when we do a second write
to this same range? Will it do another delayed allocation because there
are no extents over this range?

Also, why do you need IOMAP_F_EXT4_DELALLOC? Isn't a delalloc iomap set
up with iomap->type = IOMAP_DELALLOC? Why can't that be used?
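i.e. something like this untested fragment of the ->iomap_end path,
assuming ext4_set_iomap() reports delayed extents as IOMAP_DELALLOC, so
no private flag is needed:

	/* The core already tagged this mapping in ->iomap_begin. */
	if (iomap->type == IOMAP_DELALLOC) {
		ret = iomap_file_buffered_write_punch_delalloc(inode, iomap,
				offset, length, written,
				ext4_iomap_punch_delalloc);
		if (ret)
			ext4_warning(inode->i_sb,
				"Failed to clean up delalloc for inode %lu, %d",
				inode->i_ino, ret);
		return ret;
	}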
-Dave.

-- 
Dave Chinner
david@xxxxxxxxxxxxx