Re: [PATCH v5 10/10] fs: add support for copy file range in zonefs

Nitesh Shetty <nj.shetty@xxxxxxxxxxx> · Tue, 29 Nov 2022 17:52:32 +0530

On Thu, Nov 24, 2022 at 10:47:55AM +0900, Damien Le Moal wrote:
> On 11/24/22 10:32, Damien Le Moal wrote:
> > On 11/23/22 14:58, Nitesh Shetty wrote:
> >> copy_file_range is implemented using copy offload,
> >> copy offloading to device is always enabled.
> >> To disable copy offloading mount with "no_copy_offload" mount option.
> > 
> > And were is the code that handle this option ?
> > 
> >> At present copy offload is only used, if the source and destination files
> >> are on same block device, otherwise copy file range is completed by
> >> generic copy file range.
> >>
> >> copy file range implemented as following:
> >> 	- write pending writes on the src and dest files
> >> 	- drop page cache for dest file if its conv zone
> >> 	- copy the range using offload
> >> 	- update dest file info
> >>
> >> For all failure cases we fallback to generic file copy range
> > 
> > For all cases ? That would be weird. What would be the point of trying to
> > copy again if e.g. the dest zone has gone offline or read only ?
> > 
> >> At present this implementation does not support conv aggregation
> > 
> > Please check this commit message overall: the grammar and punctuation
> > could really be improved.
> > 
> >>
> >> Signed-off-by: Nitesh Shetty <nj.shetty@xxxxxxxxxxx>
> >> Signed-off-by: Anuj Gupta <anuj20.g@xxxxxxxxxxx>
> >> ---
> >>  fs/zonefs/super.c | 179 ++++++++++++++++++++++++++++++++++++++++++++++
> >>  1 file changed, 179 insertions(+)
> >>
> >> diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
> >> index abc9a85106f2..15613433d4ae 100644
> >> --- a/fs/zonefs/super.c
> >> +++ b/fs/zonefs/super.c
> >> @@ -1223,6 +1223,183 @@ static int zonefs_file_release(struct inode *inode, struct file *file)
> >>  	return 0;
> >>  }
> >>  
> >> +static int zonefs_is_file_copy_offset_ok(struct inode *src_inode,
> >> +		struct inode *dst_inode, loff_t src_off, loff_t dst_off,
> >> +		size_t *len)
> >> +{
> >> +	loff_t size, endoff;
> >> +	struct zonefs_inode_info *dst_zi = ZONEFS_I(dst_inode);
> >> +
> >> +	inode_lock(src_inode);
> >> +	size = i_size_read(src_inode);
> >> +	inode_unlock(src_inode);
> >> +	/* Don't copy beyond source file EOF. */
> >> +	if (src_off < size) {
> >> +		if (src_off + *len > size)
> >> +			*len = (size - (src_off + *len));
> >> +	} else
> >> +		*len = 0;
> > 
> > Missing curly brackets for the else.
> > 
> >> +
> >> +	mutex_lock(&dst_zi->i_truncate_mutex);
> >> +	if (dst_zi->i_ztype == ZONEFS_ZTYPE_SEQ) {
> >> +		if (*len > dst_zi->i_max_size - dst_zi->i_wpoffset)
> >> +			*len -= dst_zi->i_max_size - dst_zi->i_wpoffset;
> >> +
> >> +		if (dst_off != dst_zi->i_wpoffset)
> >> +			goto err;
> >> +	}
> >> +	mutex_unlock(&dst_zi->i_truncate_mutex);
> >> +
> >> +	endoff = dst_off + *len;
> >> +	inode_lock(dst_inode);
> >> +	if (endoff > dst_zi->i_max_size ||
> >> +			inode_newsize_ok(dst_inode, endoff)) {
> >> +		inode_unlock(dst_inode);
> >> +		goto err;
> > 
> > And here truncate mutex is not locked, but goto err will unlock it. This
> > is broken...
> > 
> >> +	}
> >> +	inode_unlock(dst_inode);
> > 
> > ...The locking is completely broken in this function anyway. You take the
> > lock, look at something, then release the lock. Then what if a write or a
> > trunctate comes in when the inode is not locked ? This is completely
> > broken. The inode should be locked with no dio pending when this function
> > is called. This is only to check if everything is ok. This has no business
> > playing with the inode and truncate locks.
> > 
> >> +
> >> +	return 0;
> >> +err:
> >> +	mutex_unlock(&dst_zi->i_truncate_mutex);
> >> +	return -EINVAL;
> >> +}
> >> +
> >> +static ssize_t zonefs_issue_copy(struct zonefs_inode_info *src_zi,
> >> +		loff_t src_off, struct zonefs_inode_info *dst_zi,
> >> +		loff_t dst_off, size_t len)
> >> +{
> >> +	struct block_device *src_bdev = src_zi->i_vnode.i_sb->s_bdev;
> >> +	struct block_device *dst_bdev = dst_zi->i_vnode.i_sb->s_bdev;
> >> +	struct range_entry *rlist = NULL;
> >> +	int ret = len;
> >> +
> >> +	rlist = kmalloc(sizeof(*rlist), GFP_KERNEL);
> > 
> > GFP_NOIO ?
> > 
> >> +	if (!rlist)
> >> +		return -ENOMEM;
> >> +
> >> +	rlist[0].dst = (dst_zi->i_zsector << SECTOR_SHIFT) + dst_off;
> >> +	rlist[0].src = (src_zi->i_zsector << SECTOR_SHIFT) + src_off;
> >> +	rlist[0].len = len;
> >> +	rlist[0].comp_len = 0;
> >> +	ret = blkdev_issue_copy(src_bdev, dst_bdev, rlist, 1, NULL, NULL,
> >> +			GFP_KERNEL);
> >> +	if (rlist[0].comp_len > 0)
> >> +		ret = rlist[0].comp_len;
> >> +	kfree(rlist);
> >> +
> >> +	return ret;
> >> +}
> >> +
> >> +/* Returns length of possible copy, else returns error */
> >> +static ssize_t zonefs_copy_file_checks(struct file *src_file, loff_t src_off,
> >> +					struct file *dst_file, loff_t dst_off,
> >> +					size_t *len, unsigned int flags)
> >> +{
> >> +	struct inode *src_inode = file_inode(src_file);
> >> +	struct inode *dst_inode = file_inode(dst_file);
> >> +	struct zonefs_inode_info *src_zi = ZONEFS_I(src_inode);
> >> +	struct zonefs_inode_info *dst_zi = ZONEFS_I(dst_inode);
> >> +	ssize_t ret;
> >> +
> >> +	if (src_inode->i_sb != dst_inode->i_sb)
> >> +		return -EXDEV;
> >> +
> >> +	/* Start by sync'ing the source and destination files for conv zones */
> >> +	if (src_zi->i_ztype == ZONEFS_ZTYPE_CNV) {
> >> +		ret = file_write_and_wait_range(src_file, src_off,
> >> +				(src_off + *len));
> >> +		if (ret < 0)
> >> +			goto io_error;
> >> +	}
> >> +	inode_dio_wait(src_inode);
> > 
> > That is not a "check". So having this in a function called
> > zonefs_copy_file_checks() is a little strange.
> > 
> >> +
> >> +	/* Start by sync'ing the source and destination files ifor conv zones */
> > 
> > Same comment repeated, with typos.
> > 
> >> +	if (dst_zi->i_ztype == ZONEFS_ZTYPE_CNV) {
> >> +		ret = file_write_and_wait_range(dst_file, dst_off,
> >> +				(dst_off + *len));
> >> +		if (ret < 0)
> >> +			goto io_error;
> >> +	}
> >> +	inode_dio_wait(dst_inode);
> >> +
> >> +	/* Drop dst file cached pages for a conv zone*/
> >> +	if (dst_zi->i_ztype == ZONEFS_ZTYPE_CNV) {
> >> +		ret = invalidate_inode_pages2_range(dst_inode->i_mapping,
> >> +				dst_off >> PAGE_SHIFT,
> >> +				(dst_off + *len) >> PAGE_SHIFT);
> >> +		if (ret < 0)
> >> +			goto io_error;
> >> +	}
> >> +
> >> +	ret = zonefs_is_file_copy_offset_ok(src_inode, dst_inode, src_off,
> >> +			dst_off, len);
> >> +	if (ret < 0)
> > 
> > if (ret)
> > 
> >> +		return ret;
> >> +
> >> +	return *len;
> >> +
> >> +io_error:
> >> +	zonefs_io_error(dst_inode, true);
> >> +	return ret;
> >> +}
> >> +
> >> +static ssize_t zonefs_copy_file(struct file *src_file, loff_t src_off,
> >> +		struct file *dst_file, loff_t dst_off,
> >> +		size_t len, unsigned int flags)
> >> +{
> >> +	struct inode *src_inode = file_inode(src_file);
> >> +	struct inode *dst_inode = file_inode(dst_file);
> >> +	struct zonefs_inode_info *src_zi = ZONEFS_I(src_inode);
> >> +	struct zonefs_inode_info *dst_zi = ZONEFS_I(dst_inode);
> >> +	ssize_t ret = 0, bytes;
> >> +
> >> +	inode_lock(src_inode);
> >> +	inode_lock(dst_inode);
> > 
> > So you did zonefs_copy_file_checks() outside of these locks, which mean
> > that everything about the source and destination files may have changed.
> > This does not work.
> 
> I forgot to mention that locking 2 inodes blindly like this can leads to
> deadlocks if another process tries a copy range from dst to src at the
> same time (lock order is reversed and so can deadlock).
> 
> > 
> >> +	bytes = zonefs_issue_copy(src_zi, src_off, dst_zi, dst_off, len);
> >> +	if (bytes < 0)
> >> +		goto unlock_exit;
> >> +
> >> +	ret += bytes;
> >> +
> >> +	file_update_time(dst_file);
> >> +	mutex_lock(&dst_zi->i_truncate_mutex);
> >> +	zonefs_update_stats(dst_inode, dst_off + bytes);
> >> +	zonefs_i_size_write(dst_inode, dst_off + bytes);
> >> +	dst_zi->i_wpoffset += bytes;
> > 
> > This is wierd. iszie for dst will be dst_zi->i_wpoffset. So please do:
> > 
> > 	dst_zi->i_wpoffset += bytes;
> > 	zonefs_i_size_write(dst_inode, dst_zi->i_wpoffset);
> > 
> >> +	mutex_unlock(&dst_zi->i_truncate_mutex);
> > 
> > And you are not taking care of the accounting for active zones here. If
> > the copy made the dst zone full, it is not active anymore. You need to
> > call zonefs_account_active();
> > 
> >> +	/* if we still have some bytes left, do splice copy */
> >> +	if (bytes && (bytes < len)) {
> >> +		bytes = do_splice_direct(src_file, &src_off, dst_file,
> >> +					 &dst_off, len, flags);
> > 
> > No way.
> > 
> >> +		if (bytes > 0)
> >> +			ret += bytes;
> >> +	}
> >> +unlock_exit:
> >> +	if (ret < 0)
> >> +		zonefs_io_error(dst_inode, true);
> > 
> > How can you be sure that you even did an IO when you get an error ?
> > zonefs_issue_copy() may have failed on its kmalloc() and no IO was done.
> > 
> >> +	inode_unlock(src_inode);
> >> +	inode_unlock(dst_inode);
> >> +	return ret;
> >> +}
> >> +
> >> +static ssize_t zonefs_copy_file_range(struct file *src_file, loff_t src_off,
> >> +				      struct file *dst_file, loff_t dst_off,
> >> +				      size_t len, unsigned int flags)
> >> +{
> >> +	ssize_t ret = -EIO;
> > 
> > This does not need to be initialized.
> > 
> >> +
> >> +	ret = zonefs_copy_file_checks(src_file, src_off, dst_file, dst_off,
> >> +				     &len, flags);
> > 
> > These checks need to be done for the generic implementation too, no ? Why
> > would checking this automatically trigger the offload ? What if the device
> > does not support offloading ?
> > 
> >> +	if (ret > 0)
> >> +		ret = zonefs_copy_file(src_file, src_off, dst_file, dst_off,
> >> +				     len, flags);
> > 
> > return here, then no need for the else. But see above. This seems all
> > broken to me.
> > 
> >> +	else if (ret < 0 && ret == -EXDEV)
> >> +		ret = generic_copy_file_range(src_file, src_off, dst_file,
> >> +					      dst_off, len, flags);
> >> +	return ret;
> >> +}
> >> +
> >>  static const struct file_operations zonefs_file_operations = {
> >>  	.open		= zonefs_file_open,
> >>  	.release	= zonefs_file_release,
> >> @@ -1234,6 +1411,7 @@ static const struct file_operations zonefs_file_operations = {
> >>  	.splice_read	= generic_file_splice_read,
> >>  	.splice_write	= iter_file_splice_write,
> >>  	.iopoll		= iocb_bio_iopoll,
> >> +	.copy_file_range = zonefs_copy_file_range,
> >>  };
> >>  
> >>  static struct kmem_cache *zonefs_inode_cachep;
> >> @@ -1804,6 +1982,7 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
> >>  	atomic_set(&sbi->s_active_seq_files, 0);
> >>  	sbi->s_max_active_seq_files = bdev_max_active_zones(sb->s_bdev);
> >>  
> >> +	/* set copy support by default */
> > 
> > What is this comment supposed to be for ?
> > 
> >>  	ret = zonefs_read_super(sb);
> >>  	if (ret)
> >>  		return ret;
> > 
> 
> -- 
> Damien Le Moal
> Western Digital Research
> 
> 

Acked. I do see a gap in current zonefs cfr implementation. I will drop this
implementation for next version. Once we finalize on block copy offload
implementation, will re-pick this and send with above reviews fixed.

Thank you,
Nitesh