On Wed, Nov 23, 2022 at 08:53:14AM +0200, Amir Goldstein wrote: > On Wed, Nov 23, 2022 at 8:26 AM Nitesh Shetty <nj.shetty@xxxxxxxxxxx> wrote: > > > > copy_file_range is implemented using copy offload, > > copy offloading to device is always enabled. > > To disable copy offloading mount with "no_copy_offload" mount option. > > At present copy offload is only used, if the source and destination files > > are on same block device, otherwise copy file range is completed by > > generic copy file range. > > > > copy file range implemented as following: > > - write pending writes on the src and dest files > > - drop page cache for dest file if its conv zone > > - copy the range using offload > > - update dest file info > > > > For all failure cases we fallback to generic file copy range > > At present this implementation does not support conv aggregation > > > > Signed-off-by: Nitesh Shetty <nj.shetty@xxxxxxxxxxx> > > Signed-off-by: Anuj Gupta <anuj20.g@xxxxxxxxxxx> > > --- > > fs/zonefs/super.c | 179 ++++++++++++++++++++++++++++++++++++++++++++++ > > 1 file changed, 179 insertions(+) > > > > diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c > > index abc9a85106f2..15613433d4ae 100644 > > --- a/fs/zonefs/super.c > > +++ b/fs/zonefs/super.c > > @@ -1223,6 +1223,183 @@ static int zonefs_file_release(struct inode *inode, struct file *file) > > return 0; > > } > > > > +static int zonefs_is_file_copy_offset_ok(struct inode *src_inode, > > + struct inode *dst_inode, loff_t src_off, loff_t dst_off, > > + size_t *len) > > +{ > > + loff_t size, endoff; > > + struct zonefs_inode_info *dst_zi = ZONEFS_I(dst_inode); > > + > > + inode_lock(src_inode); > > + size = i_size_read(src_inode); > > + inode_unlock(src_inode); > > + /* Don't copy beyond source file EOF. 
*/ > > + if (src_off < size) { > > + if (src_off + *len > size) > > + *len = (size - (src_off + *len)); > > + } else > > + *len = 0; > > + > > + mutex_lock(&dst_zi->i_truncate_mutex); > > + if (dst_zi->i_ztype == ZONEFS_ZTYPE_SEQ) { > > + if (*len > dst_zi->i_max_size - dst_zi->i_wpoffset) > > + *len -= dst_zi->i_max_size - dst_zi->i_wpoffset; > > + > > + if (dst_off != dst_zi->i_wpoffset) > > + goto err; > > + } > > + mutex_unlock(&dst_zi->i_truncate_mutex); > > + > > + endoff = dst_off + *len; > > + inode_lock(dst_inode); > > + if (endoff > dst_zi->i_max_size || > > + inode_newsize_ok(dst_inode, endoff)) { > > + inode_unlock(dst_inode); > > + goto err; > > + } > > + inode_unlock(dst_inode); > > + > > + return 0; > > +err: > > + mutex_unlock(&dst_zi->i_truncate_mutex); > > + return -EINVAL; > > +} > > + > > +static ssize_t zonefs_issue_copy(struct zonefs_inode_info *src_zi, > > + loff_t src_off, struct zonefs_inode_info *dst_zi, > > + loff_t dst_off, size_t len) > > +{ > > + struct block_device *src_bdev = src_zi->i_vnode.i_sb->s_bdev; > > + struct block_device *dst_bdev = dst_zi->i_vnode.i_sb->s_bdev; > > + struct range_entry *rlist = NULL; > > + int ret = len; > > + > > + rlist = kmalloc(sizeof(*rlist), GFP_KERNEL); > > + if (!rlist) > > + return -ENOMEM; > > + > > + rlist[0].dst = (dst_zi->i_zsector << SECTOR_SHIFT) + dst_off; > > + rlist[0].src = (src_zi->i_zsector << SECTOR_SHIFT) + src_off; > > + rlist[0].len = len; > > + rlist[0].comp_len = 0; > > + ret = blkdev_issue_copy(src_bdev, dst_bdev, rlist, 1, NULL, NULL, > > + GFP_KERNEL); > > + if (rlist[0].comp_len > 0) > > + ret = rlist[0].comp_len; > > + kfree(rlist); > > + > > + return ret; > > +} > > + > > +/* Returns length of possible copy, else returns error */ > > +static ssize_t zonefs_copy_file_checks(struct file *src_file, loff_t src_off, > > + struct file *dst_file, loff_t dst_off, > > + size_t *len, unsigned int flags) > > +{ > > + struct inode *src_inode = file_inode(src_file); > > + struct inode 
*dst_inode = file_inode(dst_file); > > + struct zonefs_inode_info *src_zi = ZONEFS_I(src_inode); > > + struct zonefs_inode_info *dst_zi = ZONEFS_I(dst_inode); > > + ssize_t ret; > > + > > + if (src_inode->i_sb != dst_inode->i_sb) > > + return -EXDEV; > > + > > + /* Start by sync'ing the source and destination files for conv zones */ > > + if (src_zi->i_ztype == ZONEFS_ZTYPE_CNV) { > > + ret = file_write_and_wait_range(src_file, src_off, > > + (src_off + *len)); > > + if (ret < 0) > > + goto io_error; > > + } > > + inode_dio_wait(src_inode); > > + > > + /* Start by sync'ing the source and destination files ifor conv zones */ > > + if (dst_zi->i_ztype == ZONEFS_ZTYPE_CNV) { > > + ret = file_write_and_wait_range(dst_file, dst_off, > > + (dst_off + *len)); > > + if (ret < 0) > > + goto io_error; > > + } > > + inode_dio_wait(dst_inode); > > + > > + /* Drop dst file cached pages for a conv zone*/ > > + if (dst_zi->i_ztype == ZONEFS_ZTYPE_CNV) { > > + ret = invalidate_inode_pages2_range(dst_inode->i_mapping, > > + dst_off >> PAGE_SHIFT, > > + (dst_off + *len) >> PAGE_SHIFT); > > + if (ret < 0) > > + goto io_error; > > + } > > + > > + ret = zonefs_is_file_copy_offset_ok(src_inode, dst_inode, src_off, > > + dst_off, len); > > + if (ret < 0) > > + return ret; > > + > > + return *len; > > + > > +io_error: > > + zonefs_io_error(dst_inode, true); > > + return ret; > > +} > > + > > +static ssize_t zonefs_copy_file(struct file *src_file, loff_t src_off, > > + struct file *dst_file, loff_t dst_off, > > + size_t len, unsigned int flags) > > +{ > > + struct inode *src_inode = file_inode(src_file); > > + struct inode *dst_inode = file_inode(dst_file); > > + struct zonefs_inode_info *src_zi = ZONEFS_I(src_inode); > > + struct zonefs_inode_info *dst_zi = ZONEFS_I(dst_inode); > > + ssize_t ret = 0, bytes; > > + > > + inode_lock(src_inode); > > + inode_lock(dst_inode); > > + bytes = zonefs_issue_copy(src_zi, src_off, dst_zi, dst_off, len); > > + if (bytes < 0) > > + goto unlock_exit; > > 
+ > > + ret += bytes; > > + > > + file_update_time(dst_file); > > + mutex_lock(&dst_zi->i_truncate_mutex); > > + zonefs_update_stats(dst_inode, dst_off + bytes); > > + zonefs_i_size_write(dst_inode, dst_off + bytes); > > + dst_zi->i_wpoffset += bytes; > > + mutex_unlock(&dst_zi->i_truncate_mutex); > > + /* if we still have some bytes left, do splice copy */ > > + if (bytes && (bytes < len)) { > > + bytes = do_splice_direct(src_file, &src_off, dst_file, > > + &dst_off, len, flags); > > + if (bytes > 0) > > + ret += bytes; > > + } > > +unlock_exit: > > + if (ret < 0) > > + zonefs_io_error(dst_inode, true); > > + inode_unlock(src_inode); > > + inode_unlock(dst_inode); > > + return ret; > > +} > > + > > +static ssize_t zonefs_copy_file_range(struct file *src_file, loff_t src_off, > > + struct file *dst_file, loff_t dst_off, > > + size_t len, unsigned int flags) > > +{ > > + ssize_t ret = -EIO; > > + > > + ret = zonefs_copy_file_checks(src_file, src_off, dst_file, dst_off, > > + &len, flags); > > + if (ret > 0) > > + ret = zonefs_copy_file(src_file, src_off, dst_file, dst_off, > > + len, flags); > > + else if (ret < 0 && ret == -EXDEV) > > First of all, ret < 0 is redundant. > acked > > + ret = generic_copy_file_range(src_file, src_off, dst_file, > > + dst_off, len, flags); > > But more importantly, why do you want to fall back to > do_splice_direct() in zonefs copy_file_range? > How does it serve your patch set or the prospect consumers > of zonefs copy_file_range? > > The reason I am asking is because commit 5dae222a5ff0 > ("vfs: allow copy_file_range to copy across devices") > turned out to be an API mistake that was later reverted by > 868f9f2f8e00 ("vfs: fix copy_file_range() regression in cross-fs copies") > > It is always better to return EXDEV to userspace which can > always fallback to splice itself, but maybe it has something > smarter to do. 
> > The places where it made sense for kernel to fallback to > direct splice was for network servers server-side-copy, but that > is independent of any specific filesystem copy_file_range() > implementation. > > Thanks, > Amir. > At present we don't handle a few cases, such as I/O getting split in case of copy offload, so we wanted to fall back to the existing mechanism. That is why we went with the default operation, do_splice_direct. Regards, Nitesh Shetty