[PATCH v4 10/10] fs: add support for copy file range in zonefs

Nitesh Shetty <nj.shetty@xxxxxxxxxxx> · Tue, 26 Apr 2022 15:42:38 +0530

From: Arnav Dawn <arnav.dawn@xxxxxxxxxxx>

copy_file_range is implemented using copy offload; copy offloading to the
device is enabled by default.
To disable copy offloading, mount with the "no_copy_offload" mount option.
At present, copy offload is only used if the source and destination files
are on the same block device; otherwise the copy is completed by the
generic copy file range implementation.

copy file range is implemented as follows:
	- write back pending writes on the src and dest files
	- drop the page cache for the dest file if it is a conv zone
	- copy the range using offload
	- update the dest file info

For all failure cases we fall back to the generic copy file range.
At present this implementation does not support conv aggregation.

Signed-off-by: Arnav Dawn <arnav.dawn@xxxxxxxxxxx>
---
 fs/zonefs/super.c  | 178 ++++++++++++++++++++++++++++++++++++++++++++-
 fs/zonefs/zonefs.h |   1 +
 2 files changed, 178 insertions(+), 1 deletion(-)

diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index b3b0b71fdf6c..60563b592bf2 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -901,6 +901,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
 	else
 		ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
 				   &zonefs_write_dio_ops, 0, 0);
+
 	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
 	    (ret > 0 || ret == -EIOCBQUEUED)) {
 		if (ret > 0)
@@ -1189,6 +1190,171 @@ static int zonefs_file_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+/*
+ * Check that a copy of @len bytes fits both files: the source range starting
+ * at @src_off must not extend past the source file size, and
+ * inode_newsize_ok() must accept @dst_off + @len as a size for the destination.
+ *
+ * Returns 0 when both checks pass and -EOPNOTSUPP otherwise.
+ */
+static int zonefs_is_file_size_ok(struct inode *src_inode, struct inode *dst_inode,
+			   loff_t src_off, loff_t dst_off, size_t len)
+{
+	loff_t src_size = i_size_read(src_inode);
+	loff_t dst_end = dst_off + len;
+
+	/* Refuse to read past the end of the source file. */
+	if (src_off + len > src_size) {
+		zonefs_err(src_inode->i_sb, "Copy beyond EOF (%llu + %zu > %llu)\n",
+		     src_off, len, src_size);
+		return -EOPNOTSUPP;
+	}
+
+	/* Any non-zero answer from inode_newsize_ok() rejects the new end offset. */
+	return inode_newsize_ok(dst_inode, dst_end) ? -EOPNOTSUPP : 0;
+}
+
+/*
+ * Issue one offloaded copy of @len bytes from byte offset @src_off of the
+ * source file to byte offset @dst_off of the destination file.
+ *
+ * File offsets are turned into device byte addresses by adding each file's
+ * i_zsector shifted left by SECTOR_SHIFT.
+ *
+ * Returns the number of bytes copied, never a negative value: @len on success,
+ * the completed length recorded in the range entry when blkdev_issue_copy()
+ * reports an error, and 0 when the range entry cannot be allocated.
+ */
+static ssize_t __zonefs_send_copy(struct zonefs_inode_info *src_zi, loff_t src_off,
+				struct zonefs_inode_info *dst_zi, loff_t dst_off, size_t len)
+{
+	struct block_device *src_bdev = src_zi->i_vnode.i_sb->s_bdev;
+	struct block_device *dst_bdev = dst_zi->i_vnode.i_sb->s_bdev;
+	struct range_entry *rlist;
+	ssize_t copied = len;
+
+	rlist = kmalloc(sizeof(*rlist), GFP_KERNEL);
+	if (!rlist)
+		return 0;	/* report zero bytes copied instead of dereferencing NULL */
+
+	rlist[0].dst = (dst_zi->i_zsector << SECTOR_SHIFT) + dst_off;
+	rlist[0].src = (src_zi->i_zsector << SECTOR_SHIFT) + src_off;
+	rlist[0].len = len;
+	rlist[0].comp_len = 0;
+
+	/*
+	 * On error, report only the completed length. It is kept in an ssize_t so
+	 * that large lengths are not truncated on the way out.
+	 */
+	if (blkdev_issue_copy(src_bdev, 1, rlist, dst_bdev, GFP_KERNEL) &&
+	    rlist[0].comp_len != len)
+		copied = rlist[0].comp_len;
+
+	kfree(rlist);
+	return copied;
+}
+
+/*
+ * Try to perform a copy_file_range() between two zonefs files with copy
+ * offload.
+ *
+ * Steps:
+ *   - reject copies between different block devices (-EXDEV) or with copy
+ *     offload disabled on either mount (-EOPNOTSUPP);
+ *   - write back the affected range of conventional zone files;
+ *   - under the destination's i_truncate_mutex: clamp @len to the space left
+ *     before i_max_size, require @dst_off to equal i_wpoffset, validate the
+ *     file sizes, drop cached destination pages for a conventional zone,
+ *     issue the offloaded copy and advance the destination size and
+ *     i_wpoffset by the number of bytes copied;
+ *   - if the offload copied only part of the range, copy the remainder with
+ *     do_splice_direct().
+ *
+ * Returns the number of bytes copied or a negative errno. The caller in this
+ * file retries with generic_copy_file_range() on -EOPNOTSUPP and -EXDEV.
+ */
+static ssize_t __zonefs_copy_file_range(struct file *src_file, loff_t src_off,
+				      struct file *dst_file, loff_t dst_off,
+				      size_t len, unsigned int flags)
+{
+	struct inode *src_inode = file_inode(src_file);
+	struct inode *dst_inode = file_inode(dst_file);
+	struct zonefs_inode_info *src_zi = ZONEFS_I(src_inode);
+	struct zonefs_inode_info *dst_zi = ZONEFS_I(dst_inode);
+	struct super_block *src_sb = src_inode->i_sb;
+	struct super_block *dst_sb = dst_inode->i_sb;
+	struct zonefs_sb_info *src_sbi = ZONEFS_SB(src_sb);
+	struct zonefs_sb_info *dst_sbi = ZONEFS_SB(dst_sb);
+	bool same_inode = (src_zi == dst_zi);
+	ssize_t ret, bytes;
+	loff_t avail;
+
+	if (src_sb->s_bdev != dst_sb->s_bdev) {
+		zonefs_err(src_sb, "Copying files across two devices\n");
+		return -EXDEV;
+	}
+
+	/*
+	 * Some of the checks below will return -EOPNOTSUPP,
+	 * which will force a generic copy
+	 */
+	if (!(src_sbi->s_mount_opts & ZONEFS_MNTOPT_COPY_FILE) ||
+	    !(dst_sbi->s_mount_opts & ZONEFS_MNTOPT_COPY_FILE))
+		return -EOPNOTSUPP;
+
+	/* Start by sync'ing the source and destination files for conv zones */
+	if (src_zi->i_ztype == ZONEFS_ZTYPE_CNV) {
+		ret = file_write_and_wait_range(src_file, src_off, (src_off + len));
+		if (ret < 0) {
+			zonefs_err(src_sb, "failed to write source file (%zd)\n", ret);
+			return ret;
+		}
+	}
+	if (dst_zi->i_ztype == ZONEFS_ZTYPE_CNV) {
+		ret = file_write_and_wait_range(dst_file, dst_off, (dst_off + len));
+		if (ret < 0) {
+			zonefs_err(dst_sb, "failed to write destination file (%zd)\n", ret);
+			return ret;
+		}
+	}
+
+	mutex_lock(&dst_zi->i_truncate_mutex);
+
+	/*
+	 * Clamp the request to the space left between i_wpoffset and i_max_size.
+	 * With no space left there is nothing the offload can do.
+	 */
+	avail = dst_zi->i_max_size - dst_zi->i_wpoffset;
+	if (len > avail) {
+		len = avail;
+		if (!len) {
+			ret = -EOPNOTSUPP;
+			goto out_unlock;
+		}
+	}
+
+	if (dst_off != dst_zi->i_wpoffset) {
+		ret = -EOPNOTSUPP; /* copy not at zone write ptr */
+		goto out_unlock;
+	}
+
+	/*
+	 * The size check runs with the source's truncate mutex held as well. When
+	 * both files share one inode that mutex is already held above, so taking
+	 * it a second time would deadlock.
+	 * NOTE(review): two concurrent copies in opposite directions acquire the
+	 * two mutexes in opposite order; confirm whether a fixed lock order (and
+	 * a lockdep nesting annotation) is needed here.
+	 */
+	if (!same_inode)
+		mutex_lock(&src_zi->i_truncate_mutex);
+	ret = zonefs_is_file_size_ok(src_inode, dst_inode, src_off, dst_off, len);
+	if (!same_inode)
+		mutex_unlock(&src_zi->i_truncate_mutex);
+	if (ret < 0)
+		goto out_unlock;
+
+	/* Drop dst file cached pages for a conv zone */
+	if (dst_zi->i_ztype == ZONEFS_ZTYPE_CNV) {
+		ret = invalidate_inode_pages2_range(dst_inode->i_mapping,
+						    dst_off >> PAGE_SHIFT,
+						    (dst_off + len) >> PAGE_SHIFT);
+		/* Best effort: a failed invalidation is logged, the copy goes on. */
+		if (ret < 0)
+			zonefs_err(dst_sb, "Failed to invalidate inode pages (%zd)\n", ret);
+	}
+
+	bytes = __zonefs_send_copy(src_zi, src_off, dst_zi, dst_off, len);
+	if (bytes <= 0) {
+		/*
+		 * No bytes were copied: leave the destination metadata untouched
+		 * and let the caller fall back to the generic copy.
+		 */
+		ret = -EOPNOTSUPP;
+		goto out_unlock;
+	}
+	ret = bytes;
+
+	file_update_time(dst_file);
+	zonefs_update_stats(dst_inode, dst_off + bytes);
+	zonefs_i_size_write(dst_inode, dst_off + bytes);
+	dst_zi->i_wpoffset += bytes;
+	mutex_unlock(&dst_zi->i_truncate_mutex);
+
+	/*
+	 * if we still have some bytes left, do splice copy
+	 */
+	if (bytes < len) {
+		size_t remaining = len - bytes;
+
+		/* Continue right after the part the offload already copied. */
+		src_off += bytes;
+		dst_off += bytes;
+		zonefs_info(src_sb, "Final partial copy of %zu bytes\n", remaining);
+		bytes = do_splice_direct(src_file, &src_off, dst_file,
+					 &dst_off, remaining, flags);
+		if (bytes > 0)
+			ret += bytes;
+		else
+			zonefs_info(src_sb, "Failed partial copy (%zd)\n", bytes);
+	}
+
+	return ret;
+
+out_unlock:
+	mutex_unlock(&dst_zi->i_truncate_mutex);
+	return ret;
+}
+
+/*
+ * ->copy_file_range() entry point: try the zonefs copy path first and retry
+ * with generic_copy_file_range() when it answers -EOPNOTSUPP or -EXDEV.
+ * Any other result is handed back to the caller unchanged.
+ */
+static ssize_t zonefs_copy_file_range(struct file *src_file, loff_t src_off,
+				    struct file *dst_file, loff_t dst_off,
+				    size_t len, unsigned int flags)
+{
+	ssize_t copied = __zonefs_copy_file_range(src_file, src_off, dst_file,
+						  dst_off, len, flags);
+
+	switch (copied) {
+	case -EOPNOTSUPP:
+	case -EXDEV:
+		/* Same arguments, generic implementation. */
+		return generic_copy_file_range(src_file, src_off, dst_file,
+					       dst_off, len, flags);
+	default:
+		return copied;
+	}
+}
+
 static const struct file_operations zonefs_file_operations = {
 	.open		= zonefs_file_open,
 	.release	= zonefs_file_release,
@@ -1200,6 +1366,7 @@ static const struct file_operations zonefs_file_operations = {
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= iter_file_splice_write,
 	.iopoll		= iocb_bio_iopoll,
+	.copy_file_range = zonefs_copy_file_range,
 };
 
 static struct kmem_cache *zonefs_inode_cachep;
@@ -1262,7 +1429,7 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 enum {
 	Opt_errors_ro, Opt_errors_zro, Opt_errors_zol, Opt_errors_repair,
-	Opt_explicit_open, Opt_err,
+	Opt_explicit_open, Opt_no_copy_offload, Opt_err,
 };
 
 static const match_table_t tokens = {
@@ -1271,6 +1438,7 @@ static const match_table_t tokens = {
 	{ Opt_errors_zol,	"errors=zone-offline"},
 	{ Opt_errors_repair,	"errors=repair"},
 	{ Opt_explicit_open,	"explicit-open" },
+	{ Opt_no_copy_offload,	"no_copy_offload" },
 	{ Opt_err,		NULL}
 };
 
@@ -1280,6 +1448,7 @@ static int zonefs_parse_options(struct super_block *sb, char *options)
 	substring_t args[MAX_OPT_ARGS];
 	char *p;
 
+	sbi->s_mount_opts |= ZONEFS_MNTOPT_COPY_FILE;
 	if (!options)
 		return 0;
 
@@ -1310,6 +1479,9 @@ static int zonefs_parse_options(struct super_block *sb, char *options)
 		case Opt_explicit_open:
 			sbi->s_mount_opts |= ZONEFS_MNTOPT_EXPLICIT_OPEN;
 			break;
+		case Opt_no_copy_offload:
+			sbi->s_mount_opts &= ~ZONEFS_MNTOPT_COPY_FILE;
+			break;
 		default:
 			return -EINVAL;
 		}
@@ -1330,6 +1502,8 @@ static int zonefs_show_options(struct seq_file *seq, struct dentry *root)
 		seq_puts(seq, ",errors=zone-offline");
 	if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_REPAIR)
 		seq_puts(seq, ",errors=repair");
+	if (sbi->s_mount_opts & ZONEFS_MNTOPT_COPY_FILE)
+		seq_puts(seq, ",copy_offload");
 
 	return 0;
 }
@@ -1769,6 +1943,8 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
 	atomic_set(&sbi->s_active_seq_files, 0);
 	sbi->s_max_active_seq_files = bdev_max_active_zones(sb->s_bdev);
 
+	/* set copy support by default */
+	sbi->s_mount_opts |= ZONEFS_MNTOPT_COPY_FILE;
 	ret = zonefs_read_super(sb);
 	if (ret)
 		return ret;
diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h
index 4b3de66c3233..efa6632c4b6a 100644
--- a/fs/zonefs/zonefs.h
+++ b/fs/zonefs/zonefs.h
@@ -162,6 +162,7 @@ enum zonefs_features {
 	(ZONEFS_MNTOPT_ERRORS_RO | ZONEFS_MNTOPT_ERRORS_ZRO | \
 	 ZONEFS_MNTOPT_ERRORS_ZOL | ZONEFS_MNTOPT_ERRORS_REPAIR)
 #define ZONEFS_MNTOPT_EXPLICIT_OPEN	(1 << 4) /* Explicit open/close of zones on open/close */
+#define ZONEFS_MNTOPT_COPY_FILE		(1 << 5) /* enable copy file range offload to kernel */
 
 /*
  * In-memory Super block information.
-- 
2.35.1.500.gb896f729e2