[PATCH 11/14] xfs: emulate the btrfs dedupe extent same ioctl

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Emulate the BTRFS_IOC_EXTENT_SAME ioctl.  This operation is similar
to clone_range, but the kernel must confirm that the contents of the
two extents are identical before performing the reflink.

Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx>
---
 fs/xfs/libxfs/xfs_fs.h |   28 +++++++++++
 fs/xfs/xfs_ioctl.c     |  121 ++++++++++++++++++++++++++++++++++++++++++++++--
 fs/xfs/xfs_ioctl32.c   |    1 
 fs/xfs/xfs_reflink.c   |  109 +++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_reflink.h   |    6 ++
 5 files changed, 258 insertions(+), 7 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 92f21e1..7f4d886 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -560,7 +560,7 @@ typedef struct xfs_swapext
 #define XFS_IOC_GOINGDOWN	     _IOR ('X', 125, __uint32_t)
 /*	XFS_IOC_GETFSUUID ---------- deprecated 140	 */
 
-/* reflink ioctls; these should match btrfs */
+/* reflink ioctls; these MUST match the btrfs ioctl definitions */
 struct xfs_ioctl_clone_range_args {
 	__s64 src_fd;
 	__u64 src_offset;
@@ -568,8 +568,34 @@ struct xfs_ioctl_clone_range_args {
 	__u64 dest_offset;
 };
 
+#define XFS_SAME_DATA_DIFFERS	1
+/* Per-destination argument/result record for the extent-same ioctl */
+struct xfs_ioctl_file_extent_same_info {
+	__s64 fd;		/* in - destination file descriptor */
+	__u64 logical_offset;	/* in - start of extent in destination */
+	__u64 bytes_deduped;	/* out - bytes deduped into this file;
+				 * stays 0 unless status is 0 */
+	/* status of this dedupe operation:
+	 * 0 if dedup succeeds
+	 * < 0 for error (a negative errno value)
+	 * == XFS_SAME_DATA_DIFFERS if data differs
+	 */
+	__s32 status;		/* out - see above description */
+	__u32 reserved;
+};
+
+struct xfs_ioctl_file_extent_same_args {
+	__u64 logical_offset;	/* in - start of extent in source */
+	__u64 length;		/* in - length of extent; capped at 16MB/call */
+	__u16 dest_count;	/* in - total elements in info array */
+	__u16 reserved1;
+	__u32 reserved2;
+	struct xfs_ioctl_file_extent_same_info info[0];	/* dest_count entries */
+};
+
 #define XFS_IOC_CLONE		 _IOW (0x94, 9, int)
 #define XFS_IOC_CLONE_RANGE	 _IOW (0x94, 13, struct xfs_ioctl_clone_range_args)
+#define XFS_IOC_FILE_EXTENT_SAME _IOWR(0x94, 54, struct xfs_ioctl_file_extent_same_args)
 
 #ifndef HAVE_BBMACROS
 /*
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index c590786..da4d7b7 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1534,7 +1534,8 @@ xfs_ioctl_reflink(
 	loff_t		pos_in,
 	struct file	*file_out,
 	loff_t		pos_out,
-	size_t		len)
+	size_t		len,
+	bool		is_dedupe)
 {
 	struct inode	*inode_in;
 	struct inode	*inode_out;
@@ -1543,6 +1544,7 @@ xfs_ioctl_reflink(
 	loff_t		isize;
 	int		same_inode;
 	loff_t		blen;
+	unsigned int	flags;
 
 	if (len == 0)
 		return 0;
@@ -1622,7 +1624,12 @@ xfs_ioctl_reflink(
 	if (ret)
 		goto out_unlock;
 
-	ret = xfs_reflink(XFS_I(inode_in), pos_in, XFS_I(inode_out), pos_out, len);
+	flags = 0;
+	if (is_dedupe)
+		flags |= XFS_REFLINK_DEDUPE;
+
+	ret = xfs_reflink(XFS_I(inode_in), pos_in, XFS_I(inode_out), pos_out,
+			len, flags);
 	if (ret < 0)
 		goto out_unlock;
 
@@ -1644,6 +1651,108 @@ out_unlock:
 	return ret;
 }
 
+#define XFS_MAX_DEDUPE_LEN	(16 * 1024 * 1024)
+
+/*
+ * Dedupe a range of @file into each destination in the trailing info array.
+ * Per-destination results are reported in that entry's status/bytes_deduped;
+ * the return value reflects only argument copy-in/out and the source checks.
+ */
+static long
+xfs_ioctl_file_extent_same(
+	struct file					*file,
+	struct xfs_ioctl_file_extent_same_args __user	*argp)
+{
+	struct xfs_ioctl_file_extent_same_args	*same;
+	struct xfs_ioctl_file_extent_same_info	*info;
+	struct inode 				*src;
+	u64					off;
+	u64					len;
+	int					i;
+	int					ret;
+	unsigned long				size;
+	bool					is_admin;
+	u16					count;
+
+	is_admin = capable(CAP_SYS_ADMIN);
+	src = file_inode(file);
+	if (!(file->f_mode & FMODE_READ))
+		return -EINVAL;
+
+	if (get_user(count, &argp->dest_count))
+		return -EFAULT;
+
+	size = offsetof(struct xfs_ioctl_file_extent_same_args __user,
+			info[count]);
+
+	same = memdup_user(argp, size);
+	if (IS_ERR(same))
+		return PTR_ERR(same);
+
+	off = same->logical_offset;
+	len = same->length;
+
+	/*
+	 * Cap the length deduped per call so that the time spent in this
+	 * ioctl stays bounded.
+	 */
+	if (len > XFS_MAX_DEDUPE_LEN)
+		len = XFS_MAX_DEDUPE_LEN;
+
+	ret = -EISDIR;
+	if (S_ISDIR(src->i_mode))
+		goto out;
+
+	ret = -EACCES;
+	if (!S_ISREG(src->i_mode))
+		goto out;
+
+	/* pre-format output fields to sane values */
+	for (i = 0; i < count; i++) {
+		same->info[i].bytes_deduped = 0ULL;
+		same->info[i].status = 0;
+	}
+
+	for (i = 0, info = same->info; i < count; i++, info++) {
+		struct inode *dst;
+		struct fd dst_file = fdget(info->fd);
+		if (!dst_file.file) {
+			info->status = -EBADF;
+			continue;
+		}
+		dst = file_inode(dst_file.file);
+		if (!(is_admin || (dst_file.file->f_mode & FMODE_WRITE))) {
+			info->status = -EINVAL;
+		} else if (file->f_path.mnt != dst_file.file->f_path.mnt) {
+			info->status = -EXDEV;
+		} else if (S_ISDIR(dst->i_mode)) {
+			info->status = -EISDIR;
+		} else if (!S_ISREG(dst->i_mode)) {
+			info->status = -EACCES;
+		} else {
+			/* NOTE(review): returns > 0 pass through as-is */
+			info->status = xfs_ioctl_reflink(file, off,
+							 dst_file.file,
+							 info->logical_offset,
+							 len, true);
+			if (info->status == -EBADE)
+				info->status = XFS_SAME_DATA_DIFFERS;
+			else if (info->status == 0)
+				info->bytes_deduped = len;
+		}
+		fdput(dst_file);
+	}
+
+	ret = copy_to_user(argp, same, size);
+	if (ret)
+		ret = -EFAULT;
+
+out:
+	/* Every exit taken after memdup_user() succeeded frees the copy. */
+	kfree(same);
+	return ret;
+}
+
 /*
  * Note: some of the ioctl's return positive numbers as a
  * byte count indicating success, such as readlink_by_handle.
@@ -1949,7 +2058,7 @@ xfs_file_ioctl(
 		if (!src.file)
 			return -EBADF;
 
-		error = xfs_ioctl_reflink(src.file, 0, filp, 0, ~0ULL);
+		error = xfs_ioctl_reflink(src.file, 0, filp, 0, ~0ULL, false);
 		fdput(src);
 		if (error > 0)
 			error = 0;
@@ -1970,7 +2079,8 @@ xfs_file_ioctl(
 			args.src_length = ~0ULL;
 
 		error = xfs_ioctl_reflink(src.file, args.src_offset, filp,
-					  args.dest_offset, args.src_length);
+					  args.dest_offset, args.src_length,
+					  false);
 		fdput(src);
 		if (error > 0)
 			error = 0;
@@ -1978,6 +2088,9 @@ xfs_file_ioctl(
 		return error;
 	}
 
+	case XFS_IOC_FILE_EXTENT_SAME:
+		return xfs_ioctl_file_extent_same(filp, arg);
+
 	default:
 		return -ENOTTY;
 	}
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 76d8729..575c292 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -560,6 +560,7 @@ xfs_file_compat_ioctl(
 	case XFS_IOC_ERROR_CLEARALL:
 	case XFS_IOC_CLONE:
 	case XFS_IOC_CLONE_RANGE:
+	case XFS_IOC_FILE_EXTENT_SAME:
 		return xfs_file_ioctl(filp, cmd, p);
 #ifndef BROKEN_X86_ALIGNMENT
 	/* These are handled fine if no alignment issues */
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 4f027d3..325dd14 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -71,6 +71,94 @@
 				(len) <= (mp)->m_sb.sb_agblocks, label); \
 	} while(0);
 
+/*
+ * Return the page cache page backing byte @offset of @inode, reading it in
+ * via read_mapping_page().  On success the caller holds a page reference to
+ * drop with page_cache_release().  Returns an ERR_PTR if the read fails, or
+ * NULL if the page is not uptodate afterwards.
+ */
+static struct page *
+xfs_get_page(
+	struct inode	*inode,		/* inode */
+	xfs_off_t 	offset)		/* where in the inode to read */
+{
+	pgoff_t		index = offset >> PAGE_CACHE_SHIFT;
+	struct page	*pg;
+
+	pg = read_mapping_page(inode->i_mapping, index, NULL);
+
+	/* A read error (ERR_PTR) or a good page is handed straight back. */
+	if (IS_ERR(pg) || PageUptodate(pg))
+		return pg;
+
+	page_cache_release(pg);
+	return NULL;
+}
+
+/*
+ * Compare len bytes of src at srcoff against dest at destoff, page by page.
+ * Returns 0 with *is_same set, or a negative errno if a page can't be read.
+ */
+static int
+xfs_compare_extents(
+	struct inode	*src,		/* first inode */
+	xfs_off_t	srcoff,		/* offset of first inode */
+	struct inode	*dest,		/* second inode */
+	xfs_off_t	destoff,	/* offset of second inode */
+	xfs_off_t	len,		/* length of data to compare */
+	bool		*is_same)	/* out: true if the contents match */
+{
+	xfs_off_t	src_poff;
+	xfs_off_t	dest_poff;
+	void		*src_addr;
+	void		*dest_addr;
+	struct page	*src_page;
+	struct page	*dest_page;
+	xfs_off_t	cmp_len;
+	bool		same = true;
+
+	while (len) {
+		src_poff = srcoff & (PAGE_CACHE_SIZE - 1);
+		dest_poff = destoff & (PAGE_CACHE_SIZE - 1);
+		cmp_len = min(PAGE_CACHE_SIZE - src_poff,
+			      PAGE_CACHE_SIZE - dest_poff);
+		cmp_len = min(cmp_len, len);
+		ASSERT(cmp_len > 0);
+		/* xfs_get_page() fails with either an ERR_PTR or NULL. */
+		src_page = xfs_get_page(src, srcoff);
+		if (IS_ERR_OR_NULL(src_page))
+			return src_page ? PTR_ERR(src_page) : -EINVAL;
+		dest_page = xfs_get_page(dest, destoff);
+		if (IS_ERR_OR_NULL(dest_page)) {
+			page_cache_release(src_page);
+			return dest_page ? PTR_ERR(dest_page) : -EINVAL;
+		}
+		src_addr = kmap_atomic(src_page);
+		dest_addr = kmap_atomic(dest_page);
+
+		flush_dcache_page(src_page);
+		flush_dcache_page(dest_page);
+
+		if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
+			same = false;
+		/* Atomic kmaps nest like a stack: unmap in reverse order. */
+		kunmap_atomic(dest_addr);
+		kunmap_atomic(src_addr);
+		page_cache_release(src_page);
+		page_cache_release(dest_page);
+
+		if (!same)
+			break;
+
+		srcoff += cmp_len;
+		destoff += cmp_len;
+		len -= cmp_len;
+	}
+
+	*is_same = same;
+	return 0;
+}
+
 /**
  * xfs_reflink() - link a range of blocks from one inode to another
  *
@@ -86,7 +174,8 @@ xfs_reflink(
 	xfs_off_t		srcoff, /* offset in source file */
 	struct xfs_inode	*dest,	/* XFS inode to copy extents to */
 	xfs_off_t		destoff,/* offset in destination file */
-	xfs_off_t		len)	/* number of bytes to copy */
+	xfs_off_t		len,	/* number of bytes to copy */
+	unsigned int		flags)	/* reflink flags */
 {
 	struct xfs_mount	*mp = src->i_mount;
 	loff_t			uninitialized_var(offset);
@@ -105,6 +194,7 @@ xfs_reflink(
 	xfs_agnumber_t		agno;		/* allocation group number */
 	xfs_agblock_t		agbno;
 	int			done;
+	bool			is_same;
 	xfs_off_t		blen = ALIGN(len, mp->m_sb.sb_blocksize);
 
 	if (!xfs_sb_version_hasreflink(&mp->m_sb))
@@ -117,6 +207,9 @@ xfs_reflink(
 	if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
 		return -EINVAL;
 
+	if (flags & ~XFS_REFLINK_ALL)
+		return -EINVAL;
+
 	/* Lock both files against IO */
 	if (src->i_ino == dest->i_ino) {
 		xfs_ilock(src, XFS_IOLOCK_EXCL);
@@ -127,6 +220,20 @@ xfs_reflink(
 	}
 
 	/*
+	 * Check that the extents are the same.
+	 */
+	if (flags & XFS_REFLINK_DEDUPE) {
+		error = xfs_compare_extents(VFS_I(src), srcoff, VFS_I(dest),
+				destoff, len, &is_same);
+		if (error)
+			goto out_unlock_io;
+		if (!is_same) {
+			error = -EBADE;
+			goto out_unlock_io;
+		}
+	}
+
+	/*
 	 * Ensure the reflink bit is set in both inodes.
 	 */
 	if (!(src->i_d.di_flags & XFS_DIFLAG_REFLINK) ||
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index adfd99c..7f9660d 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -24,8 +24,12 @@ typedef struct xfs_reflink_end_io {
 	struct xfs_efi_log_item	*rlei_efi;
 } xfs_reflink_end_io_t;
 
+#define XFS_REFLINK_DEDUPE	1	/* only reflink if contents match */
+#define XFS_REFLINK_ALL		(XFS_REFLINK_DEDUPE)
+
 extern int xfs_reflink(struct xfs_inode *src, xfs_off_t srcoff,
-	struct xfs_inode *dest, xfs_off_t destoff, xfs_off_t len);
+	struct xfs_inode *dest, xfs_off_t destoff, xfs_off_t len,
+	unsigned int flags);
 
 extern int xfs_reflink_get_refcount(struct xfs_mount *mp, xfs_agnumber_t agno,
 	xfs_agblock_t agbno, xfs_extlen_t *len, xfs_nlink_t *nr);

_______________________________________________
xfs mailing list
xfs@xxxxxxxxxxx
http://oss.sgi.com/mailman/listinfo/xfs



[Index of Archives]     [Linux XFS Devel]     [Linux Filesystem Development]     [Filesystem Testing]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux