From: Darrick J. Wong <djwong@xxxxxxxxxx> Introduce a new ioctl to handle swapping ranges of bytes between files. Signed-off-by: Darrick J. Wong <djwong@xxxxxxxxxx> --- fs/read_write.c | 2 fs/remap_range.c | 4 fs/xfs/Makefile | 1 fs/xfs/libxfs/xfs_fs.h | 1 fs/xfs/libxfs/xfs_fs_staging.h | 89 ++++++++++ fs/xfs/xfs_ioctl.c | 30 +++ fs/xfs/xfs_xchgrange.c | 343 ++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_xchgrange.h | 18 ++ include/linux/fs.h | 1 9 files changed, 487 insertions(+), 2 deletions(-) create mode 100644 fs/xfs/xfs_xchgrange.c create mode 100644 fs/xfs/xfs_xchgrange.h diff --git a/fs/read_write.c b/fs/read_write.c index a21ba3be7dbe..480e687a1587 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1650,6 +1650,7 @@ int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count) return 0; } +EXPORT_SYMBOL(generic_write_check_limits); /* Like generic_write_checks(), but takes size of write instead of iter. */ int generic_write_checks_count(struct kiocb *iocb, loff_t *count) @@ -1718,3 +1719,4 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out) return 0; } +EXPORT_SYMBOL(generic_file_rw_checks); diff --git a/fs/remap_range.c b/fs/remap_range.c index 1331a890f2f2..ed1ee6576e03 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -98,8 +98,7 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in, return 0; } -static int remap_verify_area(struct file *file, loff_t pos, loff_t len, - bool write) +int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write) { if (unlikely(pos < 0 || len < 0)) return -EINVAL; @@ -109,6 +108,7 @@ static int remap_verify_area(struct file *file, loff_t pos, loff_t len, return security_file_permission(file, write ? MAY_WRITE : MAY_READ); } +EXPORT_SYMBOL(remap_verify_area); /* * Ensure that we don't remap a partial EOF block in the middle of something diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 56861c8f78cc..6cc3b1fe5754 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -93,6 +93,7 @@ xfs-y += xfs_aops.o \ xfs_sysfs.o \ xfs_trans.o \ xfs_xattr.o \ + xfs_xchgrange.o \ kmem.o # low-level transaction/log code diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 400cf68e551e..29857b0f87df 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -841,6 +841,7 @@ struct xfs_scrub_metadata { #define XFS_IOC_FSGEOMETRY _IOR ('X', 126, struct xfs_fsop_geom) #define XFS_IOC_BULKSTAT _IOR ('X', 127, struct xfs_bulkstat_req) #define XFS_IOC_INUMBERS _IOR ('X', 128, struct xfs_inumbers_req) +/* XFS_IOC_EXCHANGE_RANGE -------- staging 129 */ /* XFS_IOC_GETFSUUID ---------- deprecated 140 */ diff --git a/fs/xfs/libxfs/xfs_fs_staging.h b/fs/xfs/libxfs/xfs_fs_staging.h index bc97193dde9d..0453e7f31af0 100644 --- a/fs/xfs/libxfs/xfs_fs_staging.h +++ b/fs/xfs/libxfs/xfs_fs_staging.h @@ -15,4 +15,93 @@ * explaining where it went. */ +/* + * Exchange part of file1 with part of the file that this ioctl that is being + * called against (which we'll call file2). Filesystems must be able to + * restart and complete the operation even after the system goes down. + */ +struct xfs_exch_range { + __s64 file1_fd; + __s64 file1_offset; /* file1 offset, bytes */ + __s64 file2_offset; /* file2 offset, bytes */ + __u64 length; /* bytes to exchange */ + + __u64 flags; /* see XFS_EXCH_RANGE_* below */ + + /* file2 metadata for optional freshness checks */ + __s64 file2_ino; /* inode number */ + __s64 file2_mtime; /* modification time */ + __s64 file2_ctime; /* change time */ + __s32 file2_mtime_nsec; /* mod time, nsec */ + __s32 file2_ctime_nsec; /* change time, nsec */ + + __u64 pad[6]; /* must be zeroes */ +}; + +/* + * Atomic exchange operations are not required. This relaxes the requirement + * that the filesystem must be able to complete the operation after a crash. + */ +#define XFS_EXCH_RANGE_NONATOMIC (1 << 0) + +/* + * Check that file2's inode number, mtime, and ctime against the values + * provided, and return -EBUSY if there isn't an exact match. + */ +#define XFS_EXCH_RANGE_FILE2_FRESH (1 << 1) + +/* + * Check that the file1's length is equal to file1_offset + length, and that + * file2's length is equal to file2_offset + length. Returns -EDOM if there + * isn't an exact match. + */ +#define XFS_EXCH_RANGE_FULL_FILES (1 << 2) + +/* + * Exchange file data all the way to the ends of both files, and then exchange + * the file sizes. This flag can be used to replace a file's contents with a + * different amount of data. length will be ignored. + */ +#define XFS_EXCH_RANGE_TO_EOF (1 << 3) + +/* Flush all changes in file data and file metadata to disk before returning. */ +#define XFS_EXCH_RANGE_FSYNC (1 << 4) + +/* Dry run; do all the parameter verification but do not change anything. */ +#define XFS_EXCH_RANGE_DRY_RUN (1 << 5) + +/* + * Exchange only the parts of the two files where the file allocation units + * mapped to file1's range have been written to. This can accelerate + * scatter-gather atomic writes with a temp file if all writes are aligned to + * the file allocation unit. + */ +#define XFS_EXCH_RANGE_FILE1_WRITTEN (1 << 6) + +/* + * Commit the contents of file1 into file2 if file2 has the same inode number, + * mtime, and ctime as the arguments provided to the call. The old contents of + * file2 will be moved to file1. + * + * With this flag, all committed information can be retrieved even if the + * system crashes or is rebooted. This includes writing through or flushing a + * disk cache if present. The call blocks until the device reports that the + * commit is complete. + * + * This flag should not be combined with NONATOMIC. It can be combined with + * FILE1_WRITTEN. + */ +#define XFS_EXCH_RANGE_COMMIT (XFS_EXCH_RANGE_FILE2_FRESH | \ + XFS_EXCH_RANGE_FSYNC) + +#define XFS_EXCH_RANGE_ALL_FLAGS (XFS_EXCH_RANGE_NONATOMIC | \ + XFS_EXCH_RANGE_FILE2_FRESH | \ + XFS_EXCH_RANGE_FULL_FILES | \ + XFS_EXCH_RANGE_TO_EOF | \ + XFS_EXCH_RANGE_FSYNC | \ + XFS_EXCH_RANGE_DRY_RUN | \ + XFS_EXCH_RANGE_FILE1_WRITTEN) + +#define XFS_IOC_EXCHANGE_RANGE _IOWR('X', 129, struct xfs_exch_range) + #endif /* __XFS_FS_STAGING_H__ */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 0667e088a289..19724b3a5fdc 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -38,6 +38,7 @@ #include "xfs_reflink.h" #include "xfs_ioctl.h" #include "xfs_xattr.h" +#include "xfs_xchgrange.h" #include <linux/mount.h> #include <linux/namei.h> @@ -1862,6 +1863,32 @@ xfs_fs_eofblocks_from_user( return 0; } +static long +xfs_ioc_exchange_range( + struct file *file2, + struct xfs_exch_range __user *argp) +{ + struct xfs_exch_range args; + struct fd file1; + int error; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + file1 = fdget(args.file1_fd); + if (!file1.file) + return -EBADF; + + error = -EXDEV; + if (file1.file->f_path.mnt != file2->f_path.mnt) + goto fdput; + + error = xfs_exch_range(file1.file, file2, &args); +fdput: + fdput(file1); + return error; +} + /* * These long-unused ioctls were removed from the official ioctl API in 5.17, * but retain these definitions so that we can log warnings about them. @@ -2150,6 +2177,9 @@ xfs_file_ioctl( return error; } + case XFS_IOC_EXCHANGE_RANGE: + return xfs_ioc_exchange_range(filp, arg); + default: return -ENOTTY; } diff --git a/fs/xfs/xfs_xchgrange.c b/fs/xfs/xfs_xchgrange.c new file mode 100644 index 000000000000..b91df426d426 --- /dev/null +++ b/fs/xfs/xfs_xchgrange.c @@ -0,0 +1,343 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2020-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@xxxxxxxxxx> + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_xchgrange.h" +#include <linux/fsnotify.h> + +/* + * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE. + * This part does not deal with XFS-specific data structures, and may some day + * be ported to the VFS. + * + * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in + * file1 with the same number of bytes starting at fxr.file2_offset in file2. + * Implementations must call xfs_exch_range_prep to prepare the two files + * prior to taking locks; they must call xfs_exch_range_check_fresh once + * the inode is locked to abort the call if file2 has changed; and they must + * update the inode change and mod times of both files as part of the metadata + * update. The timestamp updates must be done atomically as part of the data + * exchange operation to ensure correctness of the freshness check. + */ + +/* + * Check that both files' metadata agree with the snapshot that we took for + * the range exchange request. + * + * This should be called after the filesystem has locked /all/ inode metadata + * against modification. + */ +STATIC int +xfs_exch_range_check_fresh( + struct inode *inode2, + const struct xfs_exch_range *fxr) +{ + /* Check that file2 hasn't otherwise been modified. */ + if ((fxr->flags & XFS_EXCH_RANGE_FILE2_FRESH) && + (fxr->file2_ino != inode2->i_ino || + fxr->file2_ctime != inode2->i_ctime.tv_sec || + fxr->file2_ctime_nsec != inode2->i_ctime.tv_nsec || + fxr->file2_mtime != inode2->i_mtime.tv_sec || + fxr->file2_mtime_nsec != inode2->i_mtime.tv_nsec)) + return -EBUSY; + + return 0; +} + +/* Performs necessary checks before doing a range exchange. */ +STATIC int +xfs_exch_range_checks( + struct file *file1, + struct file *file2, + struct xfs_exch_range *fxr, + unsigned int blocksize) +{ + struct inode *inode1 = file1->f_mapping->host; + struct inode *inode2 = file2->f_mapping->host; + uint64_t blkmask = blocksize - 1; + int64_t test_len; + uint64_t blen; + loff_t size1, size2; + int error; + + /* Don't touch certain kinds of inodes */ + if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2)) + return -EPERM; + if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2)) + return -ETXTBSY; + + size1 = i_size_read(inode1); + size2 = i_size_read(inode2); + + /* Ranges cannot start after EOF. */ + if (fxr->file1_offset > size1 || fxr->file2_offset > size2) + return -EINVAL; + + /* + * If the caller asked for full files, check that the offset/length + * values cover all of both files. + */ + if ((fxr->flags & XFS_EXCH_RANGE_FULL_FILES) && + (fxr->file1_offset != 0 || fxr->file2_offset != 0 || + fxr->length != size1 || fxr->length != size2)) + return -EDOM; + + /* + * If the caller said to exchange to EOF, we set the length of the + * request large enough to cover everything to the end of both files. + */ + if (fxr->flags & XFS_EXCH_RANGE_TO_EOF) + fxr->length = max_t(int64_t, size1 - fxr->file1_offset, + size2 - fxr->file2_offset); + + /* The start of both ranges must be aligned to an fs block. */ + if (!IS_ALIGNED(fxr->file1_offset, blocksize) || + !IS_ALIGNED(fxr->file2_offset, blocksize)) + return -EINVAL; + + /* Ensure offsets don't wrap. */ + if (fxr->file1_offset + fxr->length < fxr->file1_offset || + fxr->file2_offset + fxr->length < fxr->file2_offset) + return -EINVAL; + + /* + * We require both ranges to be within EOF, unless we're exchanging + * to EOF. xfs_xchg_range_prep already checked that both + * fxr->file1_offset and fxr->file2_offset are within EOF. + */ + if (!(fxr->flags & XFS_EXCH_RANGE_TO_EOF) && + (fxr->file1_offset + fxr->length > size1 || + fxr->file2_offset + fxr->length > size2)) + return -EINVAL; + + /* + * Make sure we don't hit any file size limits. If we hit any size + * limits such that test_length was adjusted, we abort the whole + * operation. + */ + test_len = fxr->length; + error = generic_write_check_limits(file2, fxr->file2_offset, &test_len); + if (error) + return error; + error = generic_write_check_limits(file1, fxr->file1_offset, &test_len); + if (error) + return error; + if (test_len != fxr->length) + return -EINVAL; + + /* + * If the user wanted us to exchange up to the infile's EOF, round up + * to the next block boundary for this check. Do the same for the + * outfile. + * + * Otherwise, reject the range length if it's not block aligned. We + * already confirmed the starting offsets' block alignment. + */ + if (fxr->file1_offset + fxr->length == size1) + blen = ALIGN(size1, blocksize) - fxr->file1_offset; + else if (fxr->file2_offset + fxr->length == size2) + blen = ALIGN(size2, blocksize) - fxr->file2_offset; + else if (!IS_ALIGNED(fxr->length, blocksize)) + return -EINVAL; + else + blen = fxr->length; + + /* Don't allow overlapped exchanges within the same file. */ + if (inode1 == inode2 && + fxr->file2_offset + blen > fxr->file1_offset && + fxr->file1_offset + blen > fxr->file2_offset) + return -EINVAL; + + /* If we already failed the freshness check, we're done. */ + error = xfs_exch_range_check_fresh(inode2, fxr); + if (error) + return error; + + /* + * Ensure that we don't exchange a partial EOF block into the middle of + * another file. + */ + if ((fxr->length & blkmask) == 0) + return 0; + + blen = fxr->length; + if (fxr->file2_offset + blen < size2) + blen &= ~blkmask; + + if (fxr->file1_offset + blen < size1) + blen &= ~blkmask; + + return blen == fxr->length ? 0 : -EINVAL; +} + +/* + * Check that the two inodes are eligible for range exchanges, the ranges make + * sense, and then flush all dirty data. Caller must ensure that the inodes + * have been locked against any other modifications. + */ +int +xfs_exch_range_prep( + struct file *file1, + struct file *file2, + struct xfs_exch_range *fxr, + unsigned int blocksize) +{ + struct inode *inode1 = file_inode(file1); + struct inode *inode2 = file_inode(file2); + bool same_inode = (inode1 == inode2); + int error; + + /* Check that we don't violate system file offset limits. */ + error = xfs_exch_range_checks(file1, file2, fxr, blocksize); + if (error || fxr->length == 0) + return error; + + /* Wait for the completion of any pending IOs on both files */ + inode_dio_wait(inode1); + if (!same_inode) + inode_dio_wait(inode2); + + error = filemap_write_and_wait_range(inode1->i_mapping, + fxr->file1_offset, + fxr->file1_offset + fxr->length - 1); + if (error) + return error; + + error = filemap_write_and_wait_range(inode2->i_mapping, + fxr->file2_offset, + fxr->file2_offset + fxr->length - 1); + if (error) + return error; + + /* + * If the files or inodes involved require synchronous writes, amend + * the request to force the filesystem to flush all data and metadata + * to disk after the operation completes. + */ + if (((file1->f_flags | file2->f_flags) & (__O_SYNC | O_DSYNC)) || + IS_SYNC(inode1) || IS_SYNC(inode2)) + fxr->flags |= XFS_EXCH_RANGE_FSYNC; + + return 0; +} + +/* + * Finish a range exchange operation, if it was successful. Caller must ensure + * that the inodes are still locked against any other modifications. + */ +int +xfs_exch_range_finish( + struct file *file1, + struct file *file2) +{ + int error; + + error = file_remove_privs(file1); + if (error) + return error; + if (file_inode(file1) == file_inode(file2)) + return 0; + + return file_remove_privs(file2); +} + +/* Decide if it's ok to remap the selected range of a given file. */ +STATIC int +xfs_exch_range_verify_area( + struct file *file, + loff_t pos, + struct xfs_exch_range *fxr) +{ + int64_t len = fxr->length; + + if (pos < 0) + return -EINVAL; + + if (fxr->flags & XFS_EXCH_RANGE_TO_EOF) + len = min_t(int64_t, len, i_size_read(file_inode(file)) - pos); + return remap_verify_area(file, pos, len, true); +} + +/* Prepare for and exchange parts of two files. */ +static inline int +__xfs_exch_range( + struct file *file1, + struct file *file2, + struct xfs_exch_range *fxr) +{ + struct inode *inode1 = file_inode(file1); + struct inode *inode2 = file_inode(file2); + int ret; + + if ((fxr->flags & ~XFS_EXCH_RANGE_ALL_FLAGS) || + memchr_inv(&fxr->pad, 0, sizeof(fxr->pad))) + return -EINVAL; + + if ((fxr->flags & XFS_EXCH_RANGE_FULL_FILES) && + (fxr->flags & XFS_EXCH_RANGE_TO_EOF)) + return -EINVAL; + + /* + * The ioctl enforces that src and dest files are on the same mount. + * However, they only need to be on the same file system. + */ + if (inode1->i_sb != inode2->i_sb) + return -EXDEV; + + /* This only works for regular files. */ + if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode)) + return -EISDIR; + if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode)) + return -EINVAL; + + ret = generic_file_rw_checks(file1, file2); + if (ret < 0) + return ret; + + ret = generic_file_rw_checks(file2, file1); + if (ret < 0) + return ret; + + ret = xfs_exch_range_verify_area(file1, fxr->file1_offset, fxr); + if (ret) + return ret; + + ret = xfs_exch_range_verify_area(file2, fxr->file2_offset, fxr); + if (ret) + return ret; + + ret = -EOPNOTSUPP; /* XXX call out to xfs code */ + if (ret) + return ret; + + fsnotify_modify(file1); + if (file2 != file1) + fsnotify_modify(file2); + return 0; +} + +/* Exchange parts of two files. */ +int +xfs_exch_range( + struct file *file1, + struct file *file2, + struct xfs_exch_range *fxr) +{ + int error; + + file_start_write(file2); + error = __xfs_exch_range(file1, file2, fxr); + file_end_write(file2); + + return error; +} diff --git a/fs/xfs/xfs_xchgrange.h b/fs/xfs/xfs_xchgrange.h new file mode 100644 index 000000000000..414fce7a159f --- /dev/null +++ b/fs/xfs/xfs_xchgrange.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2020-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@xxxxxxxxxx> + */ +#ifndef __XFS_XCHGRANGE_H__ +#define __XFS_XCHGRANGE_H__ + +/* Prepare generic VFS data structures for file exchanges */ + +int xfs_exch_range_prep(struct file *file1, struct file *file2, + struct xfs_exch_range *fxr, unsigned int blocksize); +int xfs_exch_range_finish(struct file *file1, struct file *file2); + +int xfs_exch_range(struct file *file1, struct file *file2, + struct xfs_exch_range *fxr); + +#endif /* __XFS_XCHGRANGE_H__ */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 147644b5d648..d7ee5122d40b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1884,6 +1884,7 @@ extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags); +int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write); int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags,