From: Darrick J. Wong <djwong@xxxxxxxxxx> This patch introduces two more new ioctls to manage atomic updates to file contents -- XFS_IOC_START_COMMIT and XFS_IOC_COMMIT_RANGE. The commit mechanism here is exactly the same as what XFS_IOC_EXCHANGE_RANGE does, but with the additional requirement that file2 cannot have changed since some sampling point. The start-commit ioctl performs the sampling of file attributes. Signed-off-by: Darrick J. Wong <djwong@xxxxxxxxxx> --- fs/xfs/libxfs/xfs_fs.h | 26 ++++++++++++++ fs/xfs/xfs_exchrange.c | 54 +++++++++++++++++++++++++++++ fs/xfs/xfs_exchrange.h | 7 +++- fs/xfs/xfs_ioctl.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_trace.h | 3 +- 5 files changed, 175 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index ea07fb7b89722..dffc7322c48d1 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -790,6 +790,30 @@ struct xfs_exchange_range { __u64 flags; /* see XFS_EXCHANGE_RANGE_* below */ }; +/* + * Using the same definition of file2 as struct xfs_exchange_range, commit the + * contents of file1 into file2 if file2 has the same inode number, mtime, and + * ctime as the arguments provided to the call. The old contents of file2 will + * be moved to file1. + * + * Returns -EBUSY if there isn't an exact match for the file2 fields. + * + * Filesystems must be able to restart and complete the operation even after + * the system goes down. + */ +struct xfs_commit_range { + __s32 file1_fd; + __u32 pad; /* must be zeroes */ + __u64 file1_offset; /* file1 offset, bytes */ + __u64 file2_offset; /* file2 offset, bytes */ + __u64 length; /* bytes to exchange */ + + __u64 flags; /* see XFS_EXCHANGE_RANGE_* below */ + + /* opaque file2 metadata for freshness checks */ + __u64 file2_freshness[5]; +}; + /* * Exchange file data all the way to the ends of both files, and then exchange * the file sizes. This flag can be used to replace a file's contents with a @@ -887,6 +911,8 @@ struct xfs_exchange_range { #define XFS_IOC_BULKSTAT _IOR ('X', 127, struct xfs_bulkstat_req) #define XFS_IOC_INUMBERS _IOR ('X', 128, struct xfs_inumbers_req) #define XFS_IOC_EXCHANGE_RANGE _IOWR('X', 129, struct xfs_exchange_range) +#define XFS_IOC_START_COMMIT _IOWR('X', 130, struct xfs_commit_range) +#define XFS_IOC_COMMIT_RANGE _IOWR('X', 131, struct xfs_commit_range) /* XFS_IOC_GETFSUUID ---------- deprecated 140 */ diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c index 2d33c7de04f4c..73eae27a23016 100644 --- a/fs/xfs/xfs_exchrange.c +++ b/fs/xfs/xfs_exchrange.c @@ -131,6 +131,33 @@ xfs_exchrange_estimate( return error; } +/* + * Check that file2's metadata agree with the snapshot that we took for the + * range commit request. + * + * This should be called after the filesystem has locked /all/ inode metadata + * against modification. + */ +STATIC int +xfs_exchrange_check_freshness( + const struct xfs_exchrange *fxr, + struct xfs_inode *ip2) +{ + struct inode *inode2 = VFS_I(ip2); + struct timespec64 ctime = inode_get_ctime(inode2); + struct timespec64 mtime = inode_get_mtime(inode2); + + trace_xfs_exchrange_freshness(fxr, ip2); + + /* Check that file2 hasn't otherwise been modified. */ + if (fxr->file2_ino != ip2->i_ino || + !timespec64_equal(&fxr->file2_ctime, &ctime) || + !timespec64_equal(&fxr->file2_mtime, &mtime)) + return -EBUSY; + + return 0; +} + #define QRETRY_IP1 (0x1) #define QRETRY_IP2 (0x2) @@ -666,6 +693,12 @@ xfs_exchrange_prep( if (error || fxr->length == 0) return error; + if (fxr->flags & __XFS_EXCHANGE_RANGE_CHECK_FRESH2) { + error = xfs_exchrange_check_freshness(fxr, ip2); + if (error) + return error; + } + /* Attach dquots to both inodes before changing block maps. */ error = xfs_qm_dqattach(ip2); if (error) @@ -780,7 +813,8 @@ xfs_exchange_range( if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt) return -EXDEV; - if (fxr->flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS) + if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS | + __XFS_EXCHANGE_RANGE_CHECK_FRESH2)) return -EINVAL; /* Userspace requests only honored for regular files. */ @@ -828,3 +862,21 @@ xfs_exchange_range( fsnotify_modify(fxr->file2); return 0; } + +/* Sample freshness data from fxr->file2 for a commit range operation. */ +void +xfs_exchrange_freshness( + struct xfs_exchrange *fxr) +{ + struct inode *inode2 = file_inode(fxr->file2); + struct xfs_inode *ip2 = XFS_I(inode2); + unsigned int lockflags = XFS_IOLOCK_SHARED | + XFS_MMAPLOCK_SHARED | + XFS_ILOCK_SHARED; + + xfs_ilock(ip2, lockflags); + fxr->file2_ino = ip2->i_ino; + fxr->file2_ctime = inode_get_ctime(inode2); + fxr->file2_mtime = inode_get_mtime(inode2); + xfs_iunlock(ip2, lockflags); +} diff --git a/fs/xfs/xfs_exchrange.h b/fs/xfs/xfs_exchrange.h index 804bb92cb5c29..80f8aa5239ef1 100644 --- a/fs/xfs/xfs_exchrange.h +++ b/fs/xfs/xfs_exchrange.h @@ -13,8 +13,12 @@ int xfs_exchrange_enable(struct xfs_mount *mp); #define __XFS_EXCHANGE_RANGE_UPD_CMTIME1 (1ULL << 63) #define __XFS_EXCHANGE_RANGE_UPD_CMTIME2 (1ULL << 62) +/* Freshness check required */ +#define __XFS_EXCHANGE_RANGE_CHECK_FRESH2 (1ULL << 61) + #define XFS_EXCHANGE_RANGE_PRIV_FLAGS (__XFS_EXCHANGE_RANGE_UPD_CMTIME1 | \ - __XFS_EXCHANGE_RANGE_UPD_CMTIME2) + __XFS_EXCHANGE_RANGE_UPD_CMTIME2 | \ + __XFS_EXCHANGE_RANGE_CHECK_FRESH2) struct xfs_exchrange { struct file *file1; @@ -32,6 +36,7 @@ struct xfs_exchrange { struct timespec64 file2_ctime; }; +void xfs_exchrange_freshness(struct xfs_exchrange *fxr); int xfs_exchange_range(struct xfs_exchrange *fxr); struct xfs_exchmaps_req; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index b11077b12f5fd..d2fc710d2d506 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1965,6 +1965,90 @@ xfs_ioc_exchange_range( return error; } +/* Opaque freshness blob for XFS_IOC_COMMIT_RANGE */ +struct xfs_commit_range_fresh { + __u64 file2_ino; /* inode number */ + __s64 file2_mtime; /* modification time */ + __s64 file2_ctime; /* change time */ + __s32 file2_mtime_nsec; /* mod time, nsec */ + __s32 file2_ctime_nsec; /* change time, nsec */ + __u64 pad; /* zero */ +}; + +static long +xfs_ioc_start_commit( + struct file *file, + struct xfs_commit_range __user *argp) +{ + struct xfs_exchrange fxr = { + .file2 = file, + }; + struct xfs_commit_range args; + struct xfs_commit_range_fresh *kern_f; + struct xfs_commit_range_fresh __user *user_f; + + BUILD_BUG_ON(sizeof(struct xfs_commit_range_fresh) != + sizeof(args.file2_freshness)); + + xfs_exchrange_freshness(&fxr); + + kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness; + memset(kern_f, 0, sizeof(*kern_f)); + kern_f->file2_ino = fxr.file2_ino; + kern_f->file2_mtime = fxr.file2_mtime.tv_sec; + kern_f->file2_mtime_nsec = fxr.file2_mtime.tv_nsec; + kern_f->file2_ctime = fxr.file2_ctime.tv_sec; + kern_f->file2_ctime_nsec = fxr.file2_ctime.tv_nsec; + + user_f = (struct xfs_commit_range_fresh __user *)&argp->file2_freshness; + if (copy_to_user(user_f, kern_f, sizeof(*kern_f))) + return -EFAULT; + + return 0; +} + +static long +xfs_ioc_commit_range( + struct file *file, + struct xfs_commit_range __user *argp) +{ + struct xfs_exchrange fxr = { + .file2 = file, + }; + struct xfs_commit_range args; + struct xfs_commit_range_fresh *kern_f; + struct fd file1; + int error; + + kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + if (memchr_inv(&kern_f->pad, 0, sizeof(kern_f->pad))) + return -EINVAL; + if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS) + return -EINVAL; + + fxr.file1_offset = args.file1_offset; + fxr.file2_offset = args.file2_offset; + fxr.length = args.length; + fxr.flags = args.flags | __XFS_EXCHANGE_RANGE_CHECK_FRESH2; + fxr.file2_ino = kern_f->file2_ino; + fxr.file2_mtime.tv_sec = kern_f->file2_mtime; + fxr.file2_mtime.tv_nsec = kern_f->file2_mtime_nsec; + fxr.file2_ctime.tv_sec = kern_f->file2_ctime; + fxr.file2_ctime.tv_nsec = kern_f->file2_ctime_nsec; + + file1 = fdget(args.file1_fd); + if (!file1.file) + return -EBADF; + fxr.file1 = file1.file; + + error = xfs_exchange_range(&fxr); + fdput(file1); + return error; +} + /* * These long-unused ioctls were removed from the official ioctl API in 5.17, * but retain these definitions so that we can log warnings about them. @@ -2207,6 +2291,10 @@ xfs_file_ioctl( case XFS_IOC_EXCHANGE_RANGE: return xfs_ioc_exchange_range(filp, arg); + case XFS_IOC_START_COMMIT: + return xfs_ioc_start_commit(filp, arg); + case XFS_IOC_COMMIT_RANGE: + return xfs_ioc_commit_range(filp, arg); default: return -ENOTTY; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index d688b9e5e08a6..39f7784823cbf 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -4835,7 +4835,8 @@ DEFINE_INODE_ERROR_EVENT(xfs_exchrange_error); { XFS_EXCHANGE_RANGE_DRY_RUN, "DRY_RUN" }, \ { XFS_EXCHANGE_RANGE_FILE1_WRITTEN, "F1_WRITTEN" }, \ { __XFS_EXCHANGE_RANGE_UPD_CMTIME1, "CMTIME1" }, \ - { __XFS_EXCHANGE_RANGE_UPD_CMTIME2, "CMTIME2" } + { __XFS_EXCHANGE_RANGE_UPD_CMTIME2, "CMTIME2" }, \ + { __XFS_EXCHANGE_RANGE_CHECK_FRESH2, "FRESH2" } /* file exchange-range tracepoint class */ DECLARE_EVENT_CLASS(xfs_exchrange_class,