On Tue, Sep 03, 2024 at 09:52:43AM +0200, Christian Brauner wrote: > On Mon, Sep 02, 2024 at 11:23:07AM GMT, Darrick J. Wong wrote: > > From: Darrick J. Wong <djwong@xxxxxxxxxx> > > > > This patch introduces two more new ioctls to manage atomic updates to > > file contents -- XFS_IOC_START_COMMIT and XFS_IOC_COMMIT_RANGE. The > > commit mechanism here is exactly the same as what XFS_IOC_EXCHANGE_RANGE > > does, but with the additional requirement that file2 cannot have changed > > since some sampling point. The start-commit ioctl performs the sampling > > of file attributes. > > > > Note: This patch currently samples i_ctime during START_COMMIT and > > checks that it hasn't changed during COMMIT_RANGE. This isn't entirely > > safe in kernels prior to 6.12 because ctime only had coarse grained > > granularity and very fast updates could collide with a COMMIT_RANGE. > > With the multi-granularity ctime introduced by Jeff Layton, it's now > > possible to update ctime such that this does not happen. > > > > It is critical, then, that this patch must not be backported to any > > kernel that does not support fine-grained file change timestamps. > > > > Signed-off-by: Darrick J. Wong <djwong@xxxxxxxxxx> > > Acked-by: Jeff Layton <jlayton@xxxxxxxxxx> > > Reviewed-by: Christoph Hellwig <hch@xxxxxx> > > --- > > fs/xfs/libxfs/xfs_fs.h | 26 +++++++++ > > fs/xfs/xfs_exchrange.c | 143 ++++++++++++++++++++++++++++++++++++++++++++++++ > > fs/xfs/xfs_exchrange.h | 16 +++++ > > fs/xfs/xfs_ioctl.c | 4 + > > fs/xfs/xfs_trace.h | 57 +++++++++++++++++++ > > 5 files changed, 243 insertions(+), 3 deletions(-) > > > > > > diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h > > index 454b63ef7201..c85c8077fac3 100644 > > --- a/fs/xfs/libxfs/xfs_fs.h > > +++ b/fs/xfs/libxfs/xfs_fs.h > > @@ -825,6 +825,30 @@ struct xfs_exchange_range { > > __u64 flags; /* see XFS_EXCHANGE_RANGE_* below */ > > }; > > > > +/* > > + * Using the same definition of file2 as struct xfs_exchange_range, commit the > > + * contents of file1 into file2 if file2 has the same inode number, mtime, and > > + * ctime as the arguments provided to the call. The old contents of file2 will > > + * be moved to file1. > > + * > > + * Returns -EBUSY if there isn't an exact match for the file2 fields. > > + * > > + * Filesystems must be able to restart and complete the operation even after > > + * the system goes down. > > + */ > > +struct xfs_commit_range { > > + __s32 file1_fd; > > + __u32 pad; /* must be zeroes */ > > + __u64 file1_offset; /* file1 offset, bytes */ > > + __u64 file2_offset; /* file2 offset, bytes */ > > + __u64 length; /* bytes to exchange */ > > + > > + __u64 flags; /* see XFS_EXCHANGE_RANGE_* below */ > > + > > + /* opaque file2 metadata for freshness checks */ > > + __u64 file2_freshness[6]; > > +}; > > + > > /* > > * Exchange file data all the way to the ends of both files, and then exchange > > * the file sizes. This flag can be used to replace a file's contents with a > > @@ -997,6 +1021,8 @@ struct xfs_getparents_by_handle { > > #define XFS_IOC_BULKSTAT _IOR ('X', 127, struct xfs_bulkstat_req) > > #define XFS_IOC_INUMBERS _IOR ('X', 128, struct xfs_inumbers_req) > > #define XFS_IOC_EXCHANGE_RANGE _IOW ('X', 129, struct xfs_exchange_range) > > +#define XFS_IOC_START_COMMIT _IOR ('X', 130, struct xfs_commit_range) > > +#define XFS_IOC_COMMIT_RANGE _IOW ('X', 131, struct xfs_commit_range) > > /* XFS_IOC_GETFSUUID ---------- deprecated 140 */ > > > > > > diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c > > index c8a655c92c92..d0889190ab7f 100644 > > --- a/fs/xfs/xfs_exchrange.c > > +++ b/fs/xfs/xfs_exchrange.c > > @@ -72,6 +72,34 @@ xfs_exchrange_estimate( > > return error; > > } > > > > +/* > > + * Check that file2's metadata agree with the snapshot that we took for the > > + * range commit request. > > + * > > + * This should be called after the filesystem has locked /all/ inode metadata > > + * against modification. > > + */ > > +STATIC int > > +xfs_exchrange_check_freshness( > > + const struct xfs_exchrange *fxr, > > + struct xfs_inode *ip2) > > +{ > > + struct inode *inode2 = VFS_I(ip2); > > + struct timespec64 ctime = inode_get_ctime(inode2); > > + struct timespec64 mtime = inode_get_mtime(inode2); > > + > > + trace_xfs_exchrange_freshness(fxr, ip2); > > + > > + /* Check that file2 hasn't otherwise been modified. */ > > + if (fxr->file2_ino != ip2->i_ino || > > + fxr->file2_gen != inode2->i_generation || > > + !timespec64_equal(&fxr->file2_ctime, &ctime) || > > + !timespec64_equal(&fxr->file2_mtime, &mtime)) > > + return -EBUSY; > > + > > + return 0; > > +} > > + > > #define QRETRY_IP1 (0x1) > > #define QRETRY_IP2 (0x2) > > > > @@ -607,6 +635,12 @@ xfs_exchrange_prep( > > if (error || fxr->length == 0) > > return error; > > > > + if (fxr->flags & __XFS_EXCHANGE_RANGE_CHECK_FRESH2) { > > + error = xfs_exchrange_check_freshness(fxr, ip2); > > + if (error) > > + return error; > > + } > > + > > /* Attach dquots to both inodes before changing block maps. */ > > error = xfs_qm_dqattach(ip2); > > if (error) > > @@ -719,7 +753,8 @@ xfs_exchange_range( > > if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt) > > return -EXDEV; > > > > - if (fxr->flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS) > > + if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS | > > + __XFS_EXCHANGE_RANGE_CHECK_FRESH2)) > > return -EINVAL; > > > > /* Userspace requests only honored for regular files. */ > > @@ -802,3 +837,109 @@ xfs_ioc_exchange_range( > > fdput(file1); > > return error; > > } > > + > > +/* Opaque freshness blob for XFS_IOC_COMMIT_RANGE */ > > +struct xfs_commit_range_fresh { > > + xfs_fsid_t fsid; /* m_fixedfsid */ > > + __u64 file2_ino; /* inode number */ > > + __s64 file2_mtime; /* modification time */ > > + __s64 file2_ctime; /* change time */ > > + __s32 file2_mtime_nsec; /* mod time, nsec */ > > + __s32 file2_ctime_nsec; /* change time, nsec */ > > + __u32 file2_gen; /* inode generation */ > > + __u32 magic; /* zero */ > > +}; > > +#define XCR_FRESH_MAGIC 0x444F524B /* DORK */ > > + > > +/* Set up a commitrange operation by sampling file2's write-related attrs */ > > +long > > +xfs_ioc_start_commit( > > + struct file *file, > > + struct xfs_commit_range __user *argp) > > +{ > > + struct xfs_commit_range args = { }; > > + struct timespec64 ts; > > + struct xfs_commit_range_fresh *kern_f; > > + struct xfs_commit_range_fresh __user *user_f; > > + struct inode *inode2 = file_inode(file); > > + struct xfs_inode *ip2 = XFS_I(inode2); > > + const unsigned int lockflags = XFS_IOLOCK_SHARED | > > + XFS_MMAPLOCK_SHARED | > > + XFS_ILOCK_SHARED; > > + > > + BUILD_BUG_ON(sizeof(struct xfs_commit_range_fresh) != > > + sizeof(args.file2_freshness)); > > + > > + kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness; > > + > > + memcpy(&kern_f->fsid, ip2->i_mount->m_fixedfsid, sizeof(xfs_fsid_t)); > > + > > + xfs_ilock(ip2, lockflags); > > + ts = inode_get_ctime(inode2); > > + kern_f->file2_ctime = ts.tv_sec; > > + kern_f->file2_ctime_nsec = ts.tv_nsec; > > + ts = inode_get_mtime(inode2); > > + kern_f->file2_mtime = ts.tv_sec; > > + kern_f->file2_mtime_nsec = ts.tv_nsec; > > + kern_f->file2_ino = ip2->i_ino; > > + kern_f->file2_gen = inode2->i_generation; > > + kern_f->magic = XCR_FRESH_MAGIC; > > + xfs_iunlock(ip2, lockflags); > > + > > + user_f = (struct xfs_commit_range_fresh __user *)&argp->file2_freshness; > > + if (copy_to_user(user_f, kern_f, sizeof(*kern_f))) > > + return -EFAULT; > > + > > + return 0; > > +} > > + > > +/* > > + * Exchange file1 and file2 contents if file2 has not been written since the > > + * start commit operation. > > + */ > > +long > > +xfs_ioc_commit_range( > > + struct file *file, > > + struct xfs_commit_range __user *argp) > > +{ > > + struct xfs_exchrange fxr = { > > + .file2 = file, > > + }; > > + struct xfs_commit_range args; > > + struct xfs_commit_range_fresh *kern_f; > > + struct xfs_inode *ip2 = XFS_I(file_inode(file)); > > + struct xfs_mount *mp = ip2->i_mount; > > + struct fd file1; > > + int error; > > + > > + kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness; > > + > > + if (copy_from_user(&args, argp, sizeof(args))) > > + return -EFAULT; > > + if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS) > > + return -EINVAL; > > + if (kern_f->magic != XCR_FRESH_MAGIC) > > + return -EBUSY; > > + if (memcmp(&kern_f->fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t))) > > + return -EBUSY; > > So, I mentioned this before in another mail a few months ago and I think > you liked the idea so just as a reminder in case you forgot: > > Ioctls are extensible if done correctly: > > switch (__IOC_NR(ioctl)) { > case _IOC_NR(XFS_IOC_START_COMMIT): { > size_t usize = _IOC_SIZE(ioctl); > struct xfs_commit_range args; > > if (usize < XFS_IOC_START_COMMIT_SIZE_VER0) > return -EINVAL; > > if (copy_struct_from_user(&args, sizeof(args), argp, usize)) > return -EFAULT; > } > > If you code it this way, relying on copy_struct_from_user() right from > the start you can easily extend your struct in a backward and forward > compatible manner. I don't know that we'd really need it for commitrange since there's plenty of space (~40 bytes) in the "opaque" freshness blob. I suspect that if we ever add subvolumes to XFS then we might want to take over the 12 bytes used by mtime* for the subvolume id. That said, I also think we could convert to this format pretty easily. Also, copy_struct_from_user can return -E2BIG so I think that needs to be: ret = copy_struct_from_user(&args, sizeof(args), argp, usize); if (ret) return ret; Though the overriding reason for not writing the __IOC_NR dispatch code this way is that every time I've tried extend an xfs ioctl in this manner, Dave says no because (I think) he doesn't trust how the struct size is opaquely encoded in the ioctl number /and/ doesn't trust the BUILD_BUG_ONs I put in the code to guarantee uniqueness so I pick a new number so I can complete the review instead of starting over with a different reviewer who doesn't have that particular preference. > } > > > + > > + fxr.file1_offset = args.file1_offset; > > + fxr.file2_offset = args.file2_offset; > > + fxr.length = args.length; > > + fxr.flags = args.flags | __XFS_EXCHANGE_RANGE_CHECK_FRESH2; > > + fxr.file2_ino = kern_f->file2_ino; > > + fxr.file2_gen = kern_f->file2_gen; > > + fxr.file2_mtime.tv_sec = kern_f->file2_mtime; > > + fxr.file2_mtime.tv_nsec = kern_f->file2_mtime_nsec; > > + fxr.file2_ctime.tv_sec = kern_f->file2_ctime; > > + fxr.file2_ctime.tv_nsec = kern_f->file2_ctime_nsec; > > + > > + file1 = fdget(args.file1_fd); > > + if (!file1.file) > > + return -EBADF; > > Please use CLASS(fd, f)(args.file1_fd) :) Yeah, I saw that the fd cleanups collided with xfs in for-next, thanks for the heads up. --D > > + fxr.file1 = file1.file; > > + > > + error = xfs_exchange_range(&fxr); > > + fdput(file1); > > + return error; > > +} > > diff --git a/fs/xfs/xfs_exchrange.h b/fs/xfs/xfs_exchrange.h > > index 039abcca546e..bc1298aba806 100644 > > --- a/fs/xfs/xfs_exchrange.h > > +++ b/fs/xfs/xfs_exchrange.h > > @@ -10,8 +10,12 @@ > > #define __XFS_EXCHANGE_RANGE_UPD_CMTIME1 (1ULL << 63) > > #define __XFS_EXCHANGE_RANGE_UPD_CMTIME2 (1ULL << 62) > > > > +/* Freshness check required */ > > +#define __XFS_EXCHANGE_RANGE_CHECK_FRESH2 (1ULL << 61) > > + > > #define XFS_EXCHANGE_RANGE_PRIV_FLAGS (__XFS_EXCHANGE_RANGE_UPD_CMTIME1 | \ > > - __XFS_EXCHANGE_RANGE_UPD_CMTIME2) > > + __XFS_EXCHANGE_RANGE_UPD_CMTIME2 | \ > > + __XFS_EXCHANGE_RANGE_CHECK_FRESH2) > > > > struct xfs_exchrange { > > struct file *file1; > > @@ -22,10 +26,20 @@ struct xfs_exchrange { > > u64 length; > > > > u64 flags; /* XFS_EXCHANGE_RANGE flags */ > > + > > + /* file2 metadata for freshness checks */ > > + u64 file2_ino; > > + struct timespec64 file2_mtime; > > + struct timespec64 file2_ctime; > > + u32 file2_gen; > > }; > > > > long xfs_ioc_exchange_range(struct file *file, > > struct xfs_exchange_range __user *argp); > > +long xfs_ioc_start_commit(struct file *file, > > + struct xfs_commit_range __user *argp); > > +long xfs_ioc_commit_range(struct file *file, > > + struct xfs_commit_range __user *argp); > > > > struct xfs_exchmaps_req; > > > > diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c > > index 6b13666d4e96..90b3ee21e7fe 100644 > > --- a/fs/xfs/xfs_ioctl.c > > +++ b/fs/xfs/xfs_ioctl.c > > @@ -1518,6 +1518,10 @@ xfs_file_ioctl( > > > > case XFS_IOC_EXCHANGE_RANGE: > > return xfs_ioc_exchange_range(filp, arg); > > + case XFS_IOC_START_COMMIT: > > + return xfs_ioc_start_commit(filp, arg); > > + case XFS_IOC_COMMIT_RANGE: > > + return xfs_ioc_commit_range(filp, arg); > > > > default: > > return -ENOTTY; > > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h > > index 180ce697305a..4cf0fa71ba9c 100644 > > --- a/fs/xfs/xfs_trace.h > > +++ b/fs/xfs/xfs_trace.h > > @@ -4926,7 +4926,8 @@ DEFINE_INODE_ERROR_EVENT(xfs_exchrange_error); > > { XFS_EXCHANGE_RANGE_DRY_RUN, "DRY_RUN" }, \ > > { XFS_EXCHANGE_RANGE_FILE1_WRITTEN, "F1_WRITTEN" }, \ > > { __XFS_EXCHANGE_RANGE_UPD_CMTIME1, "CMTIME1" }, \ > > - { __XFS_EXCHANGE_RANGE_UPD_CMTIME2, "CMTIME2" } > > + { __XFS_EXCHANGE_RANGE_UPD_CMTIME2, "CMTIME2" }, \ > > + { __XFS_EXCHANGE_RANGE_CHECK_FRESH2, "FRESH2" } > > > > /* file exchange-range tracepoint class */ > > DECLARE_EVENT_CLASS(xfs_exchrange_class, > > @@ -4986,6 +4987,60 @@ DEFINE_EXCHRANGE_EVENT(xfs_exchrange_prep); > > DEFINE_EXCHRANGE_EVENT(xfs_exchrange_flush); > > DEFINE_EXCHRANGE_EVENT(xfs_exchrange_mappings); > > > > +TRACE_EVENT(xfs_exchrange_freshness, > > + TP_PROTO(const struct xfs_exchrange *fxr, struct xfs_inode *ip2), > > + TP_ARGS(fxr, ip2), > > + TP_STRUCT__entry( > > + __field(dev_t, dev) > > + __field(xfs_ino_t, ip2_ino) > > + __field(long long, ip2_mtime) > > + __field(long long, ip2_ctime) > > + __field(int, ip2_mtime_nsec) > > + __field(int, ip2_ctime_nsec) > > + > > + __field(xfs_ino_t, file2_ino) > > + __field(long long, file2_mtime) > > + __field(long long, file2_ctime) > > + __field(int, file2_mtime_nsec) > > + __field(int, file2_ctime_nsec) > > + ), > > + TP_fast_assign( > > + struct timespec64 ts64; > > + struct inode *inode2 = VFS_I(ip2); > > + > > + __entry->dev = inode2->i_sb->s_dev; > > + __entry->ip2_ino = ip2->i_ino; > > + > > + ts64 = inode_get_ctime(inode2); > > + __entry->ip2_ctime = ts64.tv_sec; > > + __entry->ip2_ctime_nsec = ts64.tv_nsec; > > + > > + ts64 = inode_get_mtime(inode2); > > + __entry->ip2_mtime = ts64.tv_sec; > > + __entry->ip2_mtime_nsec = ts64.tv_nsec; > > + > > + __entry->file2_ino = fxr->file2_ino; > > + __entry->file2_mtime = fxr->file2_mtime.tv_sec; > > + __entry->file2_ctime = fxr->file2_ctime.tv_sec; > > + __entry->file2_mtime_nsec = fxr->file2_mtime.tv_nsec; > > + __entry->file2_ctime_nsec = fxr->file2_ctime.tv_nsec; > > + ), > > + TP_printk("dev %d:%d " > > + "ino 0x%llx mtime %lld:%d ctime %lld:%d -> " > > + "file 0x%llx mtime %lld:%d ctime %lld:%d", > > + MAJOR(__entry->dev), MINOR(__entry->dev), > > + __entry->ip2_ino, > > + __entry->ip2_mtime, > > + __entry->ip2_mtime_nsec, > > + __entry->ip2_ctime, > > + __entry->ip2_ctime_nsec, > > + __entry->file2_ino, > > + __entry->file2_mtime, > > + __entry->file2_mtime_nsec, > > + __entry->file2_ctime, > > + __entry->file2_ctime_nsec) > > +); > > + > > TRACE_EVENT(xfs_exchmaps_overhead, > > TP_PROTO(struct xfs_mount *mp, unsigned long long bmbt_blocks, > > unsigned long long rmapbt_blocks), > > >