Re: [PATCH 1/1] xfs: introduce new file range commit ioctls

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, Sep 02, 2024 at 11:23:07AM GMT, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@xxxxxxxxxx>
> 
> This patch introduces two more new ioctls to manage atomic updates to
> file contents -- XFS_IOC_START_COMMIT and XFS_IOC_COMMIT_RANGE.  The
> commit mechanism here is exactly the same as what XFS_IOC_EXCHANGE_RANGE
> does, but with the additional requirement that file2 cannot have changed
> since some sampling point.  The start-commit ioctl performs the sampling
> of file attributes.
> 
> Note: This patch currently samples i_ctime during START_COMMIT and
> checks that it hasn't changed during COMMIT_RANGE.  This isn't entirely
> safe in kernels prior to 6.12 because ctime only had coarse grained
> granularity and very fast updates could collide with a COMMIT_RANGE.
> With the multi-granularity ctime introduced by Jeff Layton, it's now
> possible to update ctime such that this does not happen.
> 
> It is critical, then, that this patch must not be backported to any
> kernel that does not support fine-grained file change timestamps.
> 
> Signed-off-by: Darrick J. Wong <djwong@xxxxxxxxxx>
> Acked-by: Jeff Layton <jlayton@xxxxxxxxxx>
> Reviewed-by: Christoph Hellwig <hch@xxxxxx>
> ---
>  fs/xfs/libxfs/xfs_fs.h |   26 +++++++++
>  fs/xfs/xfs_exchrange.c |  143 ++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_exchrange.h |   16 +++++
>  fs/xfs/xfs_ioctl.c     |    4 +
>  fs/xfs/xfs_trace.h     |   57 +++++++++++++++++++
>  5 files changed, 243 insertions(+), 3 deletions(-)
> 
> 
> diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
> index 454b63ef7201..c85c8077fac3 100644
> --- a/fs/xfs/libxfs/xfs_fs.h
> +++ b/fs/xfs/libxfs/xfs_fs.h
> @@ -825,6 +825,30 @@ struct xfs_exchange_range {
>  	__u64		flags;		/* see XFS_EXCHANGE_RANGE_* below */
>  };
>  
> +/*
> + * Using the same definition of file2 as struct xfs_exchange_range, commit the
> + * contents of file1 into file2 if file2 has the same inode number, mtime, and
> + * ctime as the arguments provided to the call.  The old contents of file2 will
> + * be moved to file1.
> + *
> + * Returns -EBUSY if there isn't an exact match for the file2 fields.
> + *
> + * Filesystems must be able to restart and complete the operation even after
> + * the system goes down.
> + */
> +struct xfs_commit_range {
> +	__s32		file1_fd;
> +	__u32		pad;		/* must be zeroes */
> +	__u64		file1_offset;	/* file1 offset, bytes */
> +	__u64		file2_offset;	/* file2 offset, bytes */
> +	__u64		length;		/* bytes to exchange */
> +
> +	__u64		flags;		/* see XFS_EXCHANGE_RANGE_* below */
> +
> +	/* opaque file2 metadata for freshness checks */
> +	__u64		file2_freshness[6];
> +};
> +
>  /*
>   * Exchange file data all the way to the ends of both files, and then exchange
>   * the file sizes.  This flag can be used to replace a file's contents with a
> @@ -997,6 +1021,8 @@ struct xfs_getparents_by_handle {
>  #define XFS_IOC_BULKSTAT	     _IOR ('X', 127, struct xfs_bulkstat_req)
>  #define XFS_IOC_INUMBERS	     _IOR ('X', 128, struct xfs_inumbers_req)
>  #define XFS_IOC_EXCHANGE_RANGE	     _IOW ('X', 129, struct xfs_exchange_range)
> +#define XFS_IOC_START_COMMIT	     _IOR ('X', 130, struct xfs_commit_range)
> +#define XFS_IOC_COMMIT_RANGE	     _IOW ('X', 131, struct xfs_commit_range)
>  /*	XFS_IOC_GETFSUUID ---------- deprecated 140	 */
>  
>  
> diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c
> index c8a655c92c92..d0889190ab7f 100644
> --- a/fs/xfs/xfs_exchrange.c
> +++ b/fs/xfs/xfs_exchrange.c
> @@ -72,6 +72,34 @@ xfs_exchrange_estimate(
>  	return error;
>  }
>  
> +/*
> + * Check that file2's metadata agree with the snapshot that we took for the
> + * range commit request.
> + *
> + * This should be called after the filesystem has locked /all/ inode metadata
> + * against modification.
> + */
> +STATIC int
> +xfs_exchrange_check_freshness(
> +	const struct xfs_exchrange	*fxr,
> +	struct xfs_inode		*ip2)
> +{
> +	struct inode			*inode2 = VFS_I(ip2);
> +	struct timespec64		ctime = inode_get_ctime(inode2);
> +	struct timespec64		mtime = inode_get_mtime(inode2);
> +
> +	trace_xfs_exchrange_freshness(fxr, ip2);
> +
> +	/* Check that file2 hasn't otherwise been modified. */
> +	if (fxr->file2_ino != ip2->i_ino ||
> +	    fxr->file2_gen != inode2->i_generation ||
> +	    !timespec64_equal(&fxr->file2_ctime, &ctime) ||
> +	    !timespec64_equal(&fxr->file2_mtime, &mtime))
> +		return -EBUSY;
> +
> +	return 0;
> +}
> +
>  #define QRETRY_IP1	(0x1)
>  #define QRETRY_IP2	(0x2)
>  
> @@ -607,6 +635,12 @@ xfs_exchrange_prep(
>  	if (error || fxr->length == 0)
>  		return error;
>  
> +	if (fxr->flags & __XFS_EXCHANGE_RANGE_CHECK_FRESH2) {
> +		error = xfs_exchrange_check_freshness(fxr, ip2);
> +		if (error)
> +			return error;
> +	}
> +
>  	/* Attach dquots to both inodes before changing block maps. */
>  	error = xfs_qm_dqattach(ip2);
>  	if (error)
> @@ -719,7 +753,8 @@ xfs_exchange_range(
>  	if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt)
>  		return -EXDEV;
>  
> -	if (fxr->flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
> +	if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
> +			 __XFS_EXCHANGE_RANGE_CHECK_FRESH2))
>  		return -EINVAL;
>  
>  	/* Userspace requests only honored for regular files. */
> @@ -802,3 +837,109 @@ xfs_ioc_exchange_range(
>  	fdput(file1);
>  	return error;
>  }
> +
> +/* Opaque freshness blob for XFS_IOC_COMMIT_RANGE */
> +struct xfs_commit_range_fresh {
> +	xfs_fsid_t	fsid;		/* m_fixedfsid */
> +	__u64		file2_ino;	/* inode number */
> +	__s64		file2_mtime;	/* modification time */
> +	__s64		file2_ctime;	/* change time */
> +	__s32		file2_mtime_nsec; /* mod time, nsec */
> +	__s32		file2_ctime_nsec; /* change time, nsec */
> +	__u32		file2_gen;	/* inode generation */
> +	__u32		magic;		/* zero */
> +};
> +#define XCR_FRESH_MAGIC	0x444F524B	/* DORK */
> +
> +/* Set up a commitrange operation by sampling file2's write-related attrs */
> +long
> +xfs_ioc_start_commit(
> +	struct file			*file,
> +	struct xfs_commit_range __user	*argp)
> +{
> +	struct xfs_commit_range		args = { };
> +	struct timespec64		ts;
> +	struct xfs_commit_range_fresh	*kern_f;
> +	struct xfs_commit_range_fresh	__user *user_f;
> +	struct inode			*inode2 = file_inode(file);
> +	struct xfs_inode		*ip2 = XFS_I(inode2);
> +	const unsigned int		lockflags = XFS_IOLOCK_SHARED |
> +						    XFS_MMAPLOCK_SHARED |
> +						    XFS_ILOCK_SHARED;
> +
> +	BUILD_BUG_ON(sizeof(struct xfs_commit_range_fresh) !=
> +		     sizeof(args.file2_freshness));
> +
> +	kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
> +
> +	memcpy(&kern_f->fsid, ip2->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
> +
> +	xfs_ilock(ip2, lockflags);
> +	ts = inode_get_ctime(inode2);
> +	kern_f->file2_ctime		= ts.tv_sec;
> +	kern_f->file2_ctime_nsec	= ts.tv_nsec;
> +	ts = inode_get_mtime(inode2);
> +	kern_f->file2_mtime		= ts.tv_sec;
> +	kern_f->file2_mtime_nsec	= ts.tv_nsec;
> +	kern_f->file2_ino		= ip2->i_ino;
> +	kern_f->file2_gen		= inode2->i_generation;
> +	kern_f->magic			= XCR_FRESH_MAGIC;
> +	xfs_iunlock(ip2, lockflags);
> +
> +	user_f = (struct xfs_commit_range_fresh __user *)&argp->file2_freshness;
> +	if (copy_to_user(user_f, kern_f, sizeof(*kern_f)))
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
> +/*
> + * Exchange file1 and file2 contents if file2 has not been written since the
> + * start commit operation.
> + */
> +long
> +xfs_ioc_commit_range(
> +	struct file			*file,
> +	struct xfs_commit_range __user	*argp)
> +{
> +	struct xfs_exchrange		fxr = {
> +		.file2			= file,
> +	};
> +	struct xfs_commit_range		args;
> +	struct xfs_commit_range_fresh	*kern_f;
> +	struct xfs_inode		*ip2 = XFS_I(file_inode(file));
> +	struct xfs_mount		*mp = ip2->i_mount;
> +	struct fd			file1;
> +	int				error;
> +
> +	kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
> +
> +	if (copy_from_user(&args, argp, sizeof(args)))
> +		return -EFAULT;
> +	if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
> +		return -EINVAL;
> +	if (kern_f->magic != XCR_FRESH_MAGIC)
> +		return -EBUSY;
> +	if (memcmp(&kern_f->fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)))
> +		return -EBUSY;

So, I mentioned this before in another mail a few months ago and I think
you liked the idea so just as a reminder in case you forgot:

Ioctls are extensible if done correctly:

switch (__IOC_NR(ioctl)) {
	case _IOC_NR(XFS_IOC_START_COMMIT): {
	        size_t usize = _IOC_SIZE(ioctl);
		struct xfs_commit_range	args;

		if (usize < XFS_IOC_START_COMMIT_SIZE_VER0)
                        return -EINVAL;

		if (copy_struct_from_user(&args, sizeof(args), argp, usize))
			return -EFAULT;
	}

If you code it this way, relying on copy_struct_from_user() right from
the start you can easily extend your struct in a backward and forward
compatible manner.

}

> +
> +	fxr.file1_offset	= args.file1_offset;
> +	fxr.file2_offset	= args.file2_offset;
> +	fxr.length		= args.length;
> +	fxr.flags		= args.flags | __XFS_EXCHANGE_RANGE_CHECK_FRESH2;
> +	fxr.file2_ino		= kern_f->file2_ino;
> +	fxr.file2_gen		= kern_f->file2_gen;
> +	fxr.file2_mtime.tv_sec	= kern_f->file2_mtime;
> +	fxr.file2_mtime.tv_nsec	= kern_f->file2_mtime_nsec;
> +	fxr.file2_ctime.tv_sec	= kern_f->file2_ctime;
> +	fxr.file2_ctime.tv_nsec	= kern_f->file2_ctime_nsec;
> +
> +	file1 = fdget(args.file1_fd);
> +	if (!file1.file)
> +		return -EBADF;

Please use CLASS(fd, f)(args.file1_fd) :)

> +	fxr.file1 = file1.file;
> +
> +	error = xfs_exchange_range(&fxr);
> +	fdput(file1);
> +	return error;
> +}
> diff --git a/fs/xfs/xfs_exchrange.h b/fs/xfs/xfs_exchrange.h
> index 039abcca546e..bc1298aba806 100644
> --- a/fs/xfs/xfs_exchrange.h
> +++ b/fs/xfs/xfs_exchrange.h
> @@ -10,8 +10,12 @@
>  #define __XFS_EXCHANGE_RANGE_UPD_CMTIME1	(1ULL << 63)
>  #define __XFS_EXCHANGE_RANGE_UPD_CMTIME2	(1ULL << 62)
>  
> +/* Freshness check required */
> +#define __XFS_EXCHANGE_RANGE_CHECK_FRESH2	(1ULL << 61)
> +
>  #define XFS_EXCHANGE_RANGE_PRIV_FLAGS	(__XFS_EXCHANGE_RANGE_UPD_CMTIME1 | \
> -					 __XFS_EXCHANGE_RANGE_UPD_CMTIME2)
> +					 __XFS_EXCHANGE_RANGE_UPD_CMTIME2 | \
> +					 __XFS_EXCHANGE_RANGE_CHECK_FRESH2)
>  
>  struct xfs_exchrange {
>  	struct file		*file1;
> @@ -22,10 +26,20 @@ struct xfs_exchrange {
>  	u64			length;
>  
>  	u64			flags;	/* XFS_EXCHANGE_RANGE flags */
> +
> +	/* file2 metadata for freshness checks */
> +	u64			file2_ino;
> +	struct timespec64	file2_mtime;
> +	struct timespec64	file2_ctime;
> +	u32			file2_gen;
>  };
>  
>  long xfs_ioc_exchange_range(struct file *file,
>  		struct xfs_exchange_range __user *argp);
> +long xfs_ioc_start_commit(struct file *file,
> +		struct xfs_commit_range __user *argp);
> +long xfs_ioc_commit_range(struct file *file,
> +		struct xfs_commit_range __user	*argp);
>  
>  struct xfs_exchmaps_req;
>  
> diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> index 6b13666d4e96..90b3ee21e7fe 100644
> --- a/fs/xfs/xfs_ioctl.c
> +++ b/fs/xfs/xfs_ioctl.c
> @@ -1518,6 +1518,10 @@ xfs_file_ioctl(
>  
>  	case XFS_IOC_EXCHANGE_RANGE:
>  		return xfs_ioc_exchange_range(filp, arg);
> +	case XFS_IOC_START_COMMIT:
> +		return xfs_ioc_start_commit(filp, arg);
> +	case XFS_IOC_COMMIT_RANGE:
> +		return xfs_ioc_commit_range(filp, arg);
>  
>  	default:
>  		return -ENOTTY;
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index 180ce697305a..4cf0fa71ba9c 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -4926,7 +4926,8 @@ DEFINE_INODE_ERROR_EVENT(xfs_exchrange_error);
>  	{ XFS_EXCHANGE_RANGE_DRY_RUN,		"DRY_RUN" }, \
>  	{ XFS_EXCHANGE_RANGE_FILE1_WRITTEN,	"F1_WRITTEN" }, \
>  	{ __XFS_EXCHANGE_RANGE_UPD_CMTIME1,	"CMTIME1" }, \
> -	{ __XFS_EXCHANGE_RANGE_UPD_CMTIME2,	"CMTIME2" }
> +	{ __XFS_EXCHANGE_RANGE_UPD_CMTIME2,	"CMTIME2" }, \
> +	{ __XFS_EXCHANGE_RANGE_CHECK_FRESH2,	"FRESH2" }
>  
>  /* file exchange-range tracepoint class */
>  DECLARE_EVENT_CLASS(xfs_exchrange_class,
> @@ -4986,6 +4987,60 @@ DEFINE_EXCHRANGE_EVENT(xfs_exchrange_prep);
>  DEFINE_EXCHRANGE_EVENT(xfs_exchrange_flush);
>  DEFINE_EXCHRANGE_EVENT(xfs_exchrange_mappings);
>  
> +TRACE_EVENT(xfs_exchrange_freshness,
> +	TP_PROTO(const struct xfs_exchrange *fxr, struct xfs_inode *ip2),
> +	TP_ARGS(fxr, ip2),
> +	TP_STRUCT__entry(
> +		__field(dev_t, dev)
> +		__field(xfs_ino_t, ip2_ino)
> +		__field(long long, ip2_mtime)
> +		__field(long long, ip2_ctime)
> +		__field(int, ip2_mtime_nsec)
> +		__field(int, ip2_ctime_nsec)
> +
> +		__field(xfs_ino_t, file2_ino)
> +		__field(long long, file2_mtime)
> +		__field(long long, file2_ctime)
> +		__field(int, file2_mtime_nsec)
> +		__field(int, file2_ctime_nsec)
> +	),
> +	TP_fast_assign(
> +		struct timespec64	ts64;
> +		struct inode		*inode2 = VFS_I(ip2);
> +
> +		__entry->dev = inode2->i_sb->s_dev;
> +		__entry->ip2_ino = ip2->i_ino;
> +
> +		ts64 = inode_get_ctime(inode2);
> +		__entry->ip2_ctime = ts64.tv_sec;
> +		__entry->ip2_ctime_nsec = ts64.tv_nsec;
> +
> +		ts64 = inode_get_mtime(inode2);
> +		__entry->ip2_mtime = ts64.tv_sec;
> +		__entry->ip2_mtime_nsec = ts64.tv_nsec;
> +
> +		__entry->file2_ino = fxr->file2_ino;
> +		__entry->file2_mtime = fxr->file2_mtime.tv_sec;
> +		__entry->file2_ctime = fxr->file2_ctime.tv_sec;
> +		__entry->file2_mtime_nsec = fxr->file2_mtime.tv_nsec;
> +		__entry->file2_ctime_nsec = fxr->file2_ctime.tv_nsec;
> +	),
> +	TP_printk("dev %d:%d "
> +		  "ino 0x%llx mtime %lld:%d ctime %lld:%d -> "
> +		  "file 0x%llx mtime %lld:%d ctime %lld:%d",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __entry->ip2_ino,
> +		  __entry->ip2_mtime,
> +		  __entry->ip2_mtime_nsec,
> +		  __entry->ip2_ctime,
> +		  __entry->ip2_ctime_nsec,
> +		  __entry->file2_ino,
> +		  __entry->file2_mtime,
> +		  __entry->file2_mtime_nsec,
> +		  __entry->file2_ctime,
> +		  __entry->file2_ctime_nsec)
> +);
> +
>  TRACE_EVENT(xfs_exchmaps_overhead,
>  	TP_PROTO(struct xfs_mount *mp, unsigned long long bmbt_blocks,
>  		 unsigned long long rmapbt_blocks),
> 




[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [NTFS 3]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [NTFS 3]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux