On Wed, Oct 04, 2023 at 01:58:07PM -0700, Catherine Hoang wrote:
> Clone operations and read IO do not change any data in the source file, so they
> should be able to run concurrently. Demote the exclusive locks taken by FICLONE
> to shared locks to allow reads while cloning. While a clone is in progress,
> writes will take the IOLOCK_EXCL, so they block until the clone completes.
> 
> Signed-off-by: Catherine Hoang <catherine.hoang@xxxxxxxxxx>
> ---
>  fs/xfs/xfs_file.c    | 41 +++++++++++++++++++++++++++++++++++------
>  fs/xfs/xfs_inode.h   |  3 +++
>  fs/xfs/xfs_reflink.c | 31 +++++++++++++++++++++++++++++++
>  fs/xfs/xfs_reflink.h |  2 ++
>  4 files changed, 71 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 203700278ddb..1ec987fcabb9 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -554,6 +554,15 @@ xfs_file_dio_write_aligned(
>  	ret = xfs_ilock_iocb(iocb, iolock);
>  	if (ret)
>  		return ret;
> +
> +	if (xfs_iflags_test(ip, XFS_ICLONING)) {
> +		xfs_iunlock(ip, iolock);
> +		iolock = XFS_IOLOCK_EXCL;
> +		ret = xfs_ilock_iocb(iocb, iolock);
> +		if (ret)
> +			return ret;
> +	}
> +
>  	ret = xfs_file_write_checks(iocb, from, &iolock);
>  	if (ret)
>  		goto out_unlock;
> @@ -563,7 +572,7 @@ xfs_file_dio_write_aligned(
>  	 * the iolock back to shared if we had to take the exclusive lock in
>  	 * xfs_file_write_checks() for other reasons.
>  	 */
> -	if (iolock == XFS_IOLOCK_EXCL) {
> +	if (iolock == XFS_IOLOCK_EXCL && !xfs_iflags_test(ip, XFS_ICLONING)) {
>  		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
>  		iolock = XFS_IOLOCK_SHARED;
>  	}
> @@ -622,6 +631,14 @@ xfs_file_dio_write_unaligned(
>  	if (ret)
>  		return ret;
> 
> +	if (xfs_iflags_test(ip, XFS_ICLONING) && iolock != XFS_IOLOCK_EXCL) {

xfs_iflags_test cycles a spinlock; you ought to put the cheaper condition
(the iolock check) first so that short-circuit evaluation can make this
cheaper.
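Untested, but I'd expect flipping the two halves of the && to be all that
is needed here, something like:

	/*
	 * Do the cheap lock-mode comparison first so that the i_flags
	 * spinlock is only taken when the iolock is still shared and we
	 * might actually have to trade up to the exclusive lock.
	 */
	if (iolock != XFS_IOLOCK_EXCL && xfs_iflags_test(ip, XFS_ICLONING)) {
		xfs_iunlock(ip, iolock);
		iolock = XFS_IOLOCK_EXCL;
		ret = xfs_ilock_iocb(iocb, iolock);
		if (ret)
			return ret;
	}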
> +		xfs_iunlock(ip, iolock);
> +		iolock = XFS_IOLOCK_EXCL;
> +		ret = xfs_ilock_iocb(iocb, iolock);
> +		if (ret)
> +			return ret;
> +	}
> +
>  	/*
>  	 * We can't properly handle unaligned direct I/O to reflink files yet,
>  	 * as we can't unshare a partial block.
> @@ -1180,7 +1197,8 @@ xfs_file_remap_range(
>  	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
>  		xfs_log_force_inode(dest);
>  out_unlock:
> -	xfs_iunlock2_io_mmap(src, dest);
> +	xfs_reflink_unlock(src, dest);
> +	xfs_iflags_clear(src, XFS_ICLONING);

Clear the ICLONING flag before dropping the locks so that a caller trying
to get IOLOCK_EXCL won't wake up until the flag is really gone.

>  	if (ret)
>  		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
>  	return remapped > 0 ? remapped : ret;
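IOWs, I'd expect the tail of xfs_file_remap_range() to end up looking
roughly like this (untested sketch, just the two calls swapped):

out_unlock:
	/*
	 * Drop the in-progress flag while we still hold the locks so that
	 * anyone waking up on IOLOCK_EXCL sees it already cleared.
	 */
	xfs_iflags_clear(src, XFS_ICLONING);
	xfs_reflink_unlock(src, dest);
	if (ret)
		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
	return remapped > 0 ? remapped : ret;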
> @@ -1328,6 +1346,7 @@ __xfs_filemap_fault(
>  	struct inode		*inode = file_inode(vmf->vma->vm_file);
>  	struct xfs_inode	*ip = XFS_I(inode);
>  	vm_fault_t		ret;
> +	uint			mmaplock = XFS_MMAPLOCK_SHARED;
> 
>  	trace_xfs_filemap_fault(ip, order, write_fault);
> 
> @@ -1339,17 +1358,27 @@ __xfs_filemap_fault(
>  	if (IS_DAX(inode)) {
>  		pfn_t pfn;
> 
> -		xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> +		xfs_ilock(XFS_I(inode), mmaplock);
> +		if (xfs_iflags_test(ip, XFS_ICLONING)) {
> +			xfs_iunlock(ip, mmaplock);
> +			mmaplock = XFS_MMAPLOCK_EXCL;
> +			xfs_ilock(XFS_I(inode), mmaplock);
> +		}
>  		ret = xfs_dax_fault(vmf, order, write_fault, &pfn);
>  		if (ret & VM_FAULT_NEEDDSYNC)
>  			ret = dax_finish_sync_fault(vmf, order, pfn);
> -		xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> +		xfs_iunlock(XFS_I(inode), mmaplock);
>  	} else {
>  		if (write_fault) {
> -			xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> +			xfs_ilock(XFS_I(inode), mmaplock);
> +			if (xfs_iflags_test(ip, XFS_ICLONING)) {
> +				xfs_iunlock(ip, mmaplock);
> +				mmaplock = XFS_MMAPLOCK_EXCL;
> +				xfs_ilock(XFS_I(inode), mmaplock);
> +			}
>  			ret = iomap_page_mkwrite(vmf,
>  					&xfs_page_mkwrite_iomap_ops);
> -			xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> +			xfs_iunlock(XFS_I(inode), mmaplock);
>  		} else {
>  			ret = filemap_fault(vmf);
>  		}
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index 0c5bdb91152e..e3ff059c69f7 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -347,6 +347,9 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
>  /* Quotacheck is running but inode has not been added to quota counts. */
>  #define XFS_IQUOTAUNCHECKED	(1 << 14)
> 
> +/* Clone in progress, do not allow modifications. */
> +#define XFS_ICLONING		(1 << 15)

Somewhere we need to capture the locking behavior around this flag.

/*
 * Remap in progress. Callers that wish to update file data while
 * holding a shared IOLOCK or MMAPLOCK must drop the lock and retake
 * the lock in exclusive mode. Relocking the file will block until
 * ICLONING is cleared.
 */
#define XFS_IREMAPPING		(1U << 15)

On second thought, perhaps this should be called XFS_IREMAPPING?  Since
these semantics apply to dedupe in addition to clone.

> +
>  /* All inode state flags related to inode reclaim. */
>  #define XFS_ALL_IRECLAIM_FLAGS	(XFS_IRECLAIMABLE | \
>  				 XFS_IRECLAIM | \
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index eb9102453aff..645cc196ee13 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -1540,6 +1540,10 @@ xfs_reflink_remap_prep(
>  	if (ret)
>  		goto out_unlock;
> 
> +	xfs_iflags_set(src, XFS_ICLONING);
> +	if (inode_in != inode_out)
> +		xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
> +
>  	return 0;
>  out_unlock:
>  	xfs_iunlock2_io_mmap(src, dest);
> @@ -1718,3 +1722,30 @@ xfs_reflink_unshare(
>  	trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
>  	return error;
>  }
> +
> +/* Unlock both inodes after the reflink completes. */
> +void
> +xfs_reflink_unlock(
> +	struct xfs_inode	*ip1,
> +	struct xfs_inode	*ip2)
> +{
> +	struct address_space	*mapping1;
> +	struct address_space	*mapping2;
> +
> +	if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) {
> +		if (ip1 != ip2)
> +			xfs_iunlock(ip1, XFS_MMAPLOCK_SHARED);
> +		xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
> +	} else {
> +		mapping1 = VFS_I(ip1)->i_mapping;
> +		mapping2 = VFS_I(ip2)->i_mapping;
> +		if (mapping1 && mapping1 != mapping2)
> +			up_read(&mapping1->invalidate_lock);
> +		if (mapping2)
> +			up_write(&mapping2->invalidate_lock);

XFS_MMAPLOCK is the same thing as mapping->invalidate_lock; won't the
first version work for the !DAX case too?

--D

> +	}
> +
> +	if (ip1 != ip2)
> +		inode_unlock_shared(VFS_I(ip1));
> +	inode_unlock(VFS_I(ip2));
> +}
> diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> index 65c5dfe17ecf..89f4d2a2f52e 100644
> --- a/fs/xfs/xfs_reflink.h
> +++ b/fs/xfs/xfs_reflink.h
> @@ -53,4 +53,6 @@ extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in,
>  extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen,
>  		xfs_extlen_t cowextsize, unsigned int remap_flags);
> 
> +void xfs_reflink_unlock(struct xfs_inode *ip1, struct xfs_inode *ip2);
> +
>  #endif /* __XFS_REFLINK_H */
> -- 
> 2.34.1
> 
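To be explicit about the !DAX question above: assuming the MMAPLOCK really
is just mapping->invalidate_lock under the covers, I'd expect the whole
helper to reduce to something like this (untested):

void
xfs_reflink_unlock(
	struct xfs_inode	*ip1,
	struct xfs_inode	*ip2)
{
	/*
	 * xfs_iunlock() of the MMAPLOCK drops the inode's
	 * mapping->invalidate_lock, so the same calls should cover both
	 * the DAX and !DAX cases.
	 */
	if (ip1 != ip2)
		xfs_iunlock(ip1, XFS_MMAPLOCK_SHARED);
	xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);

	if (ip1 != ip2)
		inode_unlock_shared(VFS_I(ip1));
	inode_unlock(VFS_I(ip2));
}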