On Wed, Oct 11, 2023 at 06:08:45PM -0700, Catherine Hoang wrote: > Clone operations and read IO do not change any data in the source file, so they > should be able to run concurrently. Demote the exclusive locks taken by FICLONE > to shared locks to allow reads while cloning. While a clone is in progress, > writes will take the IOLOCK_EXCL, so they block until the clone completes. > > Signed-off-by: Catherine Hoang <catherine.hoang@xxxxxxxxxx> > --- > fs/xfs/xfs_file.c | 68 +++++++++++++++++++++++++++++++++++--------- > fs/xfs/xfs_inode.c | 17 +++++++++++ > fs/xfs/xfs_inode.h | 9 ++++++ > fs/xfs/xfs_reflink.c | 4 +++ > 4 files changed, 85 insertions(+), 13 deletions(-) > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > index 203700278ddb..425507b0d1cb 100644 > --- a/fs/xfs/xfs_file.c > +++ b/fs/xfs/xfs_file.c > @@ -214,6 +214,48 @@ xfs_ilock_iocb( > return 0; > } > > +static int > +xfs_ilock_iocb_for_write( > + struct kiocb *iocb, > + unsigned int *lock_mode) > +{ > + ssize_t ret; > + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); > + > + do { > + ret = xfs_ilock_iocb(iocb, *lock_mode); > + if (ret) > + return ret; > + if (*lock_mode == XFS_IOLOCK_EXCL) > + return 0; > + if (!xfs_iflags_test(ip, XFS_IREMAPPING)) > + return 0; > + xfs_iunlock(ip, *lock_mode); > + *lock_mode = XFS_IOLOCK_EXCL; > + } while (1); > + /* notreached */ Might want to change that to an ASSERT(0) so we'll notice if this ever breaks. > + return -EAGAIN; > +} > + > +static int "unsigned int", please be consistent with the other inode lock flag usage around the codebase. > +xfs_ilock_for_write_fault( > + struct xfs_inode *ip) > +{ > + int lock_mode = XFS_MMAPLOCK_SHARED; > + > + do { > + xfs_ilock(ip, lock_mode); > + if (!xfs_iflags_test(ip, XFS_IREMAPPING)) > + return lock_mode; > + if (lock_mode == XFS_MMAPLOCK_EXCL) > + return lock_mode; > + xfs_iunlock(ip, lock_mode); > + lock_mode = XFS_MMAPLOCK_EXCL; > + } while (1); > + /* notreached */ > + return 0; I wonder if this would be easier to understand with the loop unrolled? /* get a shared lock if no remapping in progress */ xfs_ilock(ip, XFS_MMAPLOCK_SHARED); if (!xfs_iflags_test(ip, XFS_IREMAPPING)) return XFS_MMAPLOCK_SHARED; /* wait for remapping to complete */ xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); xfs_ilock(ip, XFS_MMAPLOCK_EXCL); return XFS_MMAPLOCK_EXCL; It eliminates the awkward /* notreached */ construction. Thoughts? > +} > + > STATIC ssize_t > xfs_file_dio_read( > struct kiocb *iocb, > @@ -551,7 +593,7 @@ xfs_file_dio_write_aligned( > unsigned int iolock = XFS_IOLOCK_SHARED; > ssize_t ret; > > - ret = xfs_ilock_iocb(iocb, iolock); > + ret = xfs_ilock_iocb_for_write(iocb, &iolock); > if (ret) > return ret; > ret = xfs_file_write_checks(iocb, from, &iolock); > @@ -618,7 +660,7 @@ xfs_file_dio_write_unaligned( > flags = IOMAP_DIO_FORCE_WAIT; > } > > - ret = xfs_ilock_iocb(iocb, iolock); > + ret = xfs_ilock_iocb_for_write(iocb, &iolock); > if (ret) > return ret; > > @@ -1180,7 +1222,7 @@ xfs_file_remap_range( > if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out)) > xfs_log_force_inode(dest); > out_unlock: > - xfs_iunlock2_io_mmap(src, dest); > + xfs_iunlock2_remapping(src, dest); > if (ret) > trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); > return remapped > 0 ? remapped : ret; > @@ -1328,6 +1370,7 @@ __xfs_filemap_fault( > struct inode *inode = file_inode(vmf->vma->vm_file); > struct xfs_inode *ip = XFS_I(inode); > vm_fault_t ret; > + int lock_mode = 0; unsigned int --D > > trace_xfs_filemap_fault(ip, order, write_fault); > > @@ -1336,25 +1379,24 @@ __xfs_filemap_fault( > file_update_time(vmf->vma->vm_file); > } > > + if (IS_DAX(inode) || write_fault) > + lock_mode = xfs_ilock_for_write_fault(XFS_I(inode)); > + > if (IS_DAX(inode)) { > pfn_t pfn; > > - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > ret = xfs_dax_fault(vmf, order, write_fault, &pfn); > if (ret & VM_FAULT_NEEDDSYNC) > ret = dax_finish_sync_fault(vmf, order, pfn); > - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > + } else if (write_fault) { > + ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops); > } else { > - if (write_fault) { > - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > - ret = iomap_page_mkwrite(vmf, > - &xfs_page_mkwrite_iomap_ops); > - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > - } else { > - ret = filemap_fault(vmf); > - } > + ret = filemap_fault(vmf); > } > > + if (lock_mode) > + xfs_iunlock(XFS_I(inode), lock_mode); > + > if (write_fault) > sb_end_pagefault(inode->i_sb); > return ret; > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c > index 4d55f58d99b7..97b0078249fd 100644 > --- a/fs/xfs/xfs_inode.c > +++ b/fs/xfs/xfs_inode.c > @@ -3621,6 +3621,23 @@ xfs_iunlock2_io_mmap( > inode_unlock(VFS_I(ip1)); > } > > +/* Drop the MMAPLOCK and the IOLOCK after a remap completes. */ > +void > +xfs_iunlock2_remapping( > + struct xfs_inode *ip1, > + struct xfs_inode *ip2) > +{ > + xfs_iflags_clear(ip1, XFS_IREMAPPING); > + > + if (ip1 != ip2) > + xfs_iunlock(ip1, XFS_MMAPLOCK_SHARED); > + xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); > + > + if (ip1 != ip2) > + inode_unlock_shared(VFS_I(ip1)); > + inode_unlock(VFS_I(ip2)); > +} > + > /* > * Reload the incore inode list for this inode. Caller should ensure that > * the link count cannot change, either by taking ILOCK_SHARED or otherwise > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h > index 0c5bdb91152e..3dc47937da5d 100644 > --- a/fs/xfs/xfs_inode.h > +++ b/fs/xfs/xfs_inode.h > @@ -347,6 +347,14 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) > /* Quotacheck is running but inode has not been added to quota counts. */ > #define XFS_IQUOTAUNCHECKED (1 << 14) > > +/* > + * Remap in progress. Callers that wish to update file data while > + * holding a shared IOLOCK or MMAPLOCK must drop the lock and retake > + * the lock in exclusive mode. Relocking the file will block until > + * IREMAPPING is cleared. > + */ > +#define XFS_IREMAPPING (1U << 15) > + > /* All inode state flags related to inode reclaim. */ > #define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \ > XFS_IRECLAIM | \ > @@ -595,6 +603,7 @@ void xfs_end_io(struct work_struct *work); > > int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); > void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); > +void xfs_iunlock2_remapping(struct xfs_inode *ip1, struct xfs_inode *ip2); > > static inline bool > xfs_inode_unlinked_incomplete( > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c > index eb9102453aff..658edee8381d 100644 > --- a/fs/xfs/xfs_reflink.c > +++ b/fs/xfs/xfs_reflink.c > @@ -1540,6 +1540,10 @@ xfs_reflink_remap_prep( > if (ret) > goto out_unlock; > > + xfs_iflags_set(src, XFS_IREMAPPING); > + if (inode_in != inode_out) > + xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); > + > return 0; > out_unlock: > xfs_iunlock2_io_mmap(src, dest); > -- > 2.34.1 >