On Tue, Oct 17, 2023 at 01:12:08PM -0700, Catherine Hoang wrote: > One of our VM cluster management products needs to snapshot KVM image > files so that they can be restored in case of failure. Snapshotting is > done by redirecting VM disk writes to a sidecar file and using reflink > on the disk image, specifically the FICLONE ioctl as used by > "cp --reflink". Reflink locks the source and destination files while it > operates, which means that reads from the main VM disk image are blocked, > causing the VM to stall. When an image file is heavily fragmented, the > copy process could take several minutes. Some of the VM image files have > 50-100 million extent records, and duplicating that much metadata locks > the file for 30 minutes or more. Having activities suspended for such > a long time in a cluster node could result in node eviction. > > Clone operations and read IO do not change any data in the source file, > so they should be able to run concurrently. Demote the exclusive locks > taken by FICLONE to shared locks to allow reads while cloning. While a > clone is in progress, writes will take the IOLOCK_EXCL, so they block > until the clone completes. > > Link: https://lore.kernel.org/linux-xfs/8911B94D-DD29-4D6E-B5BC-32EAF1866245@xxxxxxxxxx/ > Signed-off-by: Catherine Hoang <catherine.hoang@xxxxxxxxxx> Looks good to me, Reviewed-by: Darrick J. 
Wong <djwong@xxxxxxxxxx> --D > --- > fs/xfs/xfs_file.c | 67 +++++++++++++++++++++++++++++++++++--------- > fs/xfs/xfs_inode.c | 17 +++++++++++ > fs/xfs/xfs_inode.h | 9 ++++++ > fs/xfs/xfs_reflink.c | 4 +++ > 4 files changed, 84 insertions(+), 13 deletions(-) > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > index 203700278ddb..3b9500e18f90 100644 > --- a/fs/xfs/xfs_file.c > +++ b/fs/xfs/xfs_file.c > @@ -214,6 +214,47 @@ xfs_ilock_iocb( > return 0; > } > > +static int > +xfs_ilock_iocb_for_write( > + struct kiocb *iocb, > + unsigned int *lock_mode) > +{ > + ssize_t ret; > + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); > + > + ret = xfs_ilock_iocb(iocb, *lock_mode); > + if (ret) > + return ret; > + > + if (*lock_mode == XFS_IOLOCK_EXCL) > + return 0; > + if (!xfs_iflags_test(ip, XFS_IREMAPPING)) > + return 0; > + > + xfs_iunlock(ip, *lock_mode); > + *lock_mode = XFS_IOLOCK_EXCL; > + ret = xfs_ilock_iocb(iocb, *lock_mode); > + if (ret) > + return ret; > + > + return 0; > +} > + > +static unsigned int > +xfs_ilock_for_write_fault( > + struct xfs_inode *ip) > +{ > + /* get a shared lock if no remapping in progress */ > + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); > + if (!xfs_iflags_test(ip, XFS_IREMAPPING)) > + return XFS_MMAPLOCK_SHARED; > + > + /* wait for remapping to complete */ > + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); > + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); > + return XFS_MMAPLOCK_EXCL; > +} > + > STATIC ssize_t > xfs_file_dio_read( > struct kiocb *iocb, > @@ -551,7 +592,7 @@ xfs_file_dio_write_aligned( > unsigned int iolock = XFS_IOLOCK_SHARED; > ssize_t ret; > > - ret = xfs_ilock_iocb(iocb, iolock); > + ret = xfs_ilock_iocb_for_write(iocb, &iolock); > if (ret) > return ret; > ret = xfs_file_write_checks(iocb, from, &iolock); > @@ -618,7 +659,7 @@ xfs_file_dio_write_unaligned( > flags = IOMAP_DIO_FORCE_WAIT; > } > > - ret = xfs_ilock_iocb(iocb, iolock); > + ret = xfs_ilock_iocb_for_write(iocb, &iolock); > if (ret) > return ret; > > @@ -1180,7 +1221,7 
@@ xfs_file_remap_range( > if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out)) > xfs_log_force_inode(dest); > out_unlock: > - xfs_iunlock2_io_mmap(src, dest); > + xfs_iunlock2_remapping(src, dest); > if (ret) > trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); > return remapped > 0 ? remapped : ret; > @@ -1328,6 +1369,7 @@ __xfs_filemap_fault( > struct inode *inode = file_inode(vmf->vma->vm_file); > struct xfs_inode *ip = XFS_I(inode); > vm_fault_t ret; > + unsigned int lock_mode = 0; > > trace_xfs_filemap_fault(ip, order, write_fault); > > @@ -1336,25 +1378,24 @@ __xfs_filemap_fault( > file_update_time(vmf->vma->vm_file); > } > > + if (IS_DAX(inode) || write_fault) > + lock_mode = xfs_ilock_for_write_fault(XFS_I(inode)); > + > if (IS_DAX(inode)) { > pfn_t pfn; > > - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > ret = xfs_dax_fault(vmf, order, write_fault, &pfn); > if (ret & VM_FAULT_NEEDDSYNC) > ret = dax_finish_sync_fault(vmf, order, pfn); > - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > + } else if (write_fault) { > + ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops); > } else { > - if (write_fault) { > - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > - ret = iomap_page_mkwrite(vmf, > - &xfs_page_mkwrite_iomap_ops); > - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > - } else { > - ret = filemap_fault(vmf); > - } > + ret = filemap_fault(vmf); > } > > + if (lock_mode) > + xfs_iunlock(XFS_I(inode), lock_mode); > + > if (write_fault) > sb_end_pagefault(inode->i_sb); > return ret; > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c > index 4d55f58d99b7..97b0078249fd 100644 > --- a/fs/xfs/xfs_inode.c > +++ b/fs/xfs/xfs_inode.c > @@ -3621,6 +3621,23 @@ xfs_iunlock2_io_mmap( > inode_unlock(VFS_I(ip1)); > } > > +/* Drop the MMAPLOCK and the IOLOCK after a remap completes. 
*/ > +void > +xfs_iunlock2_remapping( > + struct xfs_inode *ip1, > + struct xfs_inode *ip2) > +{ > + xfs_iflags_clear(ip1, XFS_IREMAPPING); > + > + if (ip1 != ip2) > + xfs_iunlock(ip1, XFS_MMAPLOCK_SHARED); > + xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); > + > + if (ip1 != ip2) > + inode_unlock_shared(VFS_I(ip1)); > + inode_unlock(VFS_I(ip2)); > +} > + > /* > * Reload the incore inode list for this inode. Caller should ensure that > * the link count cannot change, either by taking ILOCK_SHARED or otherwise > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h > index 0c5bdb91152e..3dc47937da5d 100644 > --- a/fs/xfs/xfs_inode.h > +++ b/fs/xfs/xfs_inode.h > @@ -347,6 +347,14 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) > /* Quotacheck is running but inode has not been added to quota counts. */ > #define XFS_IQUOTAUNCHECKED (1 << 14) > > +/* > + * Remap in progress. Callers that wish to update file data while > + * holding a shared IOLOCK or MMAPLOCK must drop the lock and retake > + * the lock in exclusive mode. Relocking the file will block until > + * IREMAPPING is cleared. > + */ > +#define XFS_IREMAPPING (1U << 15) > + > /* All inode state flags related to inode reclaim. 
*/ > #define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \ > XFS_IRECLAIM | \ > @@ -595,6 +603,7 @@ void xfs_end_io(struct work_struct *work); > > int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); > void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); > +void xfs_iunlock2_remapping(struct xfs_inode *ip1, struct xfs_inode *ip2); > > static inline bool > xfs_inode_unlinked_incomplete( > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c > index eb9102453aff..658edee8381d 100644 > --- a/fs/xfs/xfs_reflink.c > +++ b/fs/xfs/xfs_reflink.c > @@ -1540,6 +1540,10 @@ xfs_reflink_remap_prep( > if (ret) > goto out_unlock; > > + xfs_iflags_set(src, XFS_IREMAPPING); > + if (inode_in != inode_out) > + xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); > + > return 0; > out_unlock: > xfs_iunlock2_io_mmap(src, dest); > -- > 2.34.1 >