We only need the exclusive iolock for direct writes to protect sub-block
zeroing after an allocation or conversion of unwritten extents, and the
synchronous execution of these writes is also only needed because the
iolock is dropped early for the dodgy i_dio_count synchronisation.

Always start out with the shared iolock in xfs_file_dio_aio_write for
non-appending writes, and only upgrade it to exclusive if the start and
end of the write range are not already allocated and in written state.
This means one or two extra lookups in the in-core extent tree, but with
our btree data structure those lookups are very cheap and do not show up
in profiles on NVMe hardware for me.  On the other hand, avoiding the
exclusive lock allows for high concurrency using aio or io_uring.

Signed-off-by: Christoph Hellwig <hch@xxxxxx>
Reported-by: Avi Kivity <avi@xxxxxxxxxxxx>
Suggested-by: Brian Foster <bfoster@xxxxxxxxxx>
---
 fs/xfs/xfs_file.c | 127 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 96 insertions(+), 31 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1470fc4f2e0255..59d4c6e90f06c1 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -521,6 +521,57 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
 	.end_io		= xfs_dio_write_end_io,
 };
 
+static int
+xfs_dio_write_exclusive(
+	struct kiocb		*iocb,
+	size_t			count,
+	bool			*exclusive_io)
+{
+	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ifork	*ifp = &ip->i_df;
+	loff_t			offset = iocb->ki_pos;
+	loff_t			end = offset + count;
+	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, end);
+	struct xfs_bmbt_irec	got = { };
+	struct xfs_iext_cursor	icur;
+	int			ret;
+
+	*exclusive_io = true;
+
+	/*
+	 * Bmap information not read in yet or no blocks allocated at all?
+	 */
+	if (!(ifp->if_flags & XFS_IFEXTENTS) || !ip->i_d.di_nblocks)
+		return 0;
+
+	ret = xfs_ilock_iocb(iocb, XFS_ILOCK_SHARED);
+	if (ret)
+		return ret;
+
+	if (offset & mp->m_blockmask) {
+		if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got) ||
+		    got.br_startoff > offset_fsb ||
+		    got.br_state == XFS_EXT_UNWRITTEN)
+			goto out_unlock;
+	}
+
+	if ((end & mp->m_blockmask) &&
+	    got.br_startoff + got.br_blockcount <= end_fsb) {
+		if (!xfs_iext_lookup_extent(ip, ifp, end_fsb, &icur, &got) ||
+		    got.br_startoff > end_fsb ||
+		    got.br_state == XFS_EXT_UNWRITTEN)
+			goto out_unlock;
+	}
+
+	*exclusive_io = false;
+
+out_unlock:
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	return ret;
+}
+
 /*
  * xfs_file_dio_aio_write - handle direct IO writes
  *
@@ -557,8 +608,9 @@ xfs_file_dio_aio_write(
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
 	ssize_t			ret = 0;
-	int			unaligned_io = 0;
-	int			iolock;
+	int			iolock = XFS_IOLOCK_SHARED;
+	bool			subblock_io = false;
+	bool			exclusive_io = false;
 	size_t			count = iov_iter_count(from);
 	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
 
@@ -566,45 +618,58 @@ xfs_file_dio_aio_write(
 	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
 		return -EINVAL;
 
-	/*
-	 * Don't take the exclusive iolock here unless the I/O is unaligned to
-	 * the file system block size. We don't need to consider the EOF
-	 * extension case here because xfs_file_aio_write_checks() will relock
-	 * the inode as necessary for EOF zeroing cases and fill out the new
-	 * inode size as appropriate.
-	 */
+	/* I/O that is not aligned to the fsblock size will need special care */
 	if ((iocb->ki_pos & mp->m_blockmask) ||
-	    ((iocb->ki_pos + count) & mp->m_blockmask)) {
-		unaligned_io = 1;
+	    ((iocb->ki_pos + count) & mp->m_blockmask))
+		subblock_io = true;
 
-		/*
-		 * We can't properly handle unaligned direct I/O to reflink
-		 * files yet, as we can't unshare a partial block.
-		 */
-		if (xfs_is_cow_inode(ip)) {
-			trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
-			return -ENOTBLK;
-		}
-		iolock = XFS_IOLOCK_EXCL;
-	} else {
-		iolock = XFS_IOLOCK_SHARED;
+	/*
+	 * We can't properly handle unaligned direct I/O to reflink files yet,
+	 * as we can't unshare a partial block.
+	 */
+	if (subblock_io && xfs_is_cow_inode(ip)) {
+		trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
+		return -ENOTBLK;
 	}
 
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		/* unaligned dio always waits, bail */
-		if (unaligned_io)
-			return -EAGAIN;
-		if (!xfs_ilock_nowait(ip, iolock))
+	/*
+	 * Racy shortcut for obvious appends to avoid too much relocking:
+	 */
+	if (iocb->ki_pos > i_size_read(inode)) {
+		if (iocb->ki_flags & IOCB_NOWAIT)
 			return -EAGAIN;
-	} else {
-		xfs_ilock(ip, iolock);
+		iolock = XFS_IOLOCK_EXCL;
 	}
 
+relock:
+	ret = xfs_ilock_iocb(iocb, iolock);
+	if (ret)
+		return ret;
 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
 	if (ret)
 		goto out;
 	count = iov_iter_count(from);
 
+	/*
+	 * Upgrade to an exclusive lock and force synchronous completion if the
+	 * I/O will require partial block zeroing.
+	 * We don't need to consider the EOF extension case here because
+	 * xfs_file_aio_write_checks() will relock the inode as necessary for
+	 * EOF zeroing cases and fill out the new inode size as appropriate.
+	 */
+	if (iolock != XFS_IOLOCK_EXCL && subblock_io) {
+		ret = xfs_dio_write_exclusive(iocb, count, &exclusive_io);
+		if (ret)
+			goto out;
+		if (exclusive_io) {
+			xfs_iunlock(ip, iolock);
+			if (iocb->ki_flags & IOCB_NOWAIT)
+				return -EAGAIN;
+			iolock = XFS_IOLOCK_EXCL;
+			goto relock;
+		}
+	}
+
 	/*
 	 * If we are doing unaligned IO, we can't allow any other overlapping IO
 	 * in-flight at the same time or we risk data corruption. Wait for all
@@ -612,7 +677,7 @@ xfs_file_dio_aio_write(
 	 * iolock if we had to take the exclusive lock in
 	 * xfs_file_aio_write_checks() for other reasons.
 	 */
-	if (unaligned_io) {
+	if (exclusive_io) {
 		inode_dio_wait(inode);
 	} else if (iolock == XFS_IOLOCK_EXCL) {
 		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
@@ -626,7 +691,7 @@ xfs_file_dio_aio_write(
 	 */
 	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
 			   &xfs_dio_write_ops,
-			   is_sync_kiocb(iocb) || unaligned_io);
+			   is_sync_kiocb(iocb) || exclusive_io);
 out:
 	if (iolock)
 		xfs_iunlock(ip, iolock);
-- 
2.29.2