Switch XFS from the magic i_dio_count scheme to just hold i_rwsem until the actual I/O has completed to reduce the locking complexity and avoid nasty bugs due to missing inode_dio_wait calls. Signed-off-by: Christoph Hellwig <hch@xxxxxx> --- fs/xfs/scrub/bmap.c | 1 - fs/xfs/xfs_bmap_util.c | 3 --- fs/xfs/xfs_file.c | 47 +++++++++++++----------------------------- fs/xfs/xfs_icache.c | 3 +-- fs/xfs/xfs_ioctl.c | 1 - fs/xfs/xfs_iops.c | 5 ----- fs/xfs/xfs_reflink.c | 2 -- 7 files changed, 15 insertions(+), 47 deletions(-) diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index fa6ea6407992..d3e4068d3189 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -45,7 +45,6 @@ xchk_setup_inode_bmap( */ if (S_ISREG(VFS_I(sc->ip)->i_mode) && sc->sm->sm_type == XFS_SCRUB_TYPE_BMBTD) { - inode_dio_wait(VFS_I(sc->ip)); error = filemap_write_and_wait(VFS_I(sc->ip)->i_mapping); if (error) goto out; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index e62fb5216341..a454f481107e 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -674,9 +674,6 @@ xfs_free_eofblocks( if (error) return error; - /* wait on dio to ensure i_size has settled */ - inode_dio_wait(VFS_I(ip)); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) { diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 0cc843a4a163..d0ee7d2932e4 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -193,9 +193,11 @@ xfs_file_dio_aio_read( } else { xfs_ilock(ip, XFS_IOLOCK_SHARED); } - ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); + ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, + IOMAP_DIO_RWSEM_SHARED); + if (ret != -EIOCBQUEUED) + xfs_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -341,15 +343,6 @@ xfs_file_aio_write_checks( xfs_ilock(ip, *iolock); iov_iter_reexpand(from, count); } - /* - * We now have an IO submission barrier in place, but - * AIO can do EOF updates during IO 
completion and hence - * we now need to wait for all of them to drain. Non-AIO - * DIO will have drained before we are given the - * XFS_IOLOCK_EXCL, and so for most cases this wait is a - * no-op. - */ - inode_dio_wait(inode); drained_dio = true; goto restart; } @@ -469,13 +462,7 @@ static const struct iomap_dio_ops xfs_dio_write_ops = { * needs to do sub-block zeroing and that requires serialisation against other * direct IOs to the same block. In this case we need to serialise the * submission of the unaligned IOs so that we don't get racing block zeroing in - * the dio layer. To avoid the problem with aio, we also need to wait for - * outstanding IOs to complete so that unwritten extent conversion is completed - * before we try to map the overlapping block. This is currently implemented by - * hitting it with a big hammer (i.e. inode_dio_wait()). - * - * Returns with locks held indicated by @iolock and errors indicated by - * negative return values. + * the dio layer. */ STATIC ssize_t xfs_file_dio_aio_write( @@ -546,18 +533,21 @@ xfs_file_dio_aio_write( * xfs_file_aio_write_checks() for other reasons. 
*/ if (unaligned_io) { - inode_dio_wait(inode); - dio_flags = IOMAP_DIO_SYNCHRONOUS; - } else if (iolock == XFS_IOLOCK_EXCL) { - xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); - iolock = XFS_IOLOCK_SHARED; + dio_flags = IOMAP_DIO_RWSEM_EXCL | IOMAP_DIO_SYNCHRONOUS; + } else { + if (iolock == XFS_IOLOCK_EXCL) { + xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); + iolock = XFS_IOLOCK_SHARED; + } + dio_flags = IOMAP_DIO_RWSEM_SHARED; } trace_xfs_file_direct_write(ip, count, iocb->ki_pos); ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, dio_flags); out: - xfs_iunlock(ip, iolock); + if (ret != -EIOCBQUEUED) + xfs_iunlock(ip, iolock); /* * No fallback to buffered IO on errors for XFS, direct IO will either @@ -819,15 +809,6 @@ xfs_file_fallocate( if (error) goto out_unlock; - /* - * Must wait for all AIO to complete before we continue as AIO can - * change the file size on completion without holding any locks we - * currently hold. We must do this first because AIO can update both - * the on disk and in memory inode sizes, and the operations that follow - * require the in-memory size to be fully up-to-date. - */ - inode_dio_wait(inode); - /* * Now AIO and DIO has drained we flush and (if necessary) invalidate * the cached range over the first operation we are about to run. 
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 8dc2e5414276..9e6f32fd32f5 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1720,8 +1720,7 @@ xfs_prep_free_cowblocks( */ if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) || mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) || - mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) || - atomic_read(&VFS_I(ip)->i_dio_count)) + mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK)) return false; return true; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 7b35d62ede9f..331453f2c4be 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -548,7 +548,6 @@ xfs_ioc_space( error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); if (error) goto out_unlock; - inode_dio_wait(inode); switch (bf->l_whence) { case 0: /*SEEK_SET*/ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 8afe69ca188b..700edeccc6bf 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -893,11 +893,6 @@ xfs_setattr_size( if (error) return error; - /* - * Wait for all direct I/O to complete. - */ - inode_dio_wait(inode); - /* * File data changes must be complete before we start the transaction to * modify the inode. This needs to be done before joining the inode to diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index de451235c4ee..f775e60ca6f7 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1525,8 +1525,6 @@ xfs_reflink_unshare( trace_xfs_reflink_unshare(ip, offset, len); - inode_dio_wait(inode); - error = iomap_file_unshare(inode, offset, len, &xfs_buffered_write_iomap_ops); if (error) -- 2.24.1