On Mon, Mar 02, 2020 at 12:26:44PM +1100, Dave Chinner wrote:
> > /*
> > diff --git a/mm/filemap.c b/mm/filemap.c
> > index 1784478270e1..3a7863ba51b9 100644
> > --- a/mm/filemap.c
> > +++ b/mm/filemap.c
> > @@ -2293,6 +2293,8 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
> >  	 * and return. Otherwise fallthrough to buffered io for
> >  	 * the rest of the read. Buffered reads will not work for
> >  	 * DAX files, so don't bother trying.
> > +	 *
> > +	 * IS_DAX is protected under ->read_iter lock
> >  	 */
> >  	if (retval < 0 || !count || iocb->ki_pos >= size ||
> >  	    IS_DAX(inode))
> 
> This check is in the DIO path, but we can't do DIO on DAX-enabled
> files to begin with, so we can only get here if S_DAX is not set on
> the file.
> 
> Further, if IOCB_DIRECT is set, neither ext4 nor XFS call
> generic_file_read_iter(); they run the iomap_dio_rw() path directly
> instead. Only ext2 calls generic_file_read_iter() to do direct IO,
> so it's the only filesystem that needs this IS_DAX() check in it.
> 
> I think we should fix ext2 to be implemented like ext4 and XFS:
> they implement the buffered IO fallback themselves, should it be
> required, and never use generic_file_read_iter() for direct IO.
> 
> That would allow us to add this to generic_file_read_iter():
> 
> 	if (WARN_ON_ONCE(IS_DAX(inode)))
> 		return -EINVAL;
> 
> to indicate that this should never be called directly on a
> DAX-capable filesystem. This places all the responsibility for
> managing DAX behaviour on the filesystem, which then allows us to
> reason more solidly about how the filesystem IO paths use and check
> the S_DAX flag.
> 
> i.e. changing the on-disk flag already locks out the filesystem IO
> path via the i_rwsem, and all the filesystem IO paths (buffered,
> direct IO and dax) are serialised by this lock. Hence we can check
> the flag once in the filesystem path, once we have the i_rwsem held,
> and know that S_DAX will not change until we release it.
> 
> ..... and now I realise what I was sitting on the fence about....
> 
> I don't like the aops locking in call_read/write_iter() because it
> is actually redundant: the filesystem should be doing the necessary
> locking in the IO path via the i_rwsem to prevent S_DAX from
> changing while it is doing the IO.
> 
> IOWs, we need to restructure the locking inside the filesystem
> read_iter and write_iter methods so that the i_rwsem protects the
> S_DAX flag from changing dynamically. They all do:
> 
> 	if (dax)
> 		do_dax_io()
> 	if (direct)
> 		do_direct_io()
> 	do_buffered_io()
> 
> and then take the i_rwsem inside each of those functions to do the
> IO. What we actually need to do is something like this:
> 
> 	inode_lock_shared()
> 	if (dax)
> 		do_dax_io()
> 	if (direct)
> 		do_direct_io()
> 	do_buffered_io()
> 	inode_unlock_shared()
> 
> and remove the inode locking from inside the individual IO methods
> themselves. It's a bit more complex than this because buffered
> writes require exclusive locking, but this completely removes the
> need for holding an aops lock over these methods.
> 
> I've attached a couple of untested patches (I've compiled them, so
> they must be good!) to demonstrate what I mean for the XFS IO path.
> The read side removes a heap of duplicate code, but the write side
> is .... unfortunately complex. Have to think about that more.

And now with patches...

Cheers,

Dave.
-- 
Dave Chinner
david@xxxxxxxxxxxxx
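As a concrete illustration of the ext2 change suggested above, here is
a minimal, untested sketch of an ext2 ->read_iter that dispatches DAX
and direct IO itself, in the style of ext4/XFS. It assumes ext2 gains
an iomap-based direct IO read path reusing ext2_iomap_ops (which is
currently only wired up for DAX), so ext2_dio_read_iter() below is
hypothetical rather than existing ext2 code:

static ssize_t
ext2_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode	*inode = file_inode(iocb->ki_filp);
	ssize_t		ret;

	/* i_rwsem serialises this read against an S_DAX flag change */
	inode_lock_shared(inode);
	ret = iomap_dio_rw(iocb, to, &ext2_iomap_ops, NULL,
			is_sync_kiocb(iocb));
	inode_unlock_shared(inode);

	file_accessed(iocb->ki_filp);
	return ret;
}

static ssize_t
ext2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
#ifdef CONFIG_FS_DAX
	if (IS_DAX(file_inode(iocb->ki_filp)))
		return ext2_dax_read_iter(iocb, to);
#endif
	if (iocb->ki_flags & IOCB_DIRECT)
		return ext2_dio_read_iter(iocb, to);

	/* neither DAX nor direct IO: plain buffered read */
	return generic_file_read_iter(iocb, to);
}

With that in place, generic_file_read_iter() is only ever reached for
buffered IO, and the WARN_ON_ONCE(IS_DAX()) check quoted above becomes
safe to add.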
xfs: pull read IO path locking up to protect S_DAX checks

From: Dave Chinner <dchinner@xxxxxxxxxx>

Take the IOLOCK in xfs_file_read_iter() before checking IS_DAX(), so
the S_DAX flag cannot change underneath the read once it has been
checked, and remove the now-redundant locking from the individual
read methods.

Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>
---
 fs/xfs/xfs_file.c | 77 +++++++++++++++++--------------------------------------
 1 file changed, 24 insertions(+), 53 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index b8a4a3f29b36..7d31a0e76b68 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -176,28 +176,12 @@ xfs_file_dio_aio_read(
 	struct kiocb		*iocb,
 	struct iov_iter		*to)
 {
-	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
-	size_t			count = iov_iter_count(to);
-	ssize_t			ret;
-
-	trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
-
-	if (!count)
-		return 0; /* skip atime */
+	trace_xfs_file_direct_read(XFS_I(file_inode(iocb->ki_filp)),
+			iov_iter_count(to), iocb->ki_pos);
 
 	file_accessed(iocb->ki_filp);
-
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
-			return -EAGAIN;
-	} else {
-		xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	}
-	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
+	return iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
 			is_sync_kiocb(iocb));
-	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-
-	return ret;
 }
 
 static noinline ssize_t
@@ -205,27 +189,12 @@ xfs_file_dax_read(
 	struct kiocb		*iocb,
 	struct iov_iter		*to)
 {
-	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
-	size_t			count = iov_iter_count(to);
-	ssize_t			ret = 0;
-
-	trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
-
-	if (!count)
-		return 0; /* skip atime */
-
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
-			return -EAGAIN;
-	} else {
-		xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	}
-
-	ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
-	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+	trace_xfs_file_dax_read(XFS_I(file_inode(iocb->ki_filp)),
+			iov_iter_count(to), iocb->ki_pos);
 
 	file_accessed(iocb->ki_filp);
-	return ret;
+	return dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
+
 }
 
 STATIC ssize_t
@@ -233,21 +202,10 @@ xfs_file_buffered_aio_read(
 	struct kiocb		*iocb,
 	struct iov_iter		*to)
 {
-	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
-	ssize_t			ret;
-
-	trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
+	trace_xfs_file_buffered_read(XFS_I(file_inode(iocb->ki_filp)),
+			iov_iter_count(to), iocb->ki_pos);
 
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
-			return -EAGAIN;
-	} else {
-		xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	}
-	ret = generic_file_read_iter(iocb, to);
-	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-
-	return ret;
+	return generic_file_read_iter(iocb, to);
 }
 
 STATIC ssize_t
@@ -256,7 +214,8 @@ xfs_file_read_iter(
 	struct iov_iter		*to)
 {
 	struct inode		*inode = file_inode(iocb->ki_filp);
-	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
 	ssize_t			ret = 0;
 
 	XFS_STATS_INC(mp, xs_read_calls);
@@ -264,6 +223,16 @@ xfs_file_read_iter(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
+	if (!iov_iter_count(to))
+		return 0; /* skip atime */
+
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
+			return -EAGAIN;
+	} else {
+		xfs_ilock(ip, XFS_IOLOCK_SHARED);
+	}
+
 	if (IS_DAX(inode))
 		ret = xfs_file_dax_read(iocb, to);
 	else if (iocb->ki_flags & IOCB_DIRECT)
@@ -271,6 +240,8 @@ xfs_file_read_iter(
 	else
 		ret = xfs_file_buffered_aio_read(iocb, to);
 
+	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
 	if (ret > 0)
 		XFS_STATS_ADD(mp, xs_read_bytes, ret);
 	return ret;
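With the read-side locking pulled up as in the patch above, and
assuming ext2 is fixed as suggested in the mail, the proposed guard
would sit at the top of generic_file_read_iter(). A sketch only - the
rest of the function body is elided, not changed:

ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct inode	*inode = file_inode(iocb->ki_filp);
	ssize_t		retval = 0;

	/*
	 * Every DAX-capable filesystem now dispatches DAX IO from its
	 * own ->read_iter under i_rwsem, so reaching this function
	 * with S_DAX set indicates a filesystem bug.
	 */
	if (WARN_ON_ONCE(IS_DAX(inode)))
		return -EINVAL;

	/* ... existing direct IO and buffered read paths unchanged ... */
	return retval;
}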
xfs: pull write IO locking up to protect S_DAX

From: Dave Chinner <dchinner@xxxxxxxxxx>

Much more complex than the read side because of all the lock juggling
we do for the different types of writes and all the various metadata
updates a write might need prior to dispatching the data IO. Not
really happy about the way the high level logic turned out; it could
probably be made cleaner and smarter with a bit more thought.

Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>
---
 fs/xfs/xfs_file.c | 208 +++++++++++++++++++++++++++++------------------------
 1 file changed, 118 insertions(+), 90 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 7d31a0e76b68..b7c2bb09b945 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -250,9 +250,9 @@ xfs_file_read_iter(
 
 /*
  * Common pre-write limit and setup checks.
  *
- * Called with the iolocked held either shared and exclusive according to
- * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
- * if called for a direct write beyond i_size.
+ * Called without the iolock held; takes it according to @iolock and returns
+ * with it held, except when the lock cannot be taken (*iolock is cleared).
+ * Might upgrade the iolock to exclusive for a direct write beyond i_size.
  */
 STATIC ssize_t
 xfs_file_aio_write_checks(
@@ -268,6 +268,15 @@ xfs_file_aio_write_checks(
 	bool			drained_dio = false;
 	loff_t			isize;
 
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		if (!xfs_ilock_nowait(ip, *iolock)) {
+			*iolock = 0;
+			return -EAGAIN;
+		}
+	} else {
+		xfs_ilock(ip, *iolock);
+	}
+
 restart:
 	error = generic_write_checks(iocb, from);
 	if (error <= 0)
@@ -325,7 +334,7 @@ xfs_file_aio_write_checks(
 			drained_dio = true;
 			goto restart;
 		}
-	
+
 		trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
 		error = iomap_zero_range(inode, isize, iocb->ki_pos - isize,
 				NULL, &xfs_buffered_write_iomap_ops);
@@ -449,19 +458,14 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
  * Returns with locks held indicated by @iolock and errors indicated by
  * negative return values.
  */
-STATIC ssize_t
-xfs_file_dio_aio_write(
+static int
+xfs_file_dio_write_lock_mode(
 	struct kiocb		*iocb,
 	struct iov_iter		*from)
 {
-	struct file		*file = iocb->ki_filp;
-	struct address_space	*mapping = file->f_mapping;
-	struct inode		*inode = mapping->host;
+	struct inode		*inode = file_inode(iocb->ki_filp);
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
-	ssize_t			ret = 0;
-	int			unaligned_io = 0;
-	int			iolock;
 	size_t			count = iov_iter_count(from);
 	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
 
@@ -478,35 +482,37 @@ xfs_file_dio_aio_write(
 	 */
 	if ((iocb->ki_pos & mp->m_blockmask) ||
 	    ((iocb->ki_pos + count) & mp->m_blockmask)) {
-		unaligned_io = 1;
+		if (iocb->ki_flags & IOCB_NOWAIT) {
+			/* unaligned dio always waits, bail */
+			return -EAGAIN;
+		}
 
 		/*
 		 * We can't properly handle unaligned direct I/O to reflink
 		 * files yet, as we can't unshare a partial block.
		 */
 		if (xfs_is_cow_inode(ip)) {
-			trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
+			trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos,
+					count);
 			return -EREMCHG;
 		}
-		iolock = XFS_IOLOCK_EXCL;
-	} else {
-		iolock = XFS_IOLOCK_SHARED;
-	}
-
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		/* unaligned dio always waits, bail */
-		if (unaligned_io)
-			return -EAGAIN;
-		if (!xfs_ilock_nowait(ip, iolock))
-			return -EAGAIN;
-	} else {
-		xfs_ilock(ip, iolock);
+		return XFS_IOLOCK_EXCL;
 	}
+	return XFS_IOLOCK_SHARED;
+}
 
-	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
-	if (ret)
-		goto out;
-	count = iov_iter_count(from);
+STATIC ssize_t
+xfs_file_dio_aio_write(
+	struct kiocb		*iocb,
+	struct iov_iter		*from,
+	int			iolock)
+{
+	struct inode		*inode = file_inode(iocb->ki_filp);
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	ssize_t			ret = 0;
+	bool			unaligned_io = false;
+	size_t			count = iov_iter_count(from);
 
 	/*
 	 * If we are doing unaligned IO, we can't allow any other overlapping IO
@@ -515,7 +521,9 @@ xfs_file_dio_aio_write(
 	 * iolock if we had to take the exclusive lock in
 	 * xfs_file_aio_write_checks() for other reasons.
 	 */
-	if (unaligned_io) {
+	if ((iocb->ki_pos & mp->m_blockmask) ||
+	    ((iocb->ki_pos + count) & mp->m_blockmask)) {
+		unaligned_io = true;
 		inode_dio_wait(inode);
 	} else if (iolock == XFS_IOLOCK_EXCL) {
 		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
@@ -530,7 +538,6 @@ xfs_file_dio_aio_write(
 
 	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
			   &xfs_dio_write_ops, is_sync_kiocb(iocb) || unaligned_io);
-out:
 	xfs_iunlock(ip, iolock);
 
 	/*
@@ -544,26 +551,15 @@ xfs_file_dio_aio_write(
 static noinline ssize_t
 xfs_file_dax_write(
 	struct kiocb		*iocb,
-	struct iov_iter		*from)
+	struct iov_iter		*from,
+	int			iolock)
 {
-	struct inode		*inode = iocb->ki_filp->f_mapping->host;
+	struct inode		*inode = file_inode(iocb->ki_filp);
 	struct xfs_inode	*ip = XFS_I(inode);
-	int			iolock = XFS_IOLOCK_EXCL;
 	ssize_t			ret, error = 0;
 	size_t			count;
 	loff_t			pos;
 
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		if (!xfs_ilock_nowait(ip, iolock))
-			return -EAGAIN;
-	} else {
-		xfs_ilock(ip, iolock);
-	}
-
-	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
-	if (ret)
-		goto out;
-
 	pos = iocb->ki_pos;
 	count = iov_iter_count(from);
 
@@ -573,7 +569,6 @@ xfs_file_dax_write(
 		i_size_write(inode, iocb->ki_pos);
 		error = xfs_setfilesize(ip, pos, ret);
 	}
-out:
 	xfs_iunlock(ip, iolock);
 	if (error)
 		return error;
@@ -590,26 +585,13 @@ xfs_file_dax_write(
 STATIC ssize_t
 xfs_file_buffered_aio_write(
 	struct kiocb		*iocb,
-	struct iov_iter		*from)
+	struct iov_iter		*from,
+	int			iolock,
+	bool			flush_on_enospc)
 {
-	struct file		*file = iocb->ki_filp;
-	struct address_space	*mapping = file->f_mapping;
-	struct inode		*inode = mapping->host;
+	struct inode		*inode = file_inode(iocb->ki_filp);
 	struct xfs_inode	*ip = XFS_I(inode);
 	ssize_t			ret;
-	int			enospc = 0;
-	int			iolock;
-
-	if (iocb->ki_flags & IOCB_NOWAIT)
-		return -EOPNOTSUPP;
-
-write_retry:
-	iolock = XFS_IOLOCK_EXCL;
-	xfs_ilock(ip, iolock);
-
-	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
-	if (ret)
-		goto out;
 
 	/* We can write back this queue in page reclaim */
 	current->backing_dev_info = inode_to_bdi(inode);
@@ -617,8 +599,6 @@ xfs_file_buffered_aio_write(
 	trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
 	ret = iomap_file_buffered_write(iocb, from,
 			&xfs_buffered_write_iomap_ops);
-	if (likely(ret >= 0))
-		iocb->ki_pos += ret;
 
 	/*
 	 * If we hit a space limit, try to free up some lingering preallocated
@@ -629,34 +609,35 @@ xfs_file_buffered_aio_write(
	 * also behaves as a filter to prevent too many eofblocks scans from
 	 * running at the same time.
 	 */
-	if (ret == -EDQUOT && !enospc) {
+	if (ret == -EDQUOT && flush_on_enospc) {
+		int made_progress;
+
 		xfs_iunlock(ip, iolock);
-		enospc = xfs_inode_free_quota_eofblocks(ip);
-		if (enospc)
-			goto write_retry;
-		enospc = xfs_inode_free_quota_cowblocks(ip);
-		if (enospc)
-			goto write_retry;
-		iolock = 0;
-	} else if (ret == -ENOSPC && !enospc) {
+		made_progress = xfs_inode_free_quota_eofblocks(ip);
+		if (made_progress)
+			return -EAGAIN;
+		made_progress = xfs_inode_free_quota_cowblocks(ip);
+		if (made_progress)
+			return -EAGAIN;
+		return ret;
+	}
+
+	if (ret == -ENOSPC && flush_on_enospc) {
 		struct xfs_eofblocks eofb = {0};
 
-		enospc = 1;
 		xfs_flush_inodes(ip->i_mount);
 
 		xfs_iunlock(ip, iolock);
 		eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
 		xfs_icache_free_eofblocks(ip->i_mount, &eofb);
 		xfs_icache_free_cowblocks(ip->i_mount, &eofb);
-		goto write_retry;
+		return -EAGAIN;
 	}
 
 	current->backing_dev_info = NULL;
-out:
-	if (iolock)
-		xfs_iunlock(ip, iolock);
-
+	xfs_iunlock(ip, iolock);
 	if (ret > 0) {
+		iocb->ki_pos += ret;
+
 		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
 		/* Handle various SYNC-type writes */
 		ret = generic_write_sync(iocb, ret);
@@ -675,6 +656,8 @@ xfs_file_write_iter(
 	struct xfs_inode	*ip = XFS_I(inode);
 	ssize_t			ret;
 	size_t			ocount = iov_iter_count(from);
+	int			iolock;
+	bool			flush_on_enospc = true;
 
 	XFS_STATS_INC(ip->i_mount, xs_write_calls);
@@ -684,22 +667,67 @@ xfs_file_write_iter(
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return -EIO;
 
-	if (IS_DAX(inode))
-		return xfs_file_dax_write(iocb, from);
-
-	if (iocb->ki_flags & IOCB_DIRECT) {
+	/*
+	 * Set up the initial locking state - only direct IO uses shared locking
+	 * for writes, and buffered IO does not support IOCB_NOWAIT.
+	 */
+relock:
+	if (IS_DAX(inode)) {
+		iolock = XFS_IOLOCK_EXCL;
+	} else if (iocb->ki_flags & IOCB_DIRECT) {
 		/*
 		 * Allow a directio write to fall back to a buffered
 		 * write *only* in the case that we're doing a reflink
 		 * CoW.  In all other directio scenarios we do not
 		 * allow an operation to fall back to buffered mode.
 		 */
-		ret = xfs_file_dio_aio_write(iocb, from);
-		if (ret != -EREMCHG)
-			return ret;
+		iolock = xfs_file_dio_write_lock_mode(iocb, from);
+		if (iolock == -EREMCHG) {
+			iocb->ki_flags &= ~IOCB_DIRECT;
+			iolock = XFS_IOLOCK_EXCL;
+		}
+		if (iolock < 0)
+			return iolock;
+	} else if (iocb->ki_flags & IOCB_NOWAIT) {
+		/* buffered writes do not support IOCB_NOWAIT */
+		return -EOPNOTSUPP;
+	} else {
+		iolock = XFS_IOLOCK_EXCL;
 	}
 
-	return xfs_file_buffered_aio_write(iocb, from);
+	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+	if (ret) {
+		if (iolock)
+			xfs_iunlock(ip, iolock);
+		return ret;
+	}
+
+	/*
+	 * If the IO mode switched while we were taking the lock, meaning we
+	 * now hold the wrong lock type, start again.
+	 */
+	if ((IS_DAX(inode) || !(iocb->ki_flags & IOCB_DIRECT)) &&
+	    iolock != XFS_IOLOCK_EXCL) {
+		xfs_iunlock(ip, iolock);
+		goto relock;
+	}
+
+	if (IS_DAX(inode)) {
+		ret = xfs_file_dax_write(iocb, from, iolock);
+	} else if (iocb->ki_flags & IOCB_DIRECT) {
+		ret = xfs_file_dio_aio_write(iocb, from, iolock);
+		ASSERT(ret != -EREMCHG);
+	} else {
+		ret = xfs_file_buffered_aio_write(iocb, from, iolock,
+				flush_on_enospc);
+		if (ret == -EAGAIN && flush_on_enospc) {
+			/* inode already unlocked, but need another attempt */
+			flush_on_enospc = false;
+			goto relock;
+		}
+		ASSERT(ret != -EAGAIN);
+	}
+	return ret;
 }
 
 static void
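Since the commit message above is not happy with how the high level
logic turned out, here is the control flow of the reworked
xfs_file_write_iter() condensed into a standalone sketch, purely to
make the patch easier to follow. The sketch_* helpers are placeholders
for logic that is inline in the real patch (lock mode selection, lock
staleness check, write dispatch); they are not XFS functions:

/* Placeholders for logic that lives inline in the patch above. */
static int sketch_pick_lock_mode(struct kiocb *iocb, struct iov_iter *from);
static bool sketch_lock_mode_stale(struct kiocb *iocb, int iolock);
static ssize_t sketch_dispatch_write(struct kiocb *iocb, struct iov_iter *from,
		int iolock, bool flush_on_enospc);

static ssize_t
sketch_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	bool			flush_on_enospc = true;
	int			iolock;
	ssize_t			ret;

relock:
	/* EXCL for DAX and buffered writes, maybe SHARED for direct IO */
	iolock = sketch_pick_lock_mode(iocb, from);
	if (iolock < 0)
		return iolock;	/* e.g. -EAGAIN for unaligned NOWAIT DIO */

	/* takes the iolock; clears it if the lock could not be taken */
	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
	if (ret) {
		if (iolock)
			xfs_iunlock(ip, iolock);
		return ret;
	}

	/* S_DAX may have flipped before we got the lock; redo if so */
	if (sketch_lock_mode_stale(iocb, iolock)) {
		xfs_iunlock(ip, iolock);
		goto relock;
	}

	/* dax, dio or buffered write; all of them drop the iolock */
	ret = sketch_dispatch_write(iocb, from, iolock, flush_on_enospc);
	if (ret == -EAGAIN && flush_on_enospc) {
		/* buffered write freed up space and unlocked; retry once */
		flush_on_enospc = false;
		goto relock;
	}
	return ret;
}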