The direct I/O code currently uses a hand crafted i_dio_count that needs to be incremented under i_rwsem and then is decremented when I/O completes. That scheme means file system code needs to be very careful to wait for i_dio_count to reach zero under i_rwsem in various places that are very cumbersome to get rid of. It also means we can't get the effect of an exclusive i_rwsem for actually asynchronous I/O, forcing pointless synchronous execution of sub-blocksize writes. Replace the i_dio_count scheme with holding i_rwsem over the duration of the whole I/O. While this introduces a non-owner unlock that isn't nice to RT workloads, the open coded locking primitive using i_dio_count isn't any better. Signed-off-by: Christoph Hellwig <hch@xxxxxx> --- fs/iomap/direct-io.c | 44 +++++++++++++++++++++++++++++++++++++------ include/linux/iomap.h | 2 ++ 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index e706329d71a0..0113ac33b0a0 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -70,7 +70,7 @@ static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap, dio->submit.cookie = submit_bio(bio); } -static ssize_t iomap_dio_complete(struct iomap_dio *dio) +static ssize_t iomap_dio_complete(struct iomap_dio *dio, bool unlock) { const struct iomap_dio_ops *dops = dio->dops; struct kiocb *iocb = dio->iocb; @@ -112,6 +112,13 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) dio_warn_stale_pagecache(iocb->ki_filp); } + if (unlock) { + if (dio->flags & IOMAP_DIO_RWSEM_EXCL) + up_write(&inode->i_rwsem); + else if (dio->flags & IOMAP_DIO_RWSEM_SHARED) + up_read(&inode->i_rwsem); + } + /* * If this is a DSYNC write, make sure we push it to stable storage now * that we've written data. 
@@ -129,8 +136,22 @@ static void iomap_dio_complete_work(struct work_struct *work) { struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); struct kiocb *iocb = dio->iocb; + struct inode *inode = file_inode(iocb->ki_filp); - iocb->ki_complete(iocb, iomap_dio_complete(dio), 0); + /* + * XXX: For reads this code is directly called from bio ->end_io, which + * often is hard or softirq context. In that case lockdep records the + * below as lock acquisitions from irq context and causes warnings. + */ + if (dio->flags & IOMAP_DIO_RWSEM_EXCL) { + rwsem_acquire(&inode->i_rwsem.dep_map, 0, 0, _THIS_IP_); + if (IS_ENABLED(CONFIG_RWSEM_SPIN_ON_OWNER)) + atomic_long_set(&inode->i_rwsem.owner, (long)current); + } else if (dio->flags & IOMAP_DIO_RWSEM_SHARED) { + rwsem_acquire_read(&inode->i_rwsem.dep_map, 0, 0, _THIS_IP_); + } + + iocb->ki_complete(iocb, iomap_dio_complete(dio, true), 0); } /* @@ -430,7 +451,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->i_size = i_size_read(inode); dio->dops = dops; dio->error = 0; - dio->flags = 0; + dio->flags = dio_flags; dio->submit.iter = iter; dio->submit.waiter = current; @@ -551,8 +572,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->wait_for_completion = wait_for_completion; if (!atomic_dec_and_test(&dio->ref)) { if (!wait_for_completion) - return -EIOCBQUEUED; - + goto async_completion; for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (!READ_ONCE(dio->submit.waiter)) @@ -567,10 +587,22 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, __set_current_state(TASK_RUNNING); } - return iomap_dio_complete(dio); + return iomap_dio_complete(dio, false); out_free_dio: kfree(dio); return ret; + +async_completion: + /* + * We are returning to userspace now, but i_rwsem is still held until + * the I/O completion comes back. 
 + */ + if (dio_flags & (IOMAP_DIO_RWSEM_EXCL | IOMAP_DIO_RWSEM_SHARED)) + rwsem_release(&inode->i_rwsem.dep_map, _THIS_IP_); + if ((dio_flags & IOMAP_DIO_RWSEM_EXCL) && + IS_ENABLED(CONFIG_RWSEM_SPIN_ON_OWNER)) + atomic_long_set(&inode->i_rwsem.owner, RWSEM_OWNER_UNKNOWN); + return -EIOCBQUEUED; } EXPORT_SYMBOL_GPL(iomap_dio_rw); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 3faeb8fd0961..f259bb979d7f 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -249,6 +249,8 @@ int iomap_writepages(struct address_space *mapping, #define IOMAP_DIO_UNWRITTEN (1 << 0) /* covers unwritten extent(s) */ #define IOMAP_DIO_COW (1 << 1) /* covers COW extent(s) */ #define IOMAP_DIO_SYNCHRONOUS (1 << 2) /* no async completion */ +#define IOMAP_DIO_RWSEM_EXCL (1 << 3) /* holds exclusive i_rwsem */ +#define IOMAP_DIO_RWSEM_SHARED (1 << 4) /* holds shared i_rwsem */ struct iomap_dio_ops { int (*end_io)(struct kiocb *iocb, ssize_t size, int error, -- 2.24.1