On Tue, 2019-08-06 at 11:21 +0800, Yan, Zheng wrote:
> On Tue, Aug 6, 2019 at 4:05 AM Jeff Layton <jlayton@xxxxxxxxxx> wrote:
> > xfstest generic/451 intermittently fails. The test does O_DIRECT writes
> > to a file, and then reads back the result using buffered I/O, while
> > running a separate set of tasks that are also doing buffered reads.
> > 
> > The client will invalidate the cache prior to a direct write, but it's
> > easy for one of the other readers' replies to race in and reinstantiate
> > the invalidated range with stale data.
> > 
> > To fix this, we must serialize direct I/O writes and buffered reads.
> > We could just sprinkle in some shared locks on the i_rwsem for reads,
> > and increase the exclusive footprint on the write side, but that would
> > cause O_DIRECT writes to end up serialized vs. other direct requests.
> > 
> > Instead, borrow the scheme used by nfs.ko. Buffered writes take the
> > i_rwsem exclusively, but buffered reads and all O_DIRECT requests
> > take a shared lock, allowing them to run in parallel.
> > 
> > To fix the bug though, we need buffered reads and O_DIRECT I/O to not
> > run in parallel. A flag on the ceph_inode_info is used to indicate
> > whether it's in direct or buffered I/O mode. When a conflicting request
> > is submitted, it will block until the inode can flip to the necessary
> > mode.
> > 
> > Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx>
> > ---
> >  fs/ceph/Makefile |   2 +-
> >  fs/ceph/file.c   |  28 ++++++--
> >  fs/ceph/io.c     | 163 +++++++++++++++++++++++++++++++++++++++++++++++
> >  fs/ceph/io.h     |  12 ++++
> >  fs/ceph/super.h  |   2 +-
> >  5 files changed, 198 insertions(+), 9 deletions(-)
> >  create mode 100644 fs/ceph/io.c
> >  create mode 100644 fs/ceph/io.h
> > 
> > diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
> > index a699e320393f..c1da294418d1 100644
> > --- a/fs/ceph/Makefile
> > +++ b/fs/ceph/Makefile
> > @@ -6,7 +6,7 @@
> >  obj-$(CONFIG_CEPH_FS) += ceph.o
> > 
> >  ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
> > -        export.o caps.o snap.o xattr.o quota.o \
> > +        export.o caps.o snap.o xattr.o quota.o io.o \
> >          mds_client.o mdsmap.o strings.o ceph_frag.o \
> >          debugfs.o
> > 
> > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > index 5182e1a49d6f..d7d264e05303 100644
> > --- a/fs/ceph/file.c
> > +++ b/fs/ceph/file.c
> > @@ -15,6 +15,7 @@
> >  #include "super.h"
> >  #include "mds_client.h"
> >  #include "cache.h"
> > +#include "io.h"
> > 
> >  static __le32 ceph_flags_sys2wire(u32 flags)
> >  {
> > @@ -1284,12 +1285,16 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
> > 
> >                  if (ci->i_inline_version == CEPH_INLINE_NONE) {
> >                          if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
> > +                                ceph_start_io_direct(inode);
> >                                  ret = ceph_direct_read_write(iocb, to,
> >                                                               NULL, NULL);
> > +                                ceph_end_io_direct(inode);
> >                                  if (ret >= 0 && ret < len)
> >                                          retry_op = CHECK_EOF;
> >                          } else {
> > +                                ceph_start_io_read(inode);
> >                                  ret = ceph_sync_read(iocb, to, &retry_op);
> > +                                ceph_end_io_read(inode);
> >                          }
> >                  } else {
> >                          retry_op = READ_INLINE;
> > @@ -1300,7 +1305,9 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
> >               inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
> >               ceph_cap_string(got));
> >          ceph_add_rw_context(fi, &rw_ctx);
> > +        ceph_start_io_read(inode);
> >          ret = generic_file_read_iter(iocb, to);
> > +        ceph_end_io_read(inode);
> >          ceph_del_rw_context(fi, &rw_ctx);
> >      }
> >      dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
> > @@ -1409,7 +1416,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> >          return -ENOMEM;
> > 
> >  retry_snap:
> > -    inode_lock(inode);
> > +    if (iocb->ki_flags & IOCB_DIRECT)
> > +            ceph_start_io_direct(inode);
> > +    else
> > +            ceph_start_io_write(inode);
> > 
> >      /* We can write back this queue in page reclaim */
> >      current->backing_dev_info = inode_to_bdi(inode);
> > @@ -1480,7 +1490,6 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> >          (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) {
> >          struct ceph_snap_context *snapc;
> >          struct iov_iter data;
> > -        inode_unlock(inode);
> > 
> >          spin_lock(&ci->i_ceph_lock);
> >          if (__ceph_have_pending_cap_snap(ci)) {
> > @@ -1497,11 +1506,14 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> > 
> >          /* we might need to revert back to that point */
> >          data = *from;
> > -        if (iocb->ki_flags & IOCB_DIRECT)
> > +        if (iocb->ki_flags & IOCB_DIRECT) {
> >                  written = ceph_direct_read_write(iocb, &data, snapc,
> >                                                   &prealloc_cf);
> > -        else
> > +                ceph_end_io_direct(inode);
> > +        } else {
> >                  written = ceph_sync_write(iocb, &data, pos, snapc);
> > +                ceph_end_io_write(inode);
> > +        }
> >          if (written > 0)
> >                  iov_iter_advance(from, written);
> >          ceph_put_snap_context(snapc);
> > @@ -1516,7 +1528,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> >          written = generic_perform_write(file, from, pos);
> >          if (likely(written >= 0))
> >                  iocb->ki_pos = pos + written;
> > -        inode_unlock(inode);
> > +        ceph_end_io_write(inode);
> >      }
> > 
> >      if (written >= 0) {
> > @@ -1551,9 +1563,11 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> >      }
> > 
> >      goto out_unlocked;
> > -
> >  out:
> > -    inode_unlock(inode);
> > +    if (iocb->ki_flags & IOCB_DIRECT)
> > +            ceph_end_io_direct(inode);
> > +    else
> > +            ceph_end_io_write(inode);
> >  out_unlocked:
> >      ceph_free_cap_flush(prealloc_cf);
> >      current->backing_dev_info = NULL;
> > diff --git a/fs/ceph/io.c b/fs/ceph/io.c
> > new file mode 100644
> > index 000000000000..8367cbcc81e8
> > --- /dev/null
> > +++ b/fs/ceph/io.c
> > @@ -0,0 +1,163 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * Copyright (c) 2016 Trond Myklebust
> > + * Copyright (c) 2019 Jeff Layton
> > + *
> > + * I/O and data path helper functionality.
> > + *
> > + * Heavily borrowed from equivalent code in fs/nfs/io.c
> > + */
> > +
> > +#include <linux/ceph/ceph_debug.h>
> > +
> > +#include <linux/types.h>
> > +#include <linux/kernel.h>
> > +#include <linux/rwsem.h>
> > +#include <linux/fs.h>
> > +
> > +#include "super.h"
> > +#include "io.h"
> > +
> > +/* Call with exclusively locked inode->i_rwsem */
> > +static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode)
> > +{
> > +    lockdep_assert_held_write(&inode->i_rwsem);
> > +
> > +    if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) {
> > +            spin_lock(&ci->i_ceph_lock);
> > +            ci->i_ceph_flags &= ~CEPH_I_ODIRECT;
> > +            spin_unlock(&ci->i_ceph_lock);
> > +            inode_dio_wait(inode);
> > +    }
> > +}
> > +
> > +/**
> > + * ceph_start_io_read - declare the file is being used for buffered reads
> > + * @inode: file inode
> > + *
> > + * Declare that a buffered read operation is about to start, and ensure
> > + * that we block all direct I/O.
> > + * On exit, the function ensures that the CEPH_I_ODIRECT flag is unset,
> > + * and holds a shared lock on inode->i_rwsem to ensure that the flag
> > + * cannot be changed.
> > + * In practice, this means that buffered read operations are allowed to
> > + * execute in parallel, thanks to the shared lock, whereas direct I/O
> > + * operations need to wait to grab an exclusive lock in order to set
> > + * CEPH_I_ODIRECT.
> > + * Note that buffered writes and truncates both take a write lock on
> > + * inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
> > + */
> > +void
> > +ceph_start_io_read(struct inode *inode)
> > +{
> > +    struct ceph_inode_info *ci = ceph_inode(inode);
> > +
> > +    /* Be an optimist! */
> > +    down_read(&inode->i_rwsem);
> > +    if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT))
> > +            return;
> > +    up_read(&inode->i_rwsem);
> > +    /* Slow path.... */
> > +    down_write(&inode->i_rwsem);
> > +    ceph_block_o_direct(ci, inode);
> > +    downgrade_write(&inode->i_rwsem);
> > +}
> > +
> > +/**
> > + * ceph_end_io_read - declare that the buffered read operation is done
> > + * @inode: file inode
> > + *
> > + * Declare that a buffered read operation is done, and release the shared
> > + * lock on inode->i_rwsem.
> > + */
> > +void
> > +ceph_end_io_read(struct inode *inode)
> > +{
> > +    up_read(&inode->i_rwsem);
> > +}
> > +
> > +/**
> > + * ceph_start_io_write - declare the file is being used for buffered writes
> > + * @inode: file inode
> > + *
> > + * Declare that a buffered write operation is about to start, and ensure
> > + * that we block all direct I/O.
> > + */
> > +void
> > +ceph_start_io_write(struct inode *inode)
> > +{
> > +    down_write(&inode->i_rwsem);
> > +    ceph_block_o_direct(ceph_inode(inode), inode);
> > +}
> > +
> > +/**
> > + * ceph_end_io_write - declare that the buffered write operation is done
> > + * @inode: file inode
> > + *
> > + * Declare that a buffered write operation is done, and release the
> > + * lock on inode->i_rwsem.
> > + */
> > +void
> > +ceph_end_io_write(struct inode *inode)
> > +{
> > +    up_write(&inode->i_rwsem);
> > +}
> > +
> > +/* Call with exclusively locked inode->i_rwsem */
> > +static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode)
> > +{
> > +    lockdep_assert_held_write(&inode->i_rwsem);
> > +
> > +    if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) {
> > +            spin_lock(&ci->i_ceph_lock);
> > +            ci->i_ceph_flags |= CEPH_I_ODIRECT;
> > +            spin_unlock(&ci->i_ceph_lock);
> > +            /* FIXME: unmap_mapping_range? */
> > +            filemap_write_and_wait(inode->i_mapping);
> > +    }
> > +}
> > +
> > +/**
> > + * ceph_start_io_direct - declare the file is being used for direct i/o
> > + * @inode: file inode
> > + *
> > + * Declare that a direct I/O operation is about to start, and ensure
> > + * that we block all buffered I/O.
> > + * On exit, the function ensures that the CEPH_I_ODIRECT flag is set,
> > + * and holds a shared lock on inode->i_rwsem to ensure that the flag
> > + * cannot be changed.
> > + * In practice, this means that direct I/O operations are allowed to
> > + * execute in parallel, thanks to the shared lock, whereas buffered I/O
> > + * operations need to wait to grab an exclusive lock in order to clear
> > + * CEPH_I_ODIRECT.
> > + * Note that buffered writes and truncates both take a write lock on
> > + * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
> > + */
> > +void
> > +ceph_start_io_direct(struct inode *inode)
> > +{
> > +    struct ceph_inode_info *ci = ceph_inode(inode);
> > +
> > +    /* Be an optimist! */
> > +    down_read(&inode->i_rwsem);
> > +    if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)
> > +            return;
> > +    up_read(&inode->i_rwsem);
> > +    /* Slow path.... */
> > +    down_write(&inode->i_rwsem);
> > +    ceph_block_buffered(ci, inode);
> > +    downgrade_write(&inode->i_rwsem);
> > +}
> > +
> > +/**
> > + * ceph_end_io_direct - declare that the direct i/o operation is done
> > + * @inode: file inode
> > + *
> > + * Declare that a direct I/O operation is done, and release the shared
> > + * lock on inode->i_rwsem.
> > + */
> > +void
> > +ceph_end_io_direct(struct inode *inode)
> > +{
> > +    up_read(&inode->i_rwsem);
> > +}
> > diff --git a/fs/ceph/io.h b/fs/ceph/io.h
> > new file mode 100644
> > index 000000000000..fa594cd77348
> > --- /dev/null
> > +++ b/fs/ceph/io.h
> > @@ -0,0 +1,12 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef _FS_CEPH_IO_H
> > +#define _FS_CEPH_IO_H
> > +
> > +void ceph_start_io_read(struct inode *inode);
> > +void ceph_end_io_read(struct inode *inode);
> > +void ceph_start_io_write(struct inode *inode);
> > +void ceph_end_io_write(struct inode *inode);
> > +void ceph_start_io_direct(struct inode *inode);
> > +void ceph_end_io_direct(struct inode *inode);
> > +
> > +#endif /* _FS_CEPH_IO_H */
> > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > index 03e4828c7635..46a64293a3f7 100644
> > --- a/fs/ceph/super.h
> > +++ b/fs/ceph/super.h
> > @@ -516,7 +516,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
> >  #define CEPH_I_FLUSH_SNAPS       (1 << 9)  /* need flush snapss */
> >  #define CEPH_I_ERROR_WRITE       (1 << 10) /* have seen write errors */
> >  #define CEPH_I_ERROR_FILELOCK    (1 << 11) /* have seen file lock errors */
> > -
> > +#define CEPH_I_ODIRECT           (1 << 12) /* active direct I/O in progress */
> > 
> >  /*
> >   * Masks of ceph inode work.
> > -- 
> > 2.21.0
> > 
> 
> Reviewed-by

Thanks, Zheng. I cleaned up the comments and changelog, and added another
delta to remove the filemap_write_and_wait_range() call at the top of
ceph_direct_read_write() since it's no longer needed.

I've pushed the result to ceph-client/testing. Let me know if you hear of
any regressions with this (performance regressions in particular).
-- 
Jeff Layton <jlayton@xxxxxxxxxx>
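
For readers who haven't seen the nfs.ko scheme the patch borrows, below is a
minimal userspace sketch (an illustration only, not the kernel code) of the
"shared rwsem plus mode flag" idea: I/O that matches the inode's current mode
takes the lock shared, while a conflicting request takes it exclusive just
long enough to flip the flag. The names io_rwsem, odirect_mode and
start_io_read()/end_io_read() are invented for the example, and since
pthreads has no downgrade_write() the slow path simply retries; the direct
side would be symmetric, setting the flag and flushing the page cache before
proceeding.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t io_rwsem = PTHREAD_RWLOCK_INITIALIZER;
static atomic_bool odirect_mode;        /* stands in for CEPH_I_ODIRECT */

/* Buffered read: needs odirect_mode == false and holds the lock shared. */
static void start_io_read(void)
{
        for (;;) {
                pthread_rwlock_rdlock(&io_rwsem);
                if (!atomic_load(&odirect_mode))
                        return; /* fast path: already in buffered mode */
                pthread_rwlock_unlock(&io_rwsem);

                /* Slow path: flip the mode under the exclusive lock. */
                pthread_rwlock_wrlock(&io_rwsem);
                atomic_store(&odirect_mode, false);
                /* The kernel code also waits for in-flight direct I/O here. */
                pthread_rwlock_unlock(&io_rwsem);
                /* Retry; the kernel avoids the loop with downgrade_write(). */
        }
}

static void end_io_read(void)
{
        pthread_rwlock_unlock(&io_rwsem);
}

int main(void)
{
        start_io_read();
        printf("buffered read, odirect_mode=%d\n", atomic_load(&odirect_mode));
        end_io_read();
        return 0;
}

Once start_io_read() returns holding the shared lock with the flag clear, any
direct I/O must block on the exclusive lock before it can set the flag, which
is the exclusion the changelog describes.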