Re: [PATCH] ceph: add buffered/direct exclusionary locking for reads and writes

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, 2019-08-06 at 11:21 +0800, Yan, Zheng wrote:
> On Tue, Aug 6, 2019 at 4:05 AM Jeff Layton <jlayton@xxxxxxxxxx> wrote:
> > xfstest generic/451 intermittently fails. The test does O_DIRECT writes
> > to a file, and then reads back the result using buffered I/O, while
> > running a separate set of tasks that are also doing buffered reads.
> > 
> > The client will invalidate the cache prior to a direct write, but it's
> > easy for one of the other readers' replies to race in and reinstantiate
> > the invalidated range with stale data.
> > 
> > To fix this, we must serialize direct I/O writes and buffered reads.
> > We could just sprinkle in some shared locks on the i_rwsem for reads,
> > and increase the exclusive footprint on the write side, but that would
> > cause O_DIRECT writes to end up serialized vs. other direct requests.
> > 
> > Instead, borrow the scheme used by nfs.ko. Buffered writes take the
> > i_rwsem exclusively, but buffered reads and all O_DIRECT requests
> > take a shared lock, allowing them to run in parallel.
> > 
> > To fix the bug though, we need buffered reads and O_DIRECT I/O to not
> > run in parallel. A flag on the ceph_inode_info is used to indicate
> > whether it's in direct or buffered I/O mode. When a conflicting request
> > is submitted, it will block until the inode can flip to the necessary
> > mode.
> > 
> > Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx>
> > ---
> >  fs/ceph/Makefile |   2 +-
> >  fs/ceph/file.c   |  28 ++++++--
> >  fs/ceph/io.c     | 163 +++++++++++++++++++++++++++++++++++++++++++++++
> >  fs/ceph/io.h     |  12 ++++
> >  fs/ceph/super.h  |   2 +-
> >  5 files changed, 198 insertions(+), 9 deletions(-)
> >  create mode 100644 fs/ceph/io.c
> >  create mode 100644 fs/ceph/io.h
> > 
> > diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
> > index a699e320393f..c1da294418d1 100644
> > --- a/fs/ceph/Makefile
> > +++ b/fs/ceph/Makefile
> > @@ -6,7 +6,7 @@
> >  obj-$(CONFIG_CEPH_FS) += ceph.o
> > 
> >  ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
> > -       export.o caps.o snap.o xattr.o quota.o \
> > +       export.o caps.o snap.o xattr.o quota.o io.o \
> >         mds_client.o mdsmap.o strings.o ceph_frag.o \
> >         debugfs.o
> > 
> > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > index 5182e1a49d6f..d7d264e05303 100644
> > --- a/fs/ceph/file.c
> > +++ b/fs/ceph/file.c
> > @@ -15,6 +15,7 @@
> >  #include "super.h"
> >  #include "mds_client.h"
> >  #include "cache.h"
> > +#include "io.h"
> > 
> >  static __le32 ceph_flags_sys2wire(u32 flags)
> >  {
> > @@ -1284,12 +1285,16 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
> > 
> >                 if (ci->i_inline_version == CEPH_INLINE_NONE) {
> >                         if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
> > +                               ceph_start_io_direct(inode);
> >                                 ret = ceph_direct_read_write(iocb, to,
> >                                                              NULL, NULL);
> > +                               ceph_end_io_direct(inode);
> >                                 if (ret >= 0 && ret < len)
> >                                         retry_op = CHECK_EOF;
> >                         } else {
> > +                               ceph_start_io_read(inode);
> >                                 ret = ceph_sync_read(iocb, to, &retry_op);
> > +                               ceph_end_io_read(inode);
> >                         }
> >                 } else {
> >                         retry_op = READ_INLINE;
> > @@ -1300,7 +1305,9 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
> >                      inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
> >                      ceph_cap_string(got));
> >                 ceph_add_rw_context(fi, &rw_ctx);
> > +               ceph_start_io_read(inode);
> >                 ret = generic_file_read_iter(iocb, to);
> > +               ceph_end_io_read(inode);
> >                 ceph_del_rw_context(fi, &rw_ctx);
> >         }
> >         dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
> > @@ -1409,7 +1416,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> >                 return -ENOMEM;
> > 
> >  retry_snap:
> > -       inode_lock(inode);
> > +       if (iocb->ki_flags & IOCB_DIRECT)
> > +               ceph_start_io_direct(inode);
> > +       else
> > +               ceph_start_io_write(inode);
> > 
> >         /* We can write back this queue in page reclaim */
> >         current->backing_dev_info = inode_to_bdi(inode);
> > @@ -1480,7 +1490,6 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> >             (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) {
> >                 struct ceph_snap_context *snapc;
> >                 struct iov_iter data;
> > -               inode_unlock(inode);
> > 
> >                 spin_lock(&ci->i_ceph_lock);
> >                 if (__ceph_have_pending_cap_snap(ci)) {
> > @@ -1497,11 +1506,14 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> > 
> >                 /* we might need to revert back to that point */
> >                 data = *from;
> > -               if (iocb->ki_flags & IOCB_DIRECT)
> > +               if (iocb->ki_flags & IOCB_DIRECT) {
> >                         written = ceph_direct_read_write(iocb, &data, snapc,
> >                                                          &prealloc_cf);
> > -               else
> > +                       ceph_end_io_direct(inode);
> > +               } else {
> >                         written = ceph_sync_write(iocb, &data, pos, snapc);
> > +                       ceph_end_io_write(inode);
> > +               }
> >                 if (written > 0)
> >                         iov_iter_advance(from, written);
> >                 ceph_put_snap_context(snapc);
> > @@ -1516,7 +1528,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> >                 written = generic_perform_write(file, from, pos);
> >                 if (likely(written >= 0))
> >                         iocb->ki_pos = pos + written;
> > -               inode_unlock(inode);
> > +               ceph_end_io_write(inode);
> >         }
> > 
> >         if (written >= 0) {
> > @@ -1551,9 +1563,11 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> >         }
> > 
> >         goto out_unlocked;
> > -
> >  out:
> > -       inode_unlock(inode);
> > +       if (iocb->ki_flags & IOCB_DIRECT)
> > +               ceph_end_io_direct(inode);
> > +       else
> > +               ceph_end_io_write(inode);
> >  out_unlocked:
> >         ceph_free_cap_flush(prealloc_cf);
> >         current->backing_dev_info = NULL;
> > diff --git a/fs/ceph/io.c b/fs/ceph/io.c
> > new file mode 100644
> > index 000000000000..8367cbcc81e8
> > --- /dev/null
> > +++ b/fs/ceph/io.c
> > @@ -0,0 +1,163 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * Copyright (c) 2016 Trond Myklebust
> > + * Copyright (c) 2019 Jeff Layton
> > + *
> > + * I/O and data path helper functionality.
> > + *
> > + * Heavily borrowed from equivalent code in fs/nfs/io.c
> > + */
> > +
> > +#include <linux/ceph/ceph_debug.h>
> > +
> > +#include <linux/types.h>
> > +#include <linux/kernel.h>
> > +#include <linux/rwsem.h>
> > +#include <linux/fs.h>
> > +
> > +#include "super.h"
> > +#include "io.h"
> > +
> > +/* Call with exclusively locked inode->i_rwsem */
> > +static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode)
> > +{
> > +       lockdep_assert_held_write(&inode->i_rwsem);
> > +
> > +       if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) {
> > +               spin_lock(&ci->i_ceph_lock);
> > +               ci->i_ceph_flags &= ~CEPH_I_ODIRECT;
> > +               spin_unlock(&ci->i_ceph_lock);
> > +               inode_dio_wait(inode);
> > +       }
> > +}
> > +
> > +/**
> > + * ceph_start_io_read - declare the file is being used for buffered reads
> > + * @inode: file inode
> > + *
> > + * Declare that a buffered read operation is about to start, and ensure
> > + * that we block all direct I/O.
> > + * On exit, the function ensures that the CEPH_I_ODIRECT flag is unset,
> > + * and holds a shared lock on inode->i_rwsem to ensure that the flag
> > + * cannot be changed.
> > + * In practice, this means that buffered read operations are allowed to
> > + * execute in parallel, thanks to the shared lock, whereas direct I/O
> > + * operations need to wait to grab an exclusive lock in order to set
> > + * CEPH_I_ODIRECT.
> > + * Note that buffered writes and truncates both take a write lock on
> > + * inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
> > + */
> > +void
> > +ceph_start_io_read(struct inode *inode)
> > +{
> > +       struct ceph_inode_info *ci = ceph_inode(inode);
> > +
> > +       /* Be an optimist! */
> > +       down_read(&inode->i_rwsem);
> > +       if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT))
> > +               return;
> > +       up_read(&inode->i_rwsem);
> > +       /* Slow path.... */
> > +       down_write(&inode->i_rwsem);
> > +       ceph_block_o_direct(ci, inode);
> > +       downgrade_write(&inode->i_rwsem);
> > +}
> > +
> > +/**
> > + * ceph_end_io_read - declare that the buffered read operation is done
> > + * @inode: file inode
> > + *
> > + * Declare that a buffered read operation is done, and release the shared
> > + * lock on inode->i_rwsem.
> > + */
> > +void
> > +ceph_end_io_read(struct inode *inode)
> > +{
> > +       up_read(&inode->i_rwsem);
> > +}
> > +
> > +/**
> > + * ceph_start_io_write - declare the file is being used for buffered writes
> > + * @inode: file inode
> > + *
> > + * Declare that a buffered write operation is about to start, and ensure
> > + * that we block all direct I/O.
> > + */
> > +void
> > +ceph_start_io_write(struct inode *inode)
> > +{
> > +       down_write(&inode->i_rwsem);
> > +       ceph_block_o_direct(ceph_inode(inode), inode);
> > +}
> > +
> > +/**
> > + * ceph_end_io_write - declare that the buffered write operation is done
> > + * @inode: file inode
> > + *
> > + * Declare that a buffered write operation is done, and release the
> > + * lock on inode->i_rwsem.
> > + */
> > +void
> > +ceph_end_io_write(struct inode *inode)
> > +{
> > +       up_write(&inode->i_rwsem);
> > +}
> > +
> > +/* Call with exclusively locked inode->i_rwsem */
> > +static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode)
> > +{
> > +       lockdep_assert_held_write(&inode->i_rwsem);
> > +
> > +       if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) {
> > +               spin_lock(&ci->i_ceph_lock);
> > +               ci->i_ceph_flags |= CEPH_I_ODIRECT;
> > +               spin_unlock(&ci->i_ceph_lock);
> > +               /* FIXME: unmap_mapping_range? */
> > +               filemap_write_and_wait(inode->i_mapping);
> > +       }
> > +}
> > +
> > +/**
> > + * ceph_start_io_direct - declare the file is being used for direct i/o
> > + * @inode: file inode
> > + *
> > + * Declare that a direct I/O operation is about to start, and ensure
> > + * that we block all buffered I/O.
> > + * On exit, the function ensures that the CEPH_I_ODIRECT flag is set,
> > + * and holds a shared lock on inode->i_rwsem to ensure that the flag
> > + * cannot be changed.
> > + * In practice, this means that direct I/O operations are allowed to
> > + * execute in parallel, thanks to the shared lock, whereas buffered I/O
> > + * operations need to wait to grab an exclusive lock in order to clear
> > + * CEPH_I_ODIRECT.
> > + * Note that buffered writes and truncates both take a write lock on
> > + * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
> > + */
> > +void
> > +ceph_start_io_direct(struct inode *inode)
> > +{
> > +       struct ceph_inode_info *ci = ceph_inode(inode);
> > +
> > +       /* Be an optimist! */
> > +       down_read(&inode->i_rwsem);
> > +       if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)
> > +               return;
> > +       up_read(&inode->i_rwsem);
> > +       /* Slow path.... */
> > +       down_write(&inode->i_rwsem);
> > +       ceph_block_buffered(ci, inode);
> > +       downgrade_write(&inode->i_rwsem);
> > +}
> > +
> > +/**
> > + * ceph_end_io_direct - declare that the direct i/o operation is done
> > + * @inode: file inode
> > + *
> > + * Declare that a direct I/O operation is done, and release the shared
> > + * lock on inode->i_rwsem.
> > + */
> > +void
> > +ceph_end_io_direct(struct inode *inode)
> > +{
> > +       up_read(&inode->i_rwsem);
> > +}
> > diff --git a/fs/ceph/io.h b/fs/ceph/io.h
> > new file mode 100644
> > index 000000000000..fa594cd77348
> > --- /dev/null
> > +++ b/fs/ceph/io.h
> > @@ -0,0 +1,12 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef _FS_CEPH_IO_H
> > +#define _FS_CEPH_IO_H
> > +
> > +void ceph_start_io_read(struct inode *inode);
> > +void ceph_end_io_read(struct inode *inode);
> > +void ceph_start_io_write(struct inode *inode);
> > +void ceph_end_io_write(struct inode *inode);
> > +void ceph_start_io_direct(struct inode *inode);
> > +void ceph_end_io_direct(struct inode *inode);
> > +
> > +#endif /* _FS_CEPH_IO_H */
> > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > index 03e4828c7635..46a64293a3f7 100644
> > --- a/fs/ceph/super.h
> > +++ b/fs/ceph/super.h
> > @@ -516,7 +516,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
> >  #define CEPH_I_FLUSH_SNAPS     (1 << 9) /* need flush snapss */
> >  #define CEPH_I_ERROR_WRITE     (1 << 10) /* have seen write errors */
> >  #define CEPH_I_ERROR_FILELOCK  (1 << 11) /* have seen file lock errors */
> > -
> > +#define CEPH_I_ODIRECT         (1 << 12) /* active direct I/O in progress */
> > 
> >  /*
> >   * Masks of ceph inode work.
> > --
> > 2.21.0
> > 
> Reviewed-by

Thanks, Zheng.

I cleaned up the comments and changelog, and added another delta to
remove the filemap_write_and_wait_range() call at the top of
ceph_direct_read_write() since it's no longer needed.

I've pushed the result to ceph-client/testing. Let me know if you hear
of any regressions with this (performance regressions in particular).
-- 
Jeff Layton <jlayton@xxxxxxxxxx>




[Index of Archives]     [CEPH Users]     [Ceph Large]     [Ceph Dev]     [Information on CEPH]     [Linux BTRFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux