Re: [PATCH] ceph: add buffered/direct exclusionary locking for reads and writes

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Aug 6, 2019 at 4:05 AM Jeff Layton <jlayton@xxxxxxxxxx> wrote:
>
> xfstest generic/451 intermittently fails. The test does O_DIRECT writes
> to a file, and then reads back the result using buffered I/O, while
> running a separate set of tasks that are also doing buffered reads.
>
> The client will invalidate the cache prior to a direct write, but it's
> easy for one of the other readers' replies to race in and reinstantiate
> the invalidated range with stale data.
>
> To fix this, we must to serialize direct I/O writes and buffered reads.
> We could just sprinkle in some shared locks on the i_rwsem for reads,
> and increase the exclusive footprint on the write side, but that would
> cause O_DIRECT writes to end up serialized vs. other direct requests.
>
> Instead, borrow the scheme used by nfs.ko. Buffered writes take the
> i_rwsem exclusively, but buffered reads and all O_DIRECT requests
> take a shared lock, allowing them to run in parallel.
>
> To fix the bug though, we need buffered reads and O_DIRECT I/O to not
> run in parallel. A flag on the ceph_inode_info is used to indicate
> whether it's in direct or buffered I/O mode. When a conflicting request
> is submitted, it will block until the inode can flip to the necessary
> mode.
>
> Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx>
> ---
>  fs/ceph/Makefile |   2 +-
>  fs/ceph/file.c   |  28 ++++++--
>  fs/ceph/io.c     | 163 +++++++++++++++++++++++++++++++++++++++++++++++
>  fs/ceph/io.h     |  12 ++++
>  fs/ceph/super.h  |   2 +-
>  5 files changed, 198 insertions(+), 9 deletions(-)
>  create mode 100644 fs/ceph/io.c
>  create mode 100644 fs/ceph/io.h
>
> diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
> index a699e320393f..c1da294418d1 100644
> --- a/fs/ceph/Makefile
> +++ b/fs/ceph/Makefile
> @@ -6,7 +6,7 @@
>  obj-$(CONFIG_CEPH_FS) += ceph.o
>
>  ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
> -       export.o caps.o snap.o xattr.o quota.o \
> +       export.o caps.o snap.o xattr.o quota.o io.o \
>         mds_client.o mdsmap.o strings.o ceph_frag.o \
>         debugfs.o
>
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 5182e1a49d6f..d7d264e05303 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -15,6 +15,7 @@
>  #include "super.h"
>  #include "mds_client.h"
>  #include "cache.h"
> +#include "io.h"
>
>  static __le32 ceph_flags_sys2wire(u32 flags)
>  {
> @@ -1284,12 +1285,16 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
>
>                 if (ci->i_inline_version == CEPH_INLINE_NONE) {
>                         if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
> +                               ceph_start_io_direct(inode);
>                                 ret = ceph_direct_read_write(iocb, to,
>                                                              NULL, NULL);
> +                               ceph_end_io_direct(inode);
>                                 if (ret >= 0 && ret < len)
>                                         retry_op = CHECK_EOF;
>                         } else {
> +                               ceph_start_io_read(inode);
>                                 ret = ceph_sync_read(iocb, to, &retry_op);
> +                               ceph_end_io_read(inode);
>                         }
>                 } else {
>                         retry_op = READ_INLINE;
> @@ -1300,7 +1305,9 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
>                      inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
>                      ceph_cap_string(got));
>                 ceph_add_rw_context(fi, &rw_ctx);
> +               ceph_start_io_read(inode);
>                 ret = generic_file_read_iter(iocb, to);
> +               ceph_end_io_read(inode);
>                 ceph_del_rw_context(fi, &rw_ctx);
>         }
>         dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
> @@ -1409,7 +1416,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
>                 return -ENOMEM;
>
>  retry_snap:
> -       inode_lock(inode);
> +       if (iocb->ki_flags & IOCB_DIRECT)
> +               ceph_start_io_direct(inode);
> +       else
> +               ceph_start_io_write(inode);
>
>         /* We can write back this queue in page reclaim */
>         current->backing_dev_info = inode_to_bdi(inode);
> @@ -1480,7 +1490,6 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
>             (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) {
>                 struct ceph_snap_context *snapc;
>                 struct iov_iter data;
> -               inode_unlock(inode);
>
>                 spin_lock(&ci->i_ceph_lock);
>                 if (__ceph_have_pending_cap_snap(ci)) {
> @@ -1497,11 +1506,14 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
>
>                 /* we might need to revert back to that point */
>                 data = *from;
> -               if (iocb->ki_flags & IOCB_DIRECT)
> +               if (iocb->ki_flags & IOCB_DIRECT) {
>                         written = ceph_direct_read_write(iocb, &data, snapc,
>                                                          &prealloc_cf);
> -               else
> +                       ceph_end_io_direct(inode);
> +               } else {
>                         written = ceph_sync_write(iocb, &data, pos, snapc);
> +                       ceph_end_io_write(inode);
> +               }
>                 if (written > 0)
>                         iov_iter_advance(from, written);
>                 ceph_put_snap_context(snapc);
> @@ -1516,7 +1528,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
>                 written = generic_perform_write(file, from, pos);
>                 if (likely(written >= 0))
>                         iocb->ki_pos = pos + written;
> -               inode_unlock(inode);
> +               ceph_end_io_write(inode);
>         }
>
>         if (written >= 0) {
> @@ -1551,9 +1563,11 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
>         }
>
>         goto out_unlocked;
> -
>  out:
> -       inode_unlock(inode);
> +       if (iocb->ki_flags & IOCB_DIRECT)
> +               ceph_end_io_direct(inode);
> +       else
> +               ceph_end_io_write(inode);
>  out_unlocked:
>         ceph_free_cap_flush(prealloc_cf);
>         current->backing_dev_info = NULL;
> diff --git a/fs/ceph/io.c b/fs/ceph/io.c
> new file mode 100644
> index 000000000000..8367cbcc81e8
> --- /dev/null
> +++ b/fs/ceph/io.c
> @@ -0,0 +1,163 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2016 Trond Myklebust
> + * Copyright (c) 2019 Jeff Layton
> + *
> + * I/O and data path helper functionality.
> + *
> + * Heavily borrowed from equivalent code in fs/nfs/io.c
> + */
> +
> +#include <linux/ceph/ceph_debug.h>
> +
> +#include <linux/types.h>
> +#include <linux/kernel.h>
> +#include <linux/rwsem.h>
> +#include <linux/fs.h>
> +
> +#include "super.h"
> +#include "io.h"
> +
> +/* Call with exclusively locked inode->i_rwsem */
> +static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode)
> +{
> +       lockdep_assert_held_write(&inode->i_rwsem);
> +
> +       if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) {
> +               spin_lock(&ci->i_ceph_lock);
> +               ci->i_ceph_flags &= ~CEPH_I_ODIRECT;
> +               spin_unlock(&ci->i_ceph_lock);
> +               inode_dio_wait(inode);
> +       }
> +}
> +
> +/**
> + * ceph_start_io_read - declare the file is being used for buffered reads
> + * @inode: file inode
> + *
> + * Declare that a buffered read operation is about to start, and ensure
> + * that we block all direct I/O.
> + * On exit, the function ensures that the CEPH_I_ODIRECT flag is unset,
> + * and holds a shared lock on inode->i_rwsem to ensure that the flag
> + * cannot be changed.
> + * In practice, this means that buffered read operations are allowed to
> + * execute in parallel, thanks to the shared lock, whereas direct I/O
> + * operations need to wait to grab an exclusive lock in order to set
> + * CEPH_I_ODIRECT.
> + * Note that buffered writes and truncates both take a write lock on
> + * inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
> + */
> +void
> +ceph_start_io_read(struct inode *inode)
> +{
> +       struct ceph_inode_info *ci = ceph_inode(inode);
> +
> +       /* Be an optimist! */
> +       down_read(&inode->i_rwsem);
> +       if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT))
> +               return;
> +       up_read(&inode->i_rwsem);
> +       /* Slow path.... */
> +       down_write(&inode->i_rwsem);
> +       ceph_block_o_direct(ci, inode);
> +       downgrade_write(&inode->i_rwsem);
> +}
> +
> +/**
> + * ceph_end_io_read - declare that the buffered read operation is done
> + * @inode: file inode
> + *
> + * Declare that a buffered read operation is done, and release the shared
> + * lock on inode->i_rwsem.
> + */
> +void
> +ceph_end_io_read(struct inode *inode)
> +{
> +       up_read(&inode->i_rwsem);
> +}
> +
> +/**
> + * ceph_start_io_write - declare the file is being used for buffered writes
> + * @inode: file inode
> + *
> + * Declare that a buffered read operation is about to start, and ensure
> + * that we block all direct I/O.
> + */
> +void
> +ceph_start_io_write(struct inode *inode)
> +{
> +       down_write(&inode->i_rwsem);
> +       ceph_block_o_direct(ceph_inode(inode), inode);
> +}
> +
> +/**
> + * ceph_end_io_write - declare that the buffered write operation is done
> + * @inode: file inode
> + *
> + * Declare that a buffered write operation is done, and release the
> + * lock on inode->i_rwsem.
> + */
> +void
> +ceph_end_io_write(struct inode *inode)
> +{
> +       up_write(&inode->i_rwsem);
> +}
> +
> +/* Call with exclusively locked inode->i_rwsem */
> +static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode)
> +{
> +       lockdep_assert_held_write(&inode->i_rwsem);
> +
> +       if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) {
> +               spin_lock(&ci->i_ceph_lock);
> +               ci->i_ceph_flags |= CEPH_I_ODIRECT;
> +               spin_unlock(&ci->i_ceph_lock);
> +               /* FIXME: unmap_mapping_range? */
> +               filemap_write_and_wait(inode->i_mapping);
> +       }
> +}
> +
> +/**
> + * ceph_end_io_direct - declare the file is being used for direct i/o
> + * @inode: file inode
> + *
> + * Declare that a direct I/O operation is about to start, and ensure
> + * that we block all buffered I/O.
> + * On exit, the function ensures that the CEPH_I_ODIRECT flag is set,
> + * and holds a shared lock on inode->i_rwsem to ensure that the flag
> + * cannot be changed.
> + * In practice, this means that direct I/O operations are allowed to
> + * execute in parallel, thanks to the shared lock, whereas buffered I/O
> + * operations need to wait to grab an exclusive lock in order to clear
> + * CEPH_I_ODIRECT.
> + * Note that buffered writes and truncates both take a write lock on
> + * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
> + */
> +void
> +ceph_start_io_direct(struct inode *inode)
> +{
> +       struct ceph_inode_info *ci = ceph_inode(inode);
> +
> +       /* Be an optimist! */
> +       down_read(&inode->i_rwsem);
> +       if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)
> +               return;
> +       up_read(&inode->i_rwsem);
> +       /* Slow path.... */
> +       down_write(&inode->i_rwsem);
> +       ceph_block_buffered(ci, inode);
> +       downgrade_write(&inode->i_rwsem);
> +}
> +
> +/**
> + * ceph_end_io_direct - declare that the direct i/o operation is done
> + * @inode: file inode
> + *
> + * Declare that a direct I/O operation is done, and release the shared
> + * lock on inode->i_rwsem.
> + */
> +void
> +ceph_end_io_direct(struct inode *inode)
> +{
> +       up_read(&inode->i_rwsem);
> +}
> diff --git a/fs/ceph/io.h b/fs/ceph/io.h
> new file mode 100644
> index 000000000000..fa594cd77348
> --- /dev/null
> +++ b/fs/ceph/io.h
> @@ -0,0 +1,12 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _FS_CEPH_IO_H
> +#define _FS_CEPH_IO_H
> +
> +void ceph_start_io_read(struct inode *inode);
> +void ceph_end_io_read(struct inode *inode);
> +void ceph_start_io_write(struct inode *inode);
> +void ceph_end_io_write(struct inode *inode);
> +void ceph_start_io_direct(struct inode *inode);
> +void ceph_end_io_direct(struct inode *inode);
> +
> +#endif /* FS_CEPH_IO_H */
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 03e4828c7635..46a64293a3f7 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -516,7 +516,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
>  #define CEPH_I_FLUSH_SNAPS     (1 << 9) /* need flush snapss */
>  #define CEPH_I_ERROR_WRITE     (1 << 10) /* have seen write errors */
>  #define CEPH_I_ERROR_FILELOCK  (1 << 11) /* have seen file lock errors */
> -
> +#define CEPH_I_ODIRECT         (1 << 12) /* active direct I/O in progress */
>
>  /*
>   * Masks of ceph inode work.
> --
> 2.21.0
>
Reviewd-by



[Index of Archives]     [CEPH Users]     [Ceph Large]     [Ceph Dev]     [Information on CEPH]     [Linux BTRFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux