Re: [PATCH 5/7] NFS: Convert lookups of the open context to RCU

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Trond,

This patch ends up fixing a problem related to recall of delegations.
This problem has existed in the kernel for quite awhile (early RHEL
3.10-based release show the problem, haven't tried 2.6-based).

If you agree with what I present, then can the commit message for this
patch be changed to reflect that it solves this problem?

Problem can be seen by running "nfstest_delegation --runtests
recall26". Jorge can probably describe the test better but I believe
the main client machine opens a file for read and gets a read
delegation to it and locks it, then from a different process in the
same machine there is an appending open (open for write is sent and
gets a write delegation). From a different machine another client
opens the same file which triggers a CB_RECALL. The main client,
un-patched, ends up simply returning the delegation and never recovers
the lock. This is pretty bad as it's possible data corruption case.

In the nfs_delegation_claim_locks() this condition --  if
(nfs_file_open_context(fl->fl_file) != ctx) --  is true when it should
have been false.

There seems to be a disagreement between what's in
nfs_file_open_context(fl->fl_file) and what
get_nfs_open_context(inode's ctx) returns but I don't think how this
patch fixes it. But it does.
On Wed, Sep 5, 2018 at 3:27 PM Trond Myklebust <trondmy@xxxxxxxxx> wrote:
>
> Reduce contention on the inode->i_lock by ensuring that we use RCU
> when looking up the NFS open context.
>
> Signed-off-by: Trond Myklebust <trond.myklebust@xxxxxxxxxxxxxxx>
> ---
>  fs/nfs/delegation.c    | 11 ++++++-----
>  fs/nfs/inode.c         | 35 +++++++++++++++--------------------
>  fs/nfs/nfs4proc.c      | 30 ++++++++++++++++++++++++------
>  fs/nfs/nfs4state.c     | 12 ++++++------
>  fs/nfs/pnfs.c          |  5 ++++-
>  include/linux/nfs_fs.h |  1 +
>  6 files changed, 56 insertions(+), 38 deletions(-)
>
> diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
> index f033f3a69a3b..76d205d1c7bc 100644
> --- a/fs/nfs/delegation.c
> +++ b/fs/nfs/delegation.c
> @@ -136,8 +136,8 @@ static int nfs_delegation_claim_opens(struct inode *inode,
>         int err;
>
>  again:
> -       spin_lock(&inode->i_lock);
> -       list_for_each_entry(ctx, &nfsi->open_files, list) {
> +       rcu_read_lock();
> +       list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
>                 state = ctx->state;
>                 if (state == NULL)
>                         continue;
> @@ -147,8 +147,9 @@ static int nfs_delegation_claim_opens(struct inode *inode,
>                         continue;
>                 if (!nfs4_stateid_match(&state->stateid, stateid))
>                         continue;
> -               get_nfs_open_context(ctx);
> -               spin_unlock(&inode->i_lock);
> +               if (!get_nfs_open_context(ctx))
> +                       continue;
> +               rcu_read_unlock();
>                 sp = state->owner;
>                 /* Block nfs4_proc_unlck */
>                 mutex_lock(&sp->so_delegreturn_mutex);
> @@ -164,7 +165,7 @@ static int nfs_delegation_claim_opens(struct inode *inode,
>                         return err;
>                 goto again;
>         }
> -       spin_unlock(&inode->i_lock);
> +       rcu_read_unlock();
>         return 0;
>  }
>
> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
> index 052db41a7f80..5b1eee4952b7 100644
> --- a/fs/nfs/inode.c
> +++ b/fs/nfs/inode.c
> @@ -977,9 +977,9 @@ EXPORT_SYMBOL_GPL(alloc_nfs_open_context);
>
>  struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
>  {
> -       if (ctx != NULL)
> -               refcount_inc(&ctx->lock_context.count);
> -       return ctx;
> +       if (ctx != NULL && refcount_inc_not_zero(&ctx->lock_context.count))
> +               return ctx;
> +       return NULL;
>  }
>  EXPORT_SYMBOL_GPL(get_nfs_open_context);
>
> @@ -988,13 +988,13 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
>         struct inode *inode = d_inode(ctx->dentry);
>         struct super_block *sb = ctx->dentry->d_sb;
>
> +       if (!refcount_dec_and_test(&ctx->lock_context.count))
> +               return;
>         if (!list_empty(&ctx->list)) {
> -               if (!refcount_dec_and_lock(&ctx->lock_context.count, &inode->i_lock))
> -                       return;
> -               list_del(&ctx->list);
> +               spin_lock(&inode->i_lock);
> +               list_del_rcu(&ctx->list);
>                 spin_unlock(&inode->i_lock);
> -       } else if (!refcount_dec_and_test(&ctx->lock_context.count))
> -               return;
> +       }
>         if (inode != NULL)
>                 NFS_PROTO(inode)->close_context(ctx, is_sync);
>         if (ctx->cred != NULL)
> @@ -1002,7 +1002,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
>         dput(ctx->dentry);
>         nfs_sb_deactive(sb);
>         kfree(ctx->mdsthreshold);
> -       kfree(ctx);
> +       kfree_rcu(ctx, rcu_head);
>  }
>
>  void put_nfs_open_context(struct nfs_open_context *ctx)
> @@ -1026,10 +1026,7 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
>         struct nfs_inode *nfsi = NFS_I(inode);
>
>         spin_lock(&inode->i_lock);
> -       if (ctx->mode & FMODE_WRITE)
> -               list_add(&ctx->list, &nfsi->open_files);
> -       else
> -               list_add_tail(&ctx->list, &nfsi->open_files);
> +       list_add_tail_rcu(&ctx->list, &nfsi->open_files);
>         spin_unlock(&inode->i_lock);
>  }
>  EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context);
> @@ -1050,16 +1047,17 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
>         struct nfs_inode *nfsi = NFS_I(inode);
>         struct nfs_open_context *pos, *ctx = NULL;
>
> -       spin_lock(&inode->i_lock);
> -       list_for_each_entry(pos, &nfsi->open_files, list) {
> +       rcu_read_lock();
> +       list_for_each_entry_rcu(pos, &nfsi->open_files, list) {
>                 if (cred != NULL && pos->cred != cred)
>                         continue;
>                 if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode)
>                         continue;
>                 ctx = get_nfs_open_context(pos);
> -               break;
> +               if (ctx)
> +                       break;
>         }
> -       spin_unlock(&inode->i_lock);
> +       rcu_read_unlock();
>         return ctx;
>  }
>
> @@ -1077,9 +1075,6 @@ void nfs_file_clear_open_context(struct file *filp)
>                 if (ctx->error < 0)
>                         invalidate_inode_pages2(inode->i_mapping);
>                 filp->private_data = NULL;
> -               spin_lock(&inode->i_lock);
> -               list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
> -               spin_unlock(&inode->i_lock);
>                 put_nfs_open_context_sync(ctx);
>         }
>  }
> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
> index 8220a168282e..10c20a5b075d 100644
> --- a/fs/nfs/nfs4proc.c
> +++ b/fs/nfs/nfs4proc.c
> @@ -1933,23 +1933,41 @@ nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
>         return ret;
>  }
>
> -static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state)
> +static struct nfs_open_context *
> +nfs4_state_find_open_context_mode(struct nfs4_state *state, fmode_t mode)
>  {
>         struct nfs_inode *nfsi = NFS_I(state->inode);
>         struct nfs_open_context *ctx;
>
> -       spin_lock(&state->inode->i_lock);
> -       list_for_each_entry(ctx, &nfsi->open_files, list) {
> +       rcu_read_lock();
> +       list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
>                 if (ctx->state != state)
>                         continue;
> -               get_nfs_open_context(ctx);
> -               spin_unlock(&state->inode->i_lock);
> +               if ((ctx->mode & mode) != mode)
> +                       continue;
> +               if (!get_nfs_open_context(ctx))
> +                       continue;
> +               rcu_read_unlock();
>                 return ctx;
>         }
> -       spin_unlock(&state->inode->i_lock);
> +       rcu_read_unlock();
>         return ERR_PTR(-ENOENT);
>  }
>
> +static struct nfs_open_context *
> +nfs4_state_find_open_context(struct nfs4_state *state)
> +{
> +       struct nfs_open_context *ctx;
> +
> +       ctx = nfs4_state_find_open_context_mode(state, FMODE_READ|FMODE_WRITE);
> +       if (!IS_ERR(ctx))
> +               return ctx;
> +       ctx = nfs4_state_find_open_context_mode(state, FMODE_WRITE);
> +       if (!IS_ERR(ctx))
> +               return ctx;
> +       return nfs4_state_find_open_context_mode(state, FMODE_READ);
> +}
> +
>  static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx,
>                 struct nfs4_state *state, enum open_claim_type4 claim)
>  {
> diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
> index 40a08cd483f0..be92ce4259e9 100644
> --- a/fs/nfs/nfs4state.c
> +++ b/fs/nfs/nfs4state.c
> @@ -1437,8 +1437,8 @@ void nfs_inode_find_state_and_recover(struct inode *inode,
>         struct nfs4_state *state;
>         bool found = false;
>
> -       spin_lock(&inode->i_lock);
> -       list_for_each_entry(ctx, &nfsi->open_files, list) {
> +       rcu_read_lock();
> +       list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
>                 state = ctx->state;
>                 if (state == NULL)
>                         continue;
> @@ -1456,7 +1456,7 @@ void nfs_inode_find_state_and_recover(struct inode *inode,
>                     nfs4_state_mark_reclaim_nograce(clp, state))
>                         found = true;
>         }
> -       spin_unlock(&inode->i_lock);
> +       rcu_read_unlock();
>
>         nfs_inode_find_delegation_state_and_recover(inode, stateid);
>         if (found)
> @@ -1469,13 +1469,13 @@ static void nfs4_state_mark_open_context_bad(struct nfs4_state *state)
>         struct nfs_inode *nfsi = NFS_I(inode);
>         struct nfs_open_context *ctx;
>
> -       spin_lock(&inode->i_lock);
> -       list_for_each_entry(ctx, &nfsi->open_files, list) {
> +       rcu_read_lock();
> +       list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
>                 if (ctx->state != state)
>                         continue;
>                 set_bit(NFS_CONTEXT_BAD, &ctx->flags);
>         }
> -       spin_unlock(&inode->i_lock);
> +       rcu_read_unlock();
>  }
>
>  static void nfs4_state_mark_recovery_failed(struct nfs4_state *state, int error)
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index c5672c02afd6..06cb90e9bc6e 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -1339,6 +1339,7 @@ bool pnfs_roc(struct inode *ino,
>         if (!nfs_have_layout(ino))
>                 return false;
>  retry:
> +       rcu_read_lock();
>         spin_lock(&ino->i_lock);
>         lo = nfsi->layout;
>         if (!lo || !pnfs_layout_is_valid(lo) ||
> @@ -1349,6 +1350,7 @@ bool pnfs_roc(struct inode *ino,
>         pnfs_get_layout_hdr(lo);
>         if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) {
>                 spin_unlock(&ino->i_lock);
> +               rcu_read_unlock();
>                 wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN,
>                                 TASK_UNINTERRUPTIBLE);
>                 pnfs_put_layout_hdr(lo);
> @@ -1362,7 +1364,7 @@ bool pnfs_roc(struct inode *ino,
>                 skip_read = true;
>         }
>
> -       list_for_each_entry(ctx, &nfsi->open_files, list) {
> +       list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
>                 state = ctx->state;
>                 if (state == NULL)
>                         continue;
> @@ -1410,6 +1412,7 @@ bool pnfs_roc(struct inode *ino,
>
>  out_noroc:
>         spin_unlock(&ino->i_lock);
> +       rcu_read_unlock();
>         pnfs_layoutcommit_inode(ino, true);
>         if (roc) {
>                 struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
> diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
> index d2f4f88a0e66..6e0417c02279 100644
> --- a/include/linux/nfs_fs.h
> +++ b/include/linux/nfs_fs.h
> @@ -83,6 +83,7 @@ struct nfs_open_context {
>
>         struct list_head list;
>         struct nfs4_threshold   *mdsthreshold;
> +       struct rcu_head rcu_head;
>  };
>
>  struct nfs_open_dir_context {
> --
> 2.17.1
>



[Index of Archives]     [Linux Filesystem Development]     [Linux USB Development]     [Linux Media Development]     [Video for Linux]     [Linux NILFS]     [Linux Audio Users]     [Yosemite Info]     [Linux SCSI]

  Powered by Linux