On Fri, 2018-09-28 at 12:54 -0400, Olga Kornievskaia wrote: > Also shouldn't this be a bug fix for 4.19 and also go to stable? It wasn't really intended as a bugfix because I wasn't aware that there was a bug to fix in the first place. I assume the modification to nfs4_state_find_open_context() to cause it to look for an open context with a READ/WRITE open mode first is what fixes the bug. Is that the case? > On Fri, Sep 28, 2018 at 12:34 PM Olga Kornievskaia <aglo@xxxxxxxxx> > wrote: > > > > Hi Trond, > > > > This patch ends up fixing a problem related to recall of > > delegations. > > This problem has existed in the kernel for quite awhile (early RHEL > > 3.10-based release show the problem, haven't tried 2.6-based). > > > > If you agree with what I present, then can the commit message for > > this > > patch be changed to reflect that it solves this problem? > > > > Problem can be seen by running "nfstest_delegation --runtests > > recall26". Jorge can probably describe the test better but I > > believe > > the main client machine opens a file for read and gets a read > > delegation to it and locks it, then from a different process in the > > same machine there is an appending open (open for write is sent and > > gets a write delegation). From a different machine another client > > opens the same file which triggers a CB_RECALL. The main client, > > un-patched, ends up simply returning the delegation and never > > recovers > > the lock. This is pretty bad as it's possible data corruption case. > > > > In the nfs_delegation_claim_locks() this condition -- if > > (nfs_file_open_context(fl->fl_file) != ctx) -- is true when it > > should > > have been false. > > > > There seems to be a disagreement between what's in > > nfs_file_open_context(fl->fl_file) and what > > get_nfs_open_context(inode's ctx) returns but I don't think how > > this > > patch fixes it. But it does. > > On Wed, Sep 5, 2018 at 3:27 PM Trond Myklebust <trondmy@xxxxxxxxx> > > wrote: > > > > > > Reduce contention on the inode->i_lock by ensuring that we use > > > RCU > > > when looking up the NFS open context. > > > > > > Signed-off-by: Trond Myklebust <trond.myklebust@xxxxxxxxxxxxxxx> > > > --- > > > fs/nfs/delegation.c | 11 ++++++----- > > > fs/nfs/inode.c | 35 +++++++++++++++-------------------- > > > fs/nfs/nfs4proc.c | 30 ++++++++++++++++++++++++------ > > > fs/nfs/nfs4state.c | 12 ++++++------ > > > fs/nfs/pnfs.c | 5 ++++- > > > include/linux/nfs_fs.h | 1 + > > > 6 files changed, 56 insertions(+), 38 deletions(-) > > > > > > diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c > > > index f033f3a69a3b..76d205d1c7bc 100644 > > > --- a/fs/nfs/delegation.c > > > +++ b/fs/nfs/delegation.c > > > @@ -136,8 +136,8 @@ static int nfs_delegation_claim_opens(struct > > > inode *inode, > > > int err; > > > > > > again: > > > - spin_lock(&inode->i_lock); > > > - list_for_each_entry(ctx, &nfsi->open_files, list) { > > > + rcu_read_lock(); > > > + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { > > > state = ctx->state; > > > if (state == NULL) > > > continue; > > > @@ -147,8 +147,9 @@ static int nfs_delegation_claim_opens(struct > > > inode *inode, > > > continue; > > > if (!nfs4_stateid_match(&state->stateid, > > > stateid)) > > > continue; > > > - get_nfs_open_context(ctx); > > > - spin_unlock(&inode->i_lock); > > > + if (!get_nfs_open_context(ctx)) > > > + continue; > > > + rcu_read_unlock(); > > > sp = state->owner; > > > /* Block nfs4_proc_unlck */ > > > mutex_lock(&sp->so_delegreturn_mutex); > > > @@ -164,7 +165,7 @@ static int nfs_delegation_claim_opens(struct > > > inode *inode, > > > return err; > > > goto again; > > > } > > > - spin_unlock(&inode->i_lock); > > > + rcu_read_unlock(); > > > return 0; > > > } > > > > > > diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c > > > index 052db41a7f80..5b1eee4952b7 100644 > > > --- a/fs/nfs/inode.c > > > +++ b/fs/nfs/inode.c > > > @@ -977,9 +977,9 @@ EXPORT_SYMBOL_GPL(alloc_nfs_open_context); > > > > > > struct nfs_open_context *get_nfs_open_context(struct > > > nfs_open_context *ctx) > > > { > > > - if (ctx != NULL) > > > - refcount_inc(&ctx->lock_context.count); > > > - return ctx; > > > + if (ctx != NULL && refcount_inc_not_zero(&ctx- > > > >lock_context.count)) > > > + return ctx; > > > + return NULL; > > > } > > > EXPORT_SYMBOL_GPL(get_nfs_open_context); > > > > > > @@ -988,13 +988,13 @@ static void __put_nfs_open_context(struct > > > nfs_open_context *ctx, int is_sync) > > > struct inode *inode = d_inode(ctx->dentry); > > > struct super_block *sb = ctx->dentry->d_sb; > > > > > > + if (!refcount_dec_and_test(&ctx->lock_context.count)) > > > + return; > > > if (!list_empty(&ctx->list)) { > > > - if (!refcount_dec_and_lock(&ctx- > > > >lock_context.count, &inode->i_lock)) > > > - return; > > > - list_del(&ctx->list); > > > + spin_lock(&inode->i_lock); > > > + list_del_rcu(&ctx->list); > > > spin_unlock(&inode->i_lock); > > > - } else if (!refcount_dec_and_test(&ctx- > > > >lock_context.count)) > > > - return; > > > + } > > > if (inode != NULL) > > > NFS_PROTO(inode)->close_context(ctx, is_sync); > > > if (ctx->cred != NULL) > > > @@ -1002,7 +1002,7 @@ static void __put_nfs_open_context(struct > > > nfs_open_context *ctx, int is_sync) > > > dput(ctx->dentry); > > > nfs_sb_deactive(sb); > > > kfree(ctx->mdsthreshold); > > > - kfree(ctx); > > > + kfree_rcu(ctx, rcu_head); > > > } > > > > > > void put_nfs_open_context(struct nfs_open_context *ctx) > > > @@ -1026,10 +1026,7 @@ void nfs_inode_attach_open_context(struct > > > nfs_open_context *ctx) > > > struct nfs_inode *nfsi = NFS_I(inode); > > > > > > spin_lock(&inode->i_lock); > > > - if (ctx->mode & FMODE_WRITE) > > > - list_add(&ctx->list, &nfsi->open_files); > > > - else > > > - list_add_tail(&ctx->list, &nfsi->open_files); > > > + list_add_tail_rcu(&ctx->list, &nfsi->open_files); > > > spin_unlock(&inode->i_lock); > > > } > > > EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context); > > > @@ -1050,16 +1047,17 @@ struct nfs_open_context > > > *nfs_find_open_context(struct inode *inode, struct rpc_c > > > struct nfs_inode *nfsi = NFS_I(inode); > > > struct nfs_open_context *pos, *ctx = NULL; > > > > > > - spin_lock(&inode->i_lock); > > > - list_for_each_entry(pos, &nfsi->open_files, list) { > > > + rcu_read_lock(); > > > + list_for_each_entry_rcu(pos, &nfsi->open_files, list) { > > > if (cred != NULL && pos->cred != cred) > > > continue; > > > if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != > > > mode) > > > continue; > > > ctx = get_nfs_open_context(pos); > > > - break; > > > + if (ctx) > > > + break; > > > } > > > - spin_unlock(&inode->i_lock); > > > + rcu_read_unlock(); > > > return ctx; > > > } > > > > > > @@ -1077,9 +1075,6 @@ void nfs_file_clear_open_context(struct > > > file *filp) > > > if (ctx->error < 0) > > > invalidate_inode_pages2(inode- > > > >i_mapping); > > > filp->private_data = NULL; > > > - spin_lock(&inode->i_lock); > > > - list_move_tail(&ctx->list, &NFS_I(inode)- > > > >open_files); > > > - spin_unlock(&inode->i_lock); > > > put_nfs_open_context_sync(ctx); > > > } > > > } > > > diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c > > > index 8220a168282e..10c20a5b075d 100644 > > > --- a/fs/nfs/nfs4proc.c > > > +++ b/fs/nfs/nfs4proc.c > > > @@ -1933,23 +1933,41 @@ nfs4_opendata_to_nfs4_state(struct > > > nfs4_opendata *data) > > > return ret; > > > } > > > > > > -static struct nfs_open_context > > > *nfs4_state_find_open_context(struct nfs4_state *state) > > > +static struct nfs_open_context * > > > +nfs4_state_find_open_context_mode(struct nfs4_state *state, > > > fmode_t mode) > > > { > > > struct nfs_inode *nfsi = NFS_I(state->inode); > > > struct nfs_open_context *ctx; > > > > > > - spin_lock(&state->inode->i_lock); > > > - list_for_each_entry(ctx, &nfsi->open_files, list) { > > > + rcu_read_lock(); > > > + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { > > > if (ctx->state != state) > > > continue; > > > - get_nfs_open_context(ctx); > > > - spin_unlock(&state->inode->i_lock); > > > + if ((ctx->mode & mode) != mode) > > > + continue; > > > + if (!get_nfs_open_context(ctx)) > > > + continue; > > > + rcu_read_unlock(); > > > return ctx; > > > } > > > - spin_unlock(&state->inode->i_lock); > > > + rcu_read_unlock(); > > > return ERR_PTR(-ENOENT); > > > } > > > > > > +static struct nfs_open_context * > > > +nfs4_state_find_open_context(struct nfs4_state *state) > > > +{ > > > + struct nfs_open_context *ctx; > > > + > > > + ctx = nfs4_state_find_open_context_mode(state, > > > FMODE_READ|FMODE_WRITE); > > > + if (!IS_ERR(ctx)) > > > + return ctx; > > > + ctx = nfs4_state_find_open_context_mode(state, > > > FMODE_WRITE); > > > + if (!IS_ERR(ctx)) > > > + return ctx; > > > + return nfs4_state_find_open_context_mode(state, > > > FMODE_READ); > > > +} > > > + > > > static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct > > > nfs_open_context *ctx, > > > struct nfs4_state *state, enum open_claim_type4 > > > claim) > > > { > > > diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c > > > index 40a08cd483f0..be92ce4259e9 100644 > > > --- a/fs/nfs/nfs4state.c > > > +++ b/fs/nfs/nfs4state.c > > > @@ -1437,8 +1437,8 @@ void > > > nfs_inode_find_state_and_recover(struct inode *inode, > > > struct nfs4_state *state; > > > bool found = false; > > > > > > - spin_lock(&inode->i_lock); > > > - list_for_each_entry(ctx, &nfsi->open_files, list) { > > > + rcu_read_lock(); > > > + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { > > > state = ctx->state; > > > if (state == NULL) > > > continue; > > > @@ -1456,7 +1456,7 @@ void > > > nfs_inode_find_state_and_recover(struct inode *inode, > > > nfs4_state_mark_reclaim_nograce(clp, state)) > > > found = true; > > > } > > > - spin_unlock(&inode->i_lock); > > > + rcu_read_unlock(); > > > > > > nfs_inode_find_delegation_state_and_recover(inode, > > > stateid); > > > if (found) > > > @@ -1469,13 +1469,13 @@ static void > > > nfs4_state_mark_open_context_bad(struct nfs4_state *state) > > > struct nfs_inode *nfsi = NFS_I(inode); > > > struct nfs_open_context *ctx; > > > > > > - spin_lock(&inode->i_lock); > > > - list_for_each_entry(ctx, &nfsi->open_files, list) { > > > + rcu_read_lock(); > > > + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { > > > if (ctx->state != state) > > > continue; > > > set_bit(NFS_CONTEXT_BAD, &ctx->flags); > > > } > > > - spin_unlock(&inode->i_lock); > > > + rcu_read_unlock(); > > > } > > > > > > static void nfs4_state_mark_recovery_failed(struct nfs4_state > > > *state, int error) > > > diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c > > > index c5672c02afd6..06cb90e9bc6e 100644 > > > --- a/fs/nfs/pnfs.c > > > +++ b/fs/nfs/pnfs.c > > > @@ -1339,6 +1339,7 @@ bool pnfs_roc(struct inode *ino, > > > if (!nfs_have_layout(ino)) > > > return false; > > > retry: > > > + rcu_read_lock(); > > > spin_lock(&ino->i_lock); > > > lo = nfsi->layout; > > > if (!lo || !pnfs_layout_is_valid(lo) || > > > @@ -1349,6 +1350,7 @@ bool pnfs_roc(struct inode *ino, > > > pnfs_get_layout_hdr(lo); > > > if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) { > > > spin_unlock(&ino->i_lock); > > > + rcu_read_unlock(); > > > wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, > > > TASK_UNINTERRUPTIBLE); > > > pnfs_put_layout_hdr(lo); > > > @@ -1362,7 +1364,7 @@ bool pnfs_roc(struct inode *ino, > > > skip_read = true; > > > } > > > > > > - list_for_each_entry(ctx, &nfsi->open_files, list) { > > > + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { > > > state = ctx->state; > > > if (state == NULL) > > > continue; > > > @@ -1410,6 +1412,7 @@ bool pnfs_roc(struct inode *ino, > > > > > > out_noroc: > > > spin_unlock(&ino->i_lock); > > > + rcu_read_unlock(); > > > pnfs_layoutcommit_inode(ino, true); > > > if (roc) { > > > struct pnfs_layoutdriver_type *ld = > > > NFS_SERVER(ino)->pnfs_curr_ld; > > > diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h > > > index d2f4f88a0e66..6e0417c02279 100644 > > > --- a/include/linux/nfs_fs.h > > > +++ b/include/linux/nfs_fs.h > > > @@ -83,6 +83,7 @@ struct nfs_open_context { > > > > > > struct list_head list; > > > struct nfs4_threshold *mdsthreshold; > > > + struct rcu_head rcu_head; > > > }; > > > > > > struct nfs_open_dir_context { > > > -- > > > 2.17.1 > > >