On Thu, 2017-10-19 at 14:33 -0400, Olga Kornievskaia wrote: > On Wed, Oct 11, 2017 at 1:07 PM, Trond Myklebust > <trond.myklebust@xxxxxxxxxxxxxxx> wrote: > > If the previous request on a slot was interrupted before it was > > processed by the server, then our slot sequence number may be out > > of whack, > > and so we try the next operation using the old sequence number. > > > > The problem with this, is that not all servers check to see that > > the > > client is replaying the same operations as previously when they > > decide > > to go to the replay cache, and so instead of the expected error of > > NFS4ERR_SEQ_FALSE_RETRY, we get a replay of the old reply, which > > could > > (if the operations match up) be mistaken by the client for a new > > reply. > > > > To fix this, we attempt to send a COMPOUND containing only the > > SEQUENCE op > > in order to resync our slot sequence number. > > > > Signed-off-by: Trond Myklebust <trond.myklebust@xxxxxxxxxxxxxxx> > > --- > > fs/nfs/nfs4_fs.h | 2 +- > > fs/nfs/nfs4proc.c | 146 +++++++++++++++++++++++++++++++++++++----- > > ------------ > > 2 files changed, 101 insertions(+), 47 deletions(-) > > > > diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h > > index ac4f10b7f6c1..b547d935aaf0 100644 > > --- a/fs/nfs/nfs4_fs.h > > +++ b/fs/nfs/nfs4_fs.h > > @@ -464,7 +464,7 @@ extern void nfs_increment_open_seqid(int > > status, struct nfs_seqid *seqid); > > extern void nfs_increment_lock_seqid(int status, struct nfs_seqid > > *seqid); > > extern void nfs_release_seqid(struct nfs_seqid *seqid); > > extern void nfs_free_seqid(struct nfs_seqid *seqid); > > -extern int nfs4_setup_sequence(const struct nfs_client *client, > > +extern int nfs4_setup_sequence(struct nfs_client *client, > > struct nfs4_sequence_args *args, > > struct nfs4_sequence_res *res, > > struct rpc_task *task); > > diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c > > index f90090e8c959..caa72efe02c9 100644 > > --- a/fs/nfs/nfs4proc.c > > +++ b/fs/nfs/nfs4proc.c > > @@ -96,6 +96,10 @@ static int nfs4_do_setattr(struct inode *inode, > > struct rpc_cred *cred, > > struct nfs_open_context *ctx, struct > > nfs4_label *ilabel, > > struct nfs4_label *olabel); > > #ifdef CONFIG_NFS_V4_1 > > +static struct rpc_task *_nfs41_proc_sequence(struct nfs_client > > *clp, > > + struct rpc_cred *cred, > > + struct nfs4_slot *slot, > > + bool is_privileged); > > static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *, > > struct rpc_cred *); > > static int nfs41_free_stateid(struct nfs_server *, const > > nfs4_stateid *, > > @@ -644,13 +648,14 @@ static int nfs40_sequence_done(struct > > rpc_task *task, > > > > #if defined(CONFIG_NFS_V4_1) > > > > -static void nfs41_sequence_free_slot(struct nfs4_sequence_res > > *res) > > +static void nfs41_release_slot(struct nfs4_slot *slot) > > { > > struct nfs4_session *session; > > struct nfs4_slot_table *tbl; > > - struct nfs4_slot *slot = res->sr_slot; > > bool send_new_highest_used_slotid = false; > > > > + if (!slot) > > + return; > > tbl = slot->table; > > session = tbl->session; > > > > @@ -676,13 +681,18 @@ static void nfs41_sequence_free_slot(struct > > nfs4_sequence_res *res) > > send_new_highest_used_slotid = false; > > out_unlock: > > spin_unlock(&tbl->slot_tbl_lock); > > - res->sr_slot = NULL; > > if (send_new_highest_used_slotid) > > nfs41_notify_server(session->clp); > > if (waitqueue_active(&tbl->slot_waitq)) > > wake_up_all(&tbl->slot_waitq); > > } > > > > +static void nfs41_sequence_free_slot(struct nfs4_sequence_res > > *res) > > +{ > > + nfs41_release_slot(res->sr_slot); > > + res->sr_slot = NULL; > > +} > > + > > static int nfs41_sequence_process(struct rpc_task *task, > > struct nfs4_sequence_res *res) > > { > > @@ -710,13 +720,6 @@ static int nfs41_sequence_process(struct > > rpc_task *task, > > /* Check the SEQUENCE operation status */ > > switch (res->sr_status) { > > case 0: > > - /* If previous op on slot was interrupted and we > > reused > > - * the seq# and got a reply from the cache, then > > retry > > - */ > > - if (task->tk_status == -EREMOTEIO && interrupted) { > > - ++slot->seq_nr; > > - goto retry_nowait; > > - } > > /* Update the slot's sequence and clientid lease > > timer */ > > slot->seq_done = 1; > > clp = session->clp; > > @@ -750,16 +753,16 @@ static int nfs41_sequence_process(struct > > rpc_task *task, > > * The slot id we used was probably retired. Try > > again > > * using a different slot id. > > */ > > + if (slot->seq_nr < slot->table- > > >target_highest_slotid) > > + goto session_recover; > > goto retry_nowait; > > case -NFS4ERR_SEQ_MISORDERED: > > /* > > * Was the last operation on this sequence > > interrupted? > > * If so, retry after bumping the sequence number. > > */ > > - if (interrupted) { > > - ++slot->seq_nr; > > - goto retry_nowait; > > - } > > + if (interrupted) > > + goto retry_new_seq; > > /* > > * Could this slot have been previously retired? > > * If so, then the server may be expecting seq_nr = > > 1! > > @@ -768,10 +771,11 @@ static int nfs41_sequence_process(struct > > rpc_task *task, > > slot->seq_nr = 1; > > goto retry_nowait; > > } > > - break; > > + goto session_recover; > > case -NFS4ERR_SEQ_FALSE_RETRY: > > - ++slot->seq_nr; > > - goto retry_nowait; > > + if (interrupted) > > + goto retry_new_seq; > > + goto session_recover; > > default: > > /* Just update the slot sequence no. */ > > slot->seq_done = 1; > > @@ -781,6 +785,11 @@ static int nfs41_sequence_process(struct > > rpc_task *task, > > dprintk("%s: Error %d free the slot \n", __func__, res- > > >sr_status); > > out_noaction: > > return ret; > > +session_recover: > > + nfs4_schedule_session_recovery(session, res->sr_status); > > + goto retry_nowait; > > +retry_new_seq: > > + ++slot->seq_nr; > > retry_nowait: > > if (rpc_restart_call_prepare(task)) { > > nfs41_sequence_free_slot(res); > > @@ -857,6 +866,17 @@ static const struct rpc_call_ops > > nfs41_call_sync_ops = { > > .rpc_call_done = nfs41_call_sync_done, > > }; > > > > +static void > > +nfs4_sequence_process_interrupted(struct nfs_client *client, > > + struct nfs4_slot *slot, struct rpc_cred *cred) > > +{ > > + struct rpc_task *task; > > + > > + task = _nfs41_proc_sequence(client, cred, slot, true); > > + if (!IS_ERR(task)) > > + rpc_put_task_async(task); > > +} > > + > > #else /* !CONFIG_NFS_V4_1 */ > > > > static int nfs4_sequence_process(struct rpc_task *task, struct > > nfs4_sequence_res *res) > > @@ -877,9 +897,32 @@ int nfs4_sequence_done(struct rpc_task *task, > > } > > EXPORT_SYMBOL_GPL(nfs4_sequence_done); > > > > +static void > > +nfs4_sequence_process_interrupted(struct nfs_client *client, > > + struct nfs4_slot *slot, struct rpc_cred *cred) > > +{ > > + WARN_ON_ONCE(1); > > + slot->interrupted = 0; > > +} > > + > > #endif /* !CONFIG_NFS_V4_1 */ > > > > -int nfs4_setup_sequence(const struct nfs_client *client, > > +static > > +void nfs4_sequence_attach_slot(struct nfs4_sequence_args *args, > > + struct nfs4_sequence_res *res, > > + struct nfs4_slot *slot) > > +{ > > + slot->privileged = args->sa_privileged ? 1 : 0; > > The oops is because in the async sequence, it passes NULL as the slot > which ends up calling this and trying to dereference "slot". > > Btw, no operations are needed to actually trigger this. Just do the > mount and wait for the first renew. > > Don't know if perhaps the fix is to check that > if (slot) Doh! Yes, that was a last minute cleanup. I thought I did the same for both nfs4_sequence_attach_slot() and nfs41_release_slot(), but it looks as if I only fixed up the latter. Thanks! Trond -- Trond Myklebust Linux NFS client maintainer, PrimaryData trond.myklebust@xxxxxxxxxxxxxxx ��.n��������+%������w��{.n�����{��w���jg��������ݢj����G�������j:+v���w�m������w�������h�����٥