On Sat, 02 Mar 2024, Jeff Layton wrote:
> On Fri, 2024-03-01 at 11:07 +1100, NeilBrown wrote:
> > move_to_close_lru() waits for sc_count to become zero while holding
> > rp_mutex.  This can deadlock if another thread holds a reference and is
> > waiting for rp_mutex.
> > 
> > By the time we get to move_to_close_lru() the openowner is unhashed and
> > cannot be found any more.  So code waiting for the mutex can safely
> > retry the lookup if move_to_close_lru() has started.
> > 
> > So change rp_mutex to an atomic_t with three states:
> > 
> >  RP_UNLOCKED - state is still hashed, not locked for reply
> >  RP_LOCKED   - state is still hashed, is locked for reply
> >  RP_UNHASHED - state is now hashed, no code can get a lock.
> 
> "is now unhashed", I think... or "state is not hashed". s/now/not/.
> 
> > Use wait_var_event() to wait for either a lock, or for the owner to be
> > unhashed.  In the latter case, retry the lookup.
> > 
> > Signed-off-by: NeilBrown <neilb@xxxxxxx>
> > ---
> >  fs/nfsd/nfs4state.c | 46 ++++++++++++++++++++++++++++++++++++---------
> >  fs/nfsd/state.h     |  2 +-
> >  2 files changed, 38 insertions(+), 10 deletions(-)
> > 
> > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > index e625f738f7b0..5dab316932d3 100644
> > --- a/fs/nfsd/nfs4state.c
> > +++ b/fs/nfsd/nfs4state.c
> > @@ -4442,21 +4442,32 @@ nfsd4_init_leases_net(struct nfsd_net *nn)
> >  	atomic_set(&nn->nfsd_courtesy_clients, 0);
> >  }
> >  
> > +enum rp_lock {
> > +	RP_UNLOCKED,
> > +	RP_LOCKED,
> > +	RP_UNHASHED,
> > +};
> > +
> >  static void init_nfs4_replay(struct nfs4_replay *rp)
> >  {
> >  	rp->rp_status = nfserr_serverfault;
> >  	rp->rp_buflen = 0;
> >  	rp->rp_buf = rp->rp_ibuf;
> > -	mutex_init(&rp->rp_mutex);
> > +	atomic_set(&rp->rp_locked, RP_UNLOCKED);
> >  }
> >  
> > -static void nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
> > -				       struct nfs4_stateowner *so)
> > +static int nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
> > +				      struct nfs4_stateowner *so)
> >  {
> >  	if (!nfsd4_has_session(cstate)) {
> > -		mutex_lock(&so->so_replay.rp_mutex);
> > +		wait_var_event(&so->so_replay.rp_locked,
> > +			       atomic_cmpxchg(&so->so_replay.rp_locked,
> > +					      RP_UNLOCKED, RP_LOCKED) != RP_LOCKED);
> > +		if (atomic_read(&so->so_replay.rp_locked) == RP_UNHASHED)
> > +			return -EAGAIN;
> >  		cstate->replay_owner = nfs4_get_stateowner(so);
> >  	}
> > +	return 0;
> >  }
> >  
> >  void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
> > @@ -4465,7 +4476,8 @@ void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
> >  
> >  	if (so != NULL) {
> >  		cstate->replay_owner = NULL;
> > -		mutex_unlock(&so->so_replay.rp_mutex);
> > +		atomic_set(&so->so_replay.rp_locked, RP_UNLOCKED);
> > +		wake_up_var(&so->so_replay.rp_locked);
> >  		nfs4_put_stateowner(so);
> >  	}
> >  }
> > @@ -4691,7 +4703,11 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
> >  	 * Wait for the refcount to drop to 2.  Since it has been unhashed,
> >  	 * there should be no danger of the refcount going back up again at
> >  	 * this point.
> > +	 * Some threads with a reference might be waiting for rp_locked,
> > +	 * so tell them to stop waiting.
> >  	 */
> > +	atomic_set(&oo->oo_owner.so_replay.rp_locked, RP_UNHASHED);
> > +	wake_up_var(&oo->oo_owner.so_replay.rp_locked);
> >  	wait_event(close_wq, refcount_read(&s->st_stid.sc_count) == 2);
> >  
> >  	release_all_access(s);
> > @@ -5064,6 +5080,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
> >  	clp = cstate->clp;
> >  
> >  	strhashval = ownerstr_hashval(&open->op_owner);
> > +retry:
> >  	oo = find_openstateowner_str(strhashval, open, clp);
> >  	open->op_openowner = oo;
> >  	if (!oo)
> > @@ -5074,17 +5091,24 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
> >  		open->op_openowner = NULL;
> >  		goto new_owner;
> >  	}
> > -	nfsd4_cstate_assign_replay(cstate, &oo->oo_owner);
> > +	if (nfsd4_cstate_assign_replay(cstate, &oo->oo_owner) == -EAGAIN) {
> > +		nfs4_put_stateowner(&oo->oo_owner);
> > +		goto retry;
> > +	}
> >  	status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid);
> >  	if (status)
> >  		return status;
> >  	goto alloc_stateid;
> > new_owner:
> >  	oo = alloc_init_open_stateowner(strhashval, open, cstate);
> > +	open->op_openowner = oo;
> >  	if (oo == NULL)
> >  		return nfserr_jukebox;
> > -	open->op_openowner = oo;
> > -	nfsd4_cstate_assign_replay(cstate, &oo->oo_owner);
> > +	if (nfsd4_cstate_assign_replay(cstate, &oo->oo_owner) == -EAGAIN) {
> > +		WARN_ON(1);
> 
> I don't think you want to WARN here. It seems quite possible for the
> client to send simultaneous opens for different files with the same
> stateowner.

Thanks.  alloc_init_open_stateowner() might not return a newly allocated
state owner, as it includes the test "does it already exist" and also
adds it to the hash.  Not what I would expect based on the name.
Maybe I'll clean up the code.

> 
> > +		nfs4_put_stateowner(&oo->oo_owner);
> > +		goto new_owner;
> 
> Is "goto new_owner" correct here? We likely raced with another RPC that
> was using the same owner, so ours probably got inserted and the other
> nfsd thread raced in and got the lock before we could. Retrying the
> lookup seems more correct in this situation?
> 
> That said, it might be best to just call nfsd4_cstate_assign_replay
> before hashing the new owner. If you lose the insertion race at that
> point, you can just clear it and try to assign the one that won.

I did consider that approach but wasn't sure it was worth it.
I'll have another look.

Thanks for the review.

NeilBrown

> > +	}
> >  alloc_stateid:
> >  	open->op_stp = nfs4_alloc_open_stateid(clp);
> >  	if (!open->op_stp)
> > @@ -6841,11 +6865,15 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
> >  	trace_nfsd_preprocess(seqid, stateid);
> >  
> >  	*stpp = NULL;
> > +retry:
> >  	status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn);
> >  	if (status)
> >  		return status;
> >  	stp = openlockstateid(s);
> > -	nfsd4_cstate_assign_replay(cstate, stp->st_stateowner);
> > +	if (nfsd4_cstate_assign_replay(cstate, stp->st_stateowner) == -EAGAIN) {
> > +		nfs4_put_stateowner(stp->st_stateowner);
> > +		goto retry;
> > +	}
> >  
> >  	status = nfs4_seqid_op_checks(cstate, stateid, seqid, stp);
> >  	if (!status)
> > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > index 41bdc913fa71..6a3becd86112 100644
> > --- a/fs/nfsd/state.h
> > +++ b/fs/nfsd/state.h
> > @@ -446,7 +446,7 @@ struct nfs4_replay {
> >  	unsigned int		rp_buflen;
> >  	char			*rp_buf;
> >  	struct knfsd_fh		rp_openfh;
> > -	struct mutex		rp_mutex;
> > +	atomic_t		rp_locked;
> >  	char			rp_ibuf[NFSD4_REPLAY_ISIZE];
> >  };
> > 
> -- 
> Jeff Layton <jlayton@xxxxxxxxxx>