On Mon, Mar 04, 2024 at 03:40:21PM +1100, NeilBrown wrote:
> move_to_close_lru() waits for sc_count to become zero while holding
> rp_mutex. This can deadlock if another thread holds a reference and is
> waiting for rp_mutex.
> 
> By the time we get to move_to_close_lru() the openowner is unhashed and
> cannot be found any more. So code waiting for the mutex can safely
> retry the lookup if move_to_close_lru() has started.
> 
> So change rp_mutex to an atomic_t with three states:
> 
> RP_UNLOCKED - state is still hashed, not locked for reply
> RP_LOCKED - state is still hashed, is locked for reply
> RP_UNHASHED - state is not hashed, no code can get a lock.
> 
> Use wait_var_event() to wait for either a lock, or for the owner to be
> unhashed. In the latter case, retry the lookup.
> 
> Signed-off-by: NeilBrown <neilb@xxxxxxx>
> ---
>  fs/nfsd/nfs4state.c | 38 +++++++++++++++++++++++++++++++-------
>  fs/nfsd/state.h     |  2 +-
>  2 files changed, 32 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 690d0e697320..47e879d5d68a 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -4430,21 +4430,32 @@ nfsd4_init_leases_net(struct nfsd_net *nn)
>  	atomic_set(&nn->nfsd_courtesy_clients, 0);
>  }
>  
> +enum rp_lock {
> +	RP_UNLOCKED,
> +	RP_LOCKED,
> +	RP_UNHASHED,
> +};
> +
>  static void init_nfs4_replay(struct nfs4_replay *rp)
>  {
>  	rp->rp_status = nfserr_serverfault;
>  	rp->rp_buflen = 0;
>  	rp->rp_buf = rp->rp_ibuf;
> -	mutex_init(&rp->rp_mutex);
> +	atomic_set(&rp->rp_locked, RP_UNLOCKED);
>  }
>  
> -static void nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
> -		struct nfs4_stateowner *so)
> +static int nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
> +				      struct nfs4_stateowner *so)
>  {
>  	if (!nfsd4_has_session(cstate)) {
> -		mutex_lock(&so->so_replay.rp_mutex);
> +		wait_var_event(&so->so_replay.rp_locked,
> +			       atomic_cmpxchg(&so->so_replay.rp_locked,
> +					      RP_UNLOCKED, RP_LOCKED) != RP_LOCKED);
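For anyone following the new scheme: the acquire, release, and unhash transitions boil down to a small state machine. A minimal userspace sketch, with C11 atomics standing in for the kernel's atomic_t and the wait_var_event()/wake_up_var() sleep-and-wake elided, so rp_try_lock() only reports what one pass of the wait condition would observe (all helper names here are mine, not the kernel's):

```c
#include <stdatomic.h>

/* The patch's three states for nfs4_replay::rp_locked. */
enum rp_lock { RP_UNLOCKED, RP_LOCKED, RP_UNHASHED };

/*
 * One pass of the wait_var_event() condition in
 * nfsd4_cstate_assign_replay():
 *    0 - cmpxchg moved UNLOCKED -> LOCKED; we own the reply slot
 *    1 - saw RP_LOCKED; the kernel waiter would go back to sleep
 *   -1 - saw RP_UNHASHED; the caller must retry the owner lookup
 */
static int rp_try_lock(atomic_int *rp_locked)
{
	int expected = RP_UNLOCKED;

	/* On failure, "expected" is updated to the value observed. */
	if (atomic_compare_exchange_strong(rp_locked, &expected, RP_LOCKED))
		return 0;
	return expected == RP_UNHASHED ? -1 : 1;
}

/* nfsd4_cstate_clear_replay() side: release; the kernel then wake_up_var()s. */
static void rp_unlock(atomic_int *rp_locked)
{
	atomic_store(rp_locked, RP_UNLOCKED);
}

/* move_to_close_lru() side: owner is unhashed; waiters give up and retry. */
static void rp_unhash(atomic_int *rp_locked)
{
	atomic_store(rp_locked, RP_UNHASHED);
}
```

Note the wait condition is "!= RP_LOCKED": a waiter wakes either because the cmpxchg succeeded or because it observed RP_UNHASHED, and the subsequent atomic_read distinguishes the two.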

What reliably prevents this from being a "wait forever"?

> +		if (atomic_read(&so->so_replay.rp_locked) == RP_UNHASHED)
> +			return -EAGAIN;
>  		cstate->replay_owner = nfs4_get_stateowner(so);
>  	}
> +	return 0;
>  }
>  
>  void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
> @@ -4453,7 +4464,8 @@ void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
>  
>  	if (so != NULL) {
>  		cstate->replay_owner = NULL;
> -		mutex_unlock(&so->so_replay.rp_mutex);
> +		atomic_set(&so->so_replay.rp_locked, RP_UNLOCKED);
> +		wake_up_var(&so->so_replay.rp_locked);
>  		nfs4_put_stateowner(so);
>  	}
>  }
> @@ -4691,7 +4703,11 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
>  	 * Wait for the refcount to drop to 2. Since it has been unhashed,
>  	 * there should be no danger of the refcount going back up again at
>  	 * this point.
> +	 * Some threads with a reference might be waiting for rp_locked,
> +	 * so tell them to stop waiting.
>  	 */
> +	atomic_set(&oo->oo_owner.so_replay.rp_locked, RP_UNHASHED);
> +	wake_up_var(&oo->oo_owner.so_replay.rp_locked);
>  	wait_event(close_wq, refcount_read(&s->st_stid.sc_count) == 2);
>  
>  	release_all_access(s);
> @@ -5064,11 +5080,15 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
>  	clp = cstate->clp;
>  
>  	strhashval = ownerstr_hashval(&open->op_owner);
> +retry:
>  	oo = find_or_alloc_open_stateowner(strhashval, open, cstate);
>  	open->op_openowner = oo;
>  	if (!oo)
>  		return nfserr_jukebox;
> -	nfsd4_cstate_assign_replay(cstate, &oo->oo_owner);
> +	if (nfsd4_cstate_assign_replay(cstate, &oo->oo_owner) == -EAGAIN) {
> +		nfs4_put_stateowner(&oo->oo_owner);
> +		goto retry;
> +	}
>  	status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid);
>  	if (status)
>  		return status;
> @@ -6828,11 +6848,15 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
>  	trace_nfsd_preprocess(seqid, stateid);
>  
>  	*stpp = NULL;
> +retry:
>  	status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn);
>  	if (status)
>  		return status;
>  	stp = openlockstateid(s);
> -	nfsd4_cstate_assign_replay(cstate, stp->st_stateowner);
> +	if (nfsd4_cstate_assign_replay(cstate, stp->st_stateowner) == -EAGAIN) {
> +		nfs4_put_stateowner(stp->st_stateowner);
> +		goto retry;
> +	}
>  
>  	status = nfs4_seqid_op_checks(cstate, stateid, seqid, stp);
>  	if (!status)

My tool chain reports that this hunk won't apply to nfsd-next. In my copy
of fs/nfsd/nfs4state.c, nfs4_preprocess_seqid_op() starts at line 7180, so
something is whack-a-doodle.

> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index 41bdc913fa71..6a3becd86112 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -446,7 +446,7 @@ struct nfs4_replay {
>  	unsigned int		rp_buflen;
>  	char			*rp_buf;
>  	struct knfsd_fh		rp_openfh;
> -	struct mutex		rp_mutex;
> +	atomic_t		rp_locked;
>  	char			rp_ibuf[NFSD4_REPLAY_ISIZE];
>  };
> 
> -- 
> 2.43.0

-- 
Chuck Lever
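P.S. The caller-side retry added to nfsd4_process_open1() and nfs4_preprocess_seqid_op() can be modelled in isolation. This toy simulation is not kernel code: every name is illustrative, and the unhash_once flag injects the "owner unhashed under us" race deterministically so the retry path actually runs:

```c
#define RETRY_EAGAIN (-11)	/* stand-in for the kernel's -EAGAIN */

struct toy_owner { int unhashed; };

/* Hypothetical hash lookup: an unhashed owner can no longer be found,
 * so the lookup yields the replacement owner instead. */
static struct toy_owner *toy_lookup(struct toy_owner *stale,
				    struct toy_owner *fresh)
{
	return stale->unhashed ? fresh : stale;
}

/* Mirrors the -EAGAIN contract of the reworked nfsd4_cstate_assign_replay(). */
static int toy_assign_replay(struct toy_owner *o)
{
	return o->unhashed ? RETRY_EAGAIN : 0;
}

/*
 * The retry shape from the patch: if the owner we found was unhashed
 * under us, drop it (nfs4_put_stateowner() in the kernel) and redo the
 * lookup until we latch onto a still-hashed owner.
 */
static struct toy_owner *toy_acquire(struct toy_owner *stale,
				     struct toy_owner *fresh,
				     int unhash_once, int *retries)
{
	for (;;) {
		struct toy_owner *o = toy_lookup(stale, fresh);

		if (unhash_once) {	/* another thread unhashes it now */
			stale->unhashed = 1;
			unhash_once = 0;
		}
		if (toy_assign_replay(o) != RETRY_EAGAIN)
			return o;
		(*retries)++;		/* put the stale reference, retry */
	}
}
```

With the race injected, one retry suffices because the second lookup can no longer find the unhashed owner; that is the property the commit message relies on to bound the loop.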