Re: [PATCH 2/3] nfsd: replace rp_mutex to avoid deadlock in move_to_close_lru()

Jeff Layton <jlayton@xxxxxxxxxx> · Fri, 01 Mar 2024 08:09:17 -0500

On Fri, 2024-03-01 at 11:07 +1100, NeilBrown wrote:
> move_to_close_lru() waits for sc_count to become zero while holding
> rp_mutex.  This can deadlock if another thread holds a reference and is
> waiting for rp_mutex.
> 
> By the time we get to move_to_close_lru() the openowner is unhashed and
> cannot be found any more.  So code waiting for the mutex can safely
> retry the lookup if move_to_close_lru() has started.
> 
> So change rp_mutex to an atomic_t with three states:
> 
>  RP_UNLOCK   - state is still hashed, not locked for reply
>  RP_LOCKED   - state is still hashed, is locked for reply
>  RP_UNHASHED - state is now hashed, no code can get a lock.
> 

"is now unhashed", I think...

> Use wait_ver_event() to wait for either a lock, or for the owner to be
> unhashed.  In the latter case, retry the lookup.
> 
> Signed-off-by: NeilBrown <neilb@xxxxxxx>
> ---
>  fs/nfsd/nfs4state.c | 46 ++++++++++++++++++++++++++++++++++++---------
>  fs/nfsd/state.h     |  2 +-
>  2 files changed, 38 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index e625f738f7b0..5dab316932d3 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -4442,21 +4442,32 @@ nfsd4_init_leases_net(struct nfsd_net *nn)
>  	atomic_set(&nn->nfsd_courtesy_clients, 0);
>  }
>  
> +enum rp_lock {
> +	RP_UNLOCKED,
> +	RP_LOCKED,
> +	RP_UNHASHED,
> +};
> +
>  static void init_nfs4_replay(struct nfs4_replay *rp)
>  {
>  	rp->rp_status = nfserr_serverfault;
>  	rp->rp_buflen = 0;
>  	rp->rp_buf = rp->rp_ibuf;
> -	mutex_init(&rp->rp_mutex);
> +	atomic_set(&rp->rp_locked, RP_UNLOCKED);
>  }
>  
> -static void nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
> -		struct nfs4_stateowner *so)
> +static int nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
> +				      struct nfs4_stateowner *so)
>  {
>  	if (!nfsd4_has_session(cstate)) {
> -		mutex_lock(&so->so_replay.rp_mutex);
> +		wait_var_event(&so->so_replay.rp_locked,
> +			       atomic_cmpxchg(&so->so_replay.rp_locked,
> +					      RP_UNLOCKED, RP_LOCKED) != RP_LOCKED);
> +		if (atomic_read(&so->so_replay.rp_locked) == RP_UNHASHED)
> +			return -EAGAIN;
>  		cstate->replay_owner = nfs4_get_stateowner(so);
>  	}
> +	return 0;
>  }
>  
>  void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
> @@ -4465,7 +4476,8 @@ void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
>  
>  	if (so != NULL) {
>  		cstate->replay_owner = NULL;
> -		mutex_unlock(&so->so_replay.rp_mutex);
> +		atomic_set(&so->so_replay.rp_locked, RP_UNLOCKED);
> +		wake_up_var(&so->so_replay.rp_locked);
>  		nfs4_put_stateowner(so);
>  	}
>  }
> @@ -4691,7 +4703,11 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
>  	 * Wait for the refcount to drop to 2. Since it has been unhashed,
>  	 * there should be no danger of the refcount going back up again at
>  	 * this point.
> +	 * Some threads with a reference might be waiting for rp_locked,
> +	 * so tell them to stop waiting.
>  	 */
> +	atomic_set(&oo->oo_owner.so_replay.rp_locked, RP_UNHASHED);
> +	wake_up_var(&oo->oo_owner.so_replay.rp_locked);
>  	wait_event(close_wq, refcount_read(&s->st_stid.sc_count) == 2);
>  
>  	release_all_access(s);
> @@ -5064,6 +5080,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
>  	clp = cstate->clp;
>  
>  	strhashval = ownerstr_hashval(&open->op_owner);
> +retry:
>  	oo = find_openstateowner_str(strhashval, open, clp);
>  	open->op_openowner = oo;
>  	if (!oo)
> @@ -5074,17 +5091,24 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
>  		open->op_openowner = NULL;
>  		goto new_owner;
>  	}
> -	nfsd4_cstate_assign_replay(cstate, &oo->oo_owner);
> +	if (nfsd4_cstate_assign_replay(cstate, &oo->oo_owner) == -EAGAIN) {
> +		nfs4_put_stateowner(&oo->oo_owner);
> +		goto retry;
> +	}
>  	status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid);
>  	if (status)
>  		return status;
>  	goto alloc_stateid;
>  new_owner:
>  	oo = alloc_init_open_stateowner(strhashval, open, cstate);
> +	open->op_openowner = oo;
>  	if (oo == NULL)
>  		return nfserr_jukebox;
> -	open->op_openowner = oo;
> -	nfsd4_cstate_assign_replay(cstate, &oo->oo_owner);
> +	if (nfsd4_cstate_assign_replay(cstate, &oo->oo_owner) == -EAGAIN) {
> +		WARN_ON(1);

I don't think you want to WARN here. It seems quite possible for the
client to send simultaneous opens for different files with the same
stateowner.

> +		nfs4_put_stateowner(&oo->oo_owner);
> +		goto new_owner;

Is "goto new_owner" correct here? We likely raced with another RPC that
was using the same owner, so ours probably got inserted and the other
nfsd thread raced in and got the lock before we could. Retrying the
lookup seems more correct in this situation?

That said, it might be best to just call nfsd4_cstate_assign_replay
before hashing the new owner. If you lose the insertion race at that
point, you can just clear it and try to assign the one that won.

> +	}
>  alloc_stateid:
>  	open->op_stp = nfs4_alloc_open_stateid(clp);
>  	if (!open->op_stp)
> @@ -6841,11 +6865,15 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
>  	trace_nfsd_preprocess(seqid, stateid);
>  
>  	*stpp = NULL;
> +retry:
>  	status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn);
>  	if (status)
>  		return status;
>  	stp = openlockstateid(s);
> -	nfsd4_cstate_assign_replay(cstate, stp->st_stateowner);
> +	if (nfsd4_cstate_assign_replay(cstate, stp->st_stateowner) == -EAGAIN) {
> +		nfs4_put_stateowner(stp->st_stateowner);
> +		goto retry;
> +	}
>  
>  	status = nfs4_seqid_op_checks(cstate, stateid, seqid, stp);
>  	if (!status)
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index 41bdc913fa71..6a3becd86112 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -446,7 +446,7 @@ struct nfs4_replay {
>  	unsigned int		rp_buflen;
>  	char			*rp_buf;
>  	struct knfsd_fh		rp_openfh;
> -	struct mutex		rp_mutex;
> +	atomic_t		rp_locked;
>  	char			rp_ibuf[NFSD4_REPLAY_ISIZE];
>  };
>  

-- 
Jeff Layton <jlayton@xxxxxxxxxx>