Re: [PATCH 2/3] NFSD: restore delegation's sc_count if nfsd4_run_cb fails

Jeff Layton <jlayton@xxxxxxxxxx> · Fri, 15 Dec 2023 14:42:44 -0500

On Fri, 2023-12-15 at 11:15 -0800, Dai Ngo wrote:
> Under some load conditions the callback work request cannot be queued
> and nfsd4_run_cb returns 0 to the caller. When this happens, the sc_count
> of the delegation state is left with an extra reference, preventing
> the state from being freed later.
> 
> Signed-off-by: Dai Ngo <dai.ngo@xxxxxxxxxx>
> ---
>  fs/nfsd/nfs4state.c | 17 +++++++++++++----
>  1 file changed, 13 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 40415929e2ae..175f3e9f5822 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -2947,8 +2947,14 @@ void nfs4_cb_getattr(struct nfs4_cb_fattr *ncf)
>  
>  	if (test_and_set_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags))
>  		return;
> +
>  	refcount_inc(&dp->dl_stid.sc_count);
> -	nfsd4_run_cb(&ncf->ncf_getattr);
> +	if (!nfsd4_run_cb(&ncf->ncf_getattr)) {
> +		refcount_dec(&dp->dl_stid.sc_count);
> +		clear_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags);
> +		wake_up_bit(&ncf->ncf_cb_flags, CB_GETATTR_BUSY);
> +		WARN_ON_ONCE(1);
> +	}
>  }
>  
>  static struct nfs4_client *create_client(struct xdr_netobj name,
> @@ -4967,7 +4973,10 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
>  	 * we know it's safe to take a reference.
>  	 */
>  	refcount_inc(&dp->dl_stid.sc_count);
> -	WARN_ON_ONCE(!nfsd4_run_cb(&dp->dl_recall));
> +	if (!nfsd4_run_cb(&dp->dl_recall)) {
> +		refcount_dec(&dp->dl_stid.sc_count);
> +		WARN_ON_ONCE(1);
> +	}
>  }
>  
>  /* Called from break_lease() with flc_lock held. */
> @@ -8543,12 +8552,12 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode,
>  				return 0;
>  			}
>  break_lease:
> -			spin_unlock(&ctx->flc_lock);
>  			nfsd_stats_wdeleg_getattr_inc();
> -
>  			dp = fl->fl_owner;
>  			ncf = &dp->dl_cb_fattr;
>  			nfs4_cb_getattr(&dp->dl_cb_fattr);
> +			spin_unlock(&ctx->flc_lock);
> +

The other hunks in this patch make sense, but what's going on here with
moving the spin_unlock() call down past nfs4_cb_getattr()? Do we really
need to hold flc_lock across that call? If so, I would have expected to
see an explanation in the changelog.

>  			wait_on_bit(&ncf->ncf_cb_flags, CB_GETATTR_BUSY, TASK_INTERRUPTIBLE);
>  			if (ncf->ncf_cb_status) {
>  				status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));

-- 
Jeff Layton <jlayton@xxxxxxxxxx>