Re: [PATCH v2 1/2] NFSD: add support for sending CB_RECALL_ANY

Jeff Layton <jlayton@xxxxxxxxxx> · Tue, 25 Oct 2022 05:37:29 -0400

On Tue, 2022-10-25 at 05:33 -0400, Jeff Layton wrote:
> On Mon, 2022-10-24 at 18:30 -0700, dai.ngo@xxxxxxxxxx wrote:
> > On 10/24/22 2:09 PM, Jeff Layton wrote:
> > > On Mon, 2022-10-24 at 12:44 -0700, dai.ngo@xxxxxxxxxx wrote:
> > > > On 10/24/22 5:16 AM, Jeff Layton wrote:
> > > > > On Sat, 2022-10-22 at 11:09 -0700, Dai Ngo wrote:
> > > > > > There is only one nfsd4_callback, cl_recall_any, added for each
> > > > > > nfs4_client. Access to it must be serialized. For now it's done
> > > > > > by the cl_recall_any_busy flag since it's used only by the
> > > > > > delegation shrinker. If there is another consumer of CB_RECALL_ANY
> > > > > > then a spinlock must be used.
> > > > > > 
> > > > > > Signed-off-by: Dai Ngo <dai.ngo@xxxxxxxxxx>
> > > > > > ---
> > > > > >    fs/nfsd/nfs4callback.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++
> > > > > >    fs/nfsd/nfs4state.c    | 27 +++++++++++++++++++++
> > > > > >    fs/nfsd/state.h        |  8 +++++++
> > > > > >    fs/nfsd/xdr4cb.h       |  6 +++++
> > > > > >    4 files changed, 105 insertions(+)
> > > > > > 
> > > > > > diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> > > > > > index f0e69edf5f0f..03587e1397f4 100644
> > > > > > --- a/fs/nfsd/nfs4callback.c
> > > > > > +++ b/fs/nfsd/nfs4callback.c
> > > > > > @@ -329,6 +329,29 @@ static void encode_cb_recall4args(struct xdr_stream *xdr,
> > > > > >    }
> > > > > >    
> > > > > >    /*
> > > > > > + * CB_RECALLANY4args
> > > > > > + *
> > > > > > + *	struct CB_RECALLANY4args {
> > > > > > + *		uint32_t	craa_objects_to_keep;
> > > > > > + *		bitmap4		craa_type_mask;
> > > > > > + *	};
> > > > > > + */
> > > > > > +static void
> > > > > > +encode_cb_recallany4args(struct xdr_stream *xdr,
> > > > > > +			struct nfs4_cb_compound_hdr *hdr, uint32_t bmval)
> > > > > > +{
> > > > > > +	__be32 *p;
> > > > > > +
> > > > > > +	encode_nfs_cb_opnum4(xdr, OP_CB_RECALL_ANY);
> > > > > > +	p = xdr_reserve_space(xdr, 4);
> > > > > > +	*p++ = xdr_zero;	/* craa_objects_to_keep */
> > > > > > +	p = xdr_reserve_space(xdr, 8);
> > > > > > +	*p++ = cpu_to_be32(1);
> > > > > > +	*p++ = cpu_to_be32(bmval);
> > > > > > +	hdr->nops++;
> > > > > > +}
> > > > > > +
> > > > > > +/*
> > > > > >     * CB_SEQUENCE4args
> > > > > >     *
> > > > > >     *	struct CB_SEQUENCE4args {
> > > > > > @@ -482,6 +505,24 @@ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
> > > > > >    	encode_cb_nops(&hdr);
> > > > > >    }
> > > > > >    
> > > > > > +/*
> > > > > > + * 20.6. Operation 8: CB_RECALL_ANY - Keep Any N Recallable Objects
> > > > > > + */
> > > > > > +static void
> > > > > > +nfs4_xdr_enc_cb_recall_any(struct rpc_rqst *req,
> > > > > > +		struct xdr_stream *xdr, const void *data)
> > > > > > +{
> > > > > > +	const struct nfsd4_callback *cb = data;
> > > > > > +	struct nfs4_cb_compound_hdr hdr = {
> > > > > > +		.ident = cb->cb_clp->cl_cb_ident,
> > > > > > +		.minorversion = cb->cb_clp->cl_minorversion,
> > > > > > +	};
> > > > > > +
> > > > > > +	encode_cb_compound4args(xdr, &hdr);
> > > > > > +	encode_cb_sequence4args(xdr, cb, &hdr);
> > > > > > +	encode_cb_recallany4args(xdr, &hdr, cb->cb_clp->cl_recall_any_bm);
> > > > > > +	encode_cb_nops(&hdr);
> > > > > > +}
> > > > > >    
> > > > > >    /*
> > > > > >     * NFSv4.0 and NFSv4.1 XDR decode functions
> > > > > > @@ -520,6 +561,28 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
> > > > > >    	return decode_cb_op_status(xdr, OP_CB_RECALL, &cb->cb_status);
> > > > > >    }
> > > > > >    
> > > > > > +/*
> > > > > > + * 20.6. Operation 8: CB_RECALL_ANY - Keep Any N Recallable Objects
> > > > > > + */
> > > > > > +static int
> > > > > > +nfs4_xdr_dec_cb_recall_any(struct rpc_rqst *rqstp,
> > > > > > +				  struct xdr_stream *xdr,
> > > > > > +				  void *data)
> > > > > > +{
> > > > > > +	struct nfsd4_callback *cb = data;
> > > > > > +	struct nfs4_cb_compound_hdr hdr;
> > > > > > +	int status;
> > > > > > +
> > > > > > +	status = decode_cb_compound4res(xdr, &hdr);
> > > > > > +	if (unlikely(status))
> > > > > > +		return status;
> > > > > > +	status = decode_cb_sequence4res(xdr, cb);
> > > > > > +	if (unlikely(status || cb->cb_seq_status))
> > > > > > +		return status;
> > > > > > +	status =  decode_cb_op_status(xdr, OP_CB_RECALL_ANY, &cb->cb_status);
> > > > > > +	return status;
> > > > > > +}
> > > > > > +
> > > > > >    #ifdef CONFIG_NFSD_PNFS
> > > > > >    /*
> > > > > >     * CB_LAYOUTRECALL4args
> > > > > > @@ -783,6 +846,7 @@ static const struct rpc_procinfo nfs4_cb_procedures[] = {
> > > > > >    #endif
> > > > > >    	PROC(CB_NOTIFY_LOCK,	COMPOUND,	cb_notify_lock,	cb_notify_lock),
> > > > > >    	PROC(CB_OFFLOAD,	COMPOUND,	cb_offload,	cb_offload),
> > > > > > +	PROC(CB_RECALL_ANY,	COMPOUND,	cb_recall_any,	cb_recall_any),
> > > > > >    };
> > > > > >    
> > > > > >    static unsigned int nfs4_cb_counts[ARRAY_SIZE(nfs4_cb_procedures)];
> > > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > > index 4e718500a00c..c60c937dece6 100644
> > > > > > --- a/fs/nfsd/nfs4state.c
> > > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > > @@ -2854,6 +2854,31 @@ static const struct tree_descr client_files[] = {
> > > > > >    	[3] = {""},
> > > > > >    };
> > > > > >    
> > > > > > +static int
> > > > > > +nfsd4_cb_recall_any_done(struct nfsd4_callback *cb,
> > > > > > +			struct rpc_task *task)
> > > > > > +{
> > > > > > +	switch (task->tk_status) {
> > > > > > +	case -NFS4ERR_DELAY:
> > > > > > +		rpc_delay(task, 2 * HZ);
> > > > > > +		return 0;
> > > > > > +	default:
> > > > > > +		return 1;
> > > > > > +	}
> > > > > > +}
> > > > > > +
> > > > > > +static void
> > > > > > +nfsd4_cb_recall_any_release(struct nfsd4_callback *cb)
> > > > > > +{
> > > > > > +	cb->cb_clp->cl_recall_any_busy = false;
> > > > > > +	atomic_dec(&cb->cb_clp->cl_rpc_users);
> > > > > > +}
> > > > > This series probably ought to be one big patch. The problem is that
> > > > > you're adding a bunch of code to do CB_RECALL_ANY, but there is no way
> > > > > to call it without patch #2.
> > > > The reason I separated these patches is that the 1st patch, adding support
> > > > CB_RECALL_ANY can be called for other purposes than just for delegation such
> > > > as to recall pnfs layouts, or when the max number of delegation is reached.
> > > > So that was why I did not combined this patches. However, I understand your
> > > > concern about not being able to test individual patch. So as Chuck suggested,
> > > > perhaps we can leave these as separate patches for easier review and when it's
> > > > finalized we can decide to combine them in to one big patch.  BTW, I plan to
> > > > add a third patch to this series to send CB_RECALL_ANY to clients when the
> > > > max number of delegations is reached.
> > > > 
> > > I think we should get this bit sorted out first,
> > 
> > ok
> > 
> > >   but that sounds
> > > reasonable eventually.
> > > 
> > > > > That makes it hard to judge whether there could be races and locking
> > > > > issues around the handling of cb_recall_any_busy, in particular. From
> > > > > patch #2, it looks like cb_recall_any_busy is protected by the
> > > > > nn->client_lock, but I don't think ->release is called with that held.
> > > > I don't intended to use the nn->client_lock, I think the scope of this
> > > > lock is too big for what's needed to serialize access to struct nfsd4_callback.
> > > > As I mentioned in the cover email, since the cb_recall_any_busy is only
> > > > used by the deleg_reaper we do not need a lock to protect this flag.
> > > > But if there is another of consumer, other than deleg_reaper, of this
> > > > nfsd4_callback then we can add a simple spinlock for it.
> > > > 
> > > > My question is do you think we need to add the spinlock now instead of
> > > > delaying it until there is real need?
> > > > 
> > > I don't see the need for a dedicated spinlock here. You said above that
> > > there is only one of these per client, so you could use the
> > > client->cl_lock.
> > > 
> > > But...I don't see the problem with doing just using the nn->client_lock
> > > here. It's not like we're likely to be calling this that often, and if
> > > we do then the contention for the nn->client_lock is probably the least
> > > of our worries.
> > 
> > If the contention on nn->client_lock is not a concern then I just
> > leave the patch to use the nn->client_lock as it current does.
> > 
> 
> Except you aren't taking the client_lock in ->release. That's what needs
> to be added if you want to keep this boolean.
> 
> > > 
> > > Honestly, do we need this boolean at all? The only place it's checked is
> > > in deleg_reaper. Why not just try to submit the work and if it's already
> > > queued, let it fail?
> > 
> > There is nothing in the existing code to prevent the nfs4_callback from
> > being used again before the current CB_RECALL_ANY request completes. This
> > resulted in se_cb_seq_nr becomes out of sync with the client and server
> > starts getting NFS4ERR_SEQ_MISORDERED then eventually NFS4ERR_BADSESSION
> > from the client.
> > 
> > nfsd4_recall_file_layout has similar usage of nfs4_callback and it uses
> > the ls_lock to make sure the current request is done before allowing new
> > one to proceed.
> > 
> 
> That's a little different. The ls_lock protects ls_recalled, which is
> set when a recall is issued. We only want to issue a recall for a
> delegation once and that's what ensures that it's only issued once.
> 
> CB_RECALL_ANY could be called more than once per client. We don't need
> to ensure "exactly once" semantics there. All of the callbacks are run
> out of workqueues.
> 
> If a workqueue job is already scheduled then queue_work will return
> false when called. Could you just do away with this boolean and rely on
> the return code from queue_work to ensure that it doesn't get scheduled
> too often?
> 
> > > 
> > > > > Also, cl_rpc_users is a refcount (though we don't necessarily free the
> > > > > object when it goes to zero). I think you need to call
> > > > > put_client_renew_locked here instead of just decrementing the counter.
> > > > Since put_client_renew_locked() also renews the client lease, I don't
> > > > think it's right nfsd4_cb_recall_any_release to renew the lease because
> > > > because this is a callback so the client is not actually sending any
> > > > request that causes the lease to renewed, and nfsd4_cb_recall_any_release
> > > > is also alled even if the client is completely dead and did not reply, or
> > > > reply with some errors.
> > > > 
> > > What happens when this atomic_inc makes the cl_rpc_count go to zero?
> > 
> > Do you mean atomic_dec of cl_rpc_users?
> > 
> 
> Yes, sorry.
> 
> > > What actually triggers the cleanup activities in put_client_renew /
> > > put_client_renew_locked in that situation?
> > 
> > maybe I'm missing something, but I don't see any client cleanup
> > in put_client_renew/put_client_renew_locked other than renewing
> > the lease?
> > 
> > 
> 
>         if (!is_client_expired(clp))
>                 renew_client_locked(clp);
>         else                                         
>                 wake_up_all(&expiry_wq);
> 
> 
> ...unless the client has already expired, in which case you need to wake
> up the waitqueue. My guess is that if the atomic_dec you're calling here
> goes to zero then any tasks on the expiry_wq will hang indefinitely.
> 

I'm not sure you need to take a reference to the client here at all. Can
the client go away while a callback job is still running? You may be
able to assume that it will stick around for the life of the callback
(though you should verify this before assuming it).

> > > 
> > > > > > +
> > > > > > +static const struct nfsd4_callback_ops nfsd4_cb_recall_any_ops = {
> > > > > > +	.done		= nfsd4_cb_recall_any_done,
> > > > > > +	.release	= nfsd4_cb_recall_any_release,
> > > > > > +};
> > > > > > +
> > > > > >    static struct nfs4_client *create_client(struct xdr_netobj name,
> > > > > >    		struct svc_rqst *rqstp, nfs4_verifier *verf)
> > > > > >    {
> > > > > > @@ -2891,6 +2916,8 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
> > > > > >    		free_client(clp);
> > > > > >    		return NULL;
> > > > > >    	}
> > > > > > +	nfsd4_init_cb(&clp->cl_recall_any, clp, &nfsd4_cb_recall_any_ops,
> > > > > > +			NFSPROC4_CLNT_CB_RECALL_ANY);
> > > > > >    	return clp;
> > > > > >    }
> > > > > >    
> > > > > > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > > > > > index e2daef3cc003..49ca06169642 100644
> > > > > > --- a/fs/nfsd/state.h
> > > > > > +++ b/fs/nfsd/state.h
> > > > > > @@ -411,6 +411,10 @@ struct nfs4_client {
> > > > > >    
> > > > > >    	unsigned int		cl_state;
> > > > > >    	atomic_t		cl_delegs_in_recall;
> > > > > > +
> > > > > > +	bool			cl_recall_any_busy;
> > > > > > +	uint32_t		cl_recall_any_bm;
> > > > > > +	struct nfsd4_callback	cl_recall_any;
> > > > > >    };
> > > > > >    
> > > > > >    /* struct nfs4_client_reset
> > > > > > @@ -639,8 +643,12 @@ enum nfsd4_cb_op {
> > > > > >    	NFSPROC4_CLNT_CB_OFFLOAD,
> > > > > >    	NFSPROC4_CLNT_CB_SEQUENCE,
> > > > > >    	NFSPROC4_CLNT_CB_NOTIFY_LOCK,
> > > > > > +	NFSPROC4_CLNT_CB_RECALL_ANY,
> > > > > >    };
> > > > > >    
> > > > > > +#define RCA4_TYPE_MASK_RDATA_DLG	0
> > > > > > +#define RCA4_TYPE_MASK_WDATA_DLG	1
> > > > > > +
> > > > > >    /* Returns true iff a is later than b: */
> > > > > >    static inline bool nfsd4_stateid_generation_after(stateid_t *a, stateid_t *b)
> > > > > >    {
> > > > > > diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
> > > > > > index 547cf07cf4e0..0d39af1b00a0 100644
> > > > > > --- a/fs/nfsd/xdr4cb.h
> > > > > > +++ b/fs/nfsd/xdr4cb.h
> > > > > > @@ -48,3 +48,9 @@
> > > > > >    #define NFS4_dec_cb_offload_sz		(cb_compound_dec_hdr_sz  +      \
> > > > > >    					cb_sequence_dec_sz +            \
> > > > > >    					op_dec_sz)
> > > > > > +#define NFS4_enc_cb_recall_any_sz	(cb_compound_enc_hdr_sz +       \
> > > > > > +					cb_sequence_enc_sz +            \
> > > > > > +					1 + 1 + 1)
> > > > > > +#define NFS4_dec_cb_recall_any_sz	(cb_compound_dec_hdr_sz  +      \
> > > > > > +					cb_sequence_dec_sz +            \
> > > > > > +					op_dec_sz)
> 

-- 
Jeff Layton <jlayton@xxxxxxxxxx>