> On Jul 11, 2023, at 11:05 AM, Jeff Layton <jlayton@xxxxxxxxxx> wrote:
> 
> On Tue, 2023-07-11 at 12:39 +0000, Chuck Lever III wrote:
>> 
>>> On Jul 11, 2023, at 7:57 AM, Jeff Layton <jlayton@xxxxxxxxxx> wrote:
>>> 
>>> On Mon, 2023-07-10 at 22:24 -0400, Chuck Lever wrote:
>>>> On Mon, Jul 10, 2023 at 09:06:02PM -0400, Jeff Layton wrote:
>>>>> On Tue, 2023-07-11 at 00:58 +0000, Chuck Lever III wrote:
>>>>>> 
>>>>>>> On Jul 10, 2023, at 2:24 PM, Jeff Layton <jlayton@xxxxxxxxxx> wrote:
>>>>>>> 
>>>>>>> On Mon, 2023-07-10 at 12:42 -0400, Chuck Lever wrote:
>>>>>>>> From: Chuck Lever <chuck.lever@xxxxxxxxxx>
>>>>>>>> 
>>>>>>>> We want a thread lookup operation that can be done with RCU only,
>>>>>>>> but also we want to avoid the linked-list walk, which does not scale
>>>>>>>> well in the number of pool threads.
>>>>>>>> 
>>>>>>>> This patch splits out the use of the sp_lock to protect the set
>>>>>>>> of threads. Svc thread information is now protected by the xarray's
>>>>>>>> lock (when making thread count changes) and the RCU read lock (when
>>>>>>>> only looking up a thread).
>>>>>>>> 
>>>>>>>> Since thread count changes are done only via nfsd filesystem API,
>>>>>>>> which runs only in process context, we can safely dispense with the
>>>>>>>> use of a bottom-half-disabled lock.
>>>>>>>> 
>>>>>>>> Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx>
>>>>>>>> ---
>>>>>>>>  fs/nfsd/nfssvc.c              |  3 +-
>>>>>>>>  include/linux/sunrpc/svc.h    | 11 +++----
>>>>>>>>  include/trace/events/sunrpc.h | 47 ++++++++++++++++++++++++++++-
>>>>>>>>  net/sunrpc/svc.c              | 67 +++++++++++++++++++++++++----------------
>>>>>>>>  net/sunrpc/svc_xprt.c         |  2 +
>>>>>>>>  5 files changed, 94 insertions(+), 36 deletions(-)
>>>>>>>> 
>>>>>>>> diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
>>>>>>>> index 2154fa63c5f2..d42b2a40c93c 100644
>>>>>>>> --- a/fs/nfsd/nfssvc.c
>>>>>>>> +++ b/fs/nfsd/nfssvc.c
>>>>>>>> @@ -62,8 +62,7 @@ static __be32 nfsd_init_request(struct svc_rqst *,
>>>>>>>>   * If (out side the lock) nn->nfsd_serv is non-NULL, then it must point to a
>>>>>>>>   * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0 (unless
>>>>>>>>   * nn->keep_active is set). That number of nfsd threads must
>>>>>>>> - * exist and each must be listed in ->sp_all_threads in some entry of
>>>>>>>> - * ->sv_pools[].
>>>>>>>> + * exist and each must be listed in some entry of ->sv_pools[].
>>>>>>>>   *
>>>>>>>>   * Each active thread holds a counted reference on nn->nfsd_serv, as does
>>>>>>>>   * the nn->keep_active flag and various transient calls to svc_get().
>>>>>>>> diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
>>>>>>>> index 9dd3b16cc4c2..86377506a514 100644
>>>>>>>> --- a/include/linux/sunrpc/svc.h
>>>>>>>> +++ b/include/linux/sunrpc/svc.h
>>>>>>>> @@ -32,10 +32,10 @@
>>>>>>>>   */
>>>>>>>>  struct svc_pool {
>>>>>>>> 	unsigned int		sp_id;		/* pool id; also node id on NUMA */
>>>>>>>> -	spinlock_t		sp_lock;	/* protects all fields */
>>>>>>>> +	spinlock_t		sp_lock;	/* protects sp_sockets */
>>>>>>>> 	struct list_head	sp_sockets;	/* pending sockets */
>>>>>>>> 	unsigned int		sp_nrthreads;	/* # of threads in pool */
>>>>>>>> -	struct list_head	sp_all_threads;	/* all server threads */
>>>>>>>> +	struct xarray		sp_thread_xa;
>>>>>>>> 
>>>>>>>> 	/* statistics on pool operation */
>>>>>>>> 	struct percpu_counter	sp_messages_arrived;
>>>>>>>> @@ -196,7 +196,6 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp);
>>>>>>>>   * processed.
>>>>>>>>   */
>>>>>>>>  struct svc_rqst {
>>>>>>>> -	struct list_head	rq_all;		/* all threads list */
>>>>>>>> 	struct rcu_head		rq_rcu_head;	/* for RCU deferred kfree */
>>>>>>>> 	struct svc_xprt *	rq_xprt;	/* transport ptr */
>>>>>>>> 
>>>>>>>> @@ -241,10 +240,10 @@ struct svc_rqst {
>>>>>>>>  #define RQ_SPLICE_OK	(4)			/* turned off in gss privacy
>>>>>>>> 						 * to prevent encrypting page
>>>>>>>> 						 * cache pages */
>>>>>>>> -#define RQ_VICTIM	(5)			/* about to be shut down */
>>>>>>>> -#define RQ_BUSY		(6)			/* request is busy */
>>>>>>>> -#define RQ_DATA		(7)			/* request has data */
>>>>>>>> +#define RQ_BUSY		(5)			/* request is busy */
>>>>>>>> +#define RQ_DATA		(6)			/* request has data */
>>>>>>>> 	unsigned long		rq_flags;	/* flags field */
>>>>>>>> +	u32			rq_thread_id;	/* xarray index */
>>>>>>>> 	ktime_t			rq_qtime;	/* enqueue time */
>>>>>>>> 
>>>>>>>> 	void *			rq_argp;	/* decoded arguments */
>>>>>>>> diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
>>>>>>>> index 60c8e03268d4..ea43c6059bdb 100644
>>>>>>>> --- a/include/trace/events/sunrpc.h
>>>>>>>> +++ b/include/trace/events/sunrpc.h
>>>>>>>> @@ -1676,7 +1676,6 @@ DEFINE_SVCXDRBUF_EVENT(sendto);
>>>>>>>> 	svc_rqst_flag(USEDEFERRAL)					\
>>>>>>>> 	svc_rqst_flag(DROPME)						\
>>>>>>>> 	svc_rqst_flag(SPLICE_OK)					\
>>>>>>>> -	svc_rqst_flag(VICTIM)						\
>>>>>>>> 	svc_rqst_flag(BUSY)						\
>>>>>>>> 	svc_rqst_flag_end(DATA)
>>>>>>>> 
>>>>>>>> @@ -2118,6 +2117,52 @@ TRACE_EVENT(svc_pool_starved,
>>>>>>>> 	)
>>>>>>>>  );
>>>>>>>> 
>>>>>>>> +DECLARE_EVENT_CLASS(svc_thread_lifetime_class,
>>>>>>>> +	TP_PROTO(
>>>>>>>> +		const struct svc_serv *serv,
>>>>>>>> +		const struct svc_pool *pool,
>>>>>>>> +		const struct svc_rqst *rqstp
>>>>>>>> +	),
>>>>>>>> +
>>>>>>>> +	TP_ARGS(serv, pool, rqstp),
>>>>>>>> +
>>>>>>>> +	TP_STRUCT__entry(
>>>>>>>> +		__string(name, serv->sv_name)
>>>>>>>> +		__field(int, pool_id)
>>>>>>>> +		__field(unsigned int, nrthreads)
>>>>>>>> +		__field(unsigned long, pool_flags)
>>>>>>>> +		__field(u32, thread_id)
>>>>>>>> +		__field(const void *, rqstp)
>>>>>>>> +	),
>>>>>>>> +
>>>>>>>> +	TP_fast_assign(
>>>>>>>> +		__assign_str(name, serv->sv_name);
>>>>>>>> +		__entry->pool_id = pool->sp_id;
>>>>>>>> +		__entry->nrthreads = pool->sp_nrthreads;
>>>>>>>> +		__entry->pool_flags = pool->sp_flags;
>>>>>>>> +		__entry->thread_id = rqstp->rq_thread_id;
>>>>>>>> +		__entry->rqstp = rqstp;
>>>>>>>> +	),
>>>>>>>> +
>>>>>>>> +	TP_printk("service=%s pool=%d pool_flags=%s nrthreads=%u thread_id=%u",
>>>>>>>> +		__get_str(name), __entry->pool_id,
>>>>>>>> +		show_svc_pool_flags(__entry->pool_flags),
>>>>>>>> +		__entry->nrthreads, __entry->thread_id
>>>>>>>> +	)
>>>>>>>> +);
>>>>>>>> +
>>>>>>>> +#define DEFINE_SVC_THREAD_LIFETIME_EVENT(name) \
>>>>>>>> +	DEFINE_EVENT(svc_thread_lifetime_class, svc_pool_##name, \
>>>>>>>> +	TP_PROTO( \
>>>>>>>> +		const struct svc_serv *serv, \
>>>>>>>> +		const struct svc_pool *pool, \
>>>>>>>> +		const struct svc_rqst *rqstp \
>>>>>>>> +	), \
>>>>>>>> +	TP_ARGS(serv, pool, rqstp))
>>>>>>>> +
>>>>>>>> +DEFINE_SVC_THREAD_LIFETIME_EVENT(thread_init);
>>>>>>>> +DEFINE_SVC_THREAD_LIFETIME_EVENT(thread_exit);
>>>>>>>> +
>>>>>>>>  DECLARE_EVENT_CLASS(svc_xprt_event,
>>>>>>>> 	TP_PROTO(
>>>>>>>> 		const struct svc_xprt *xprt
>>>>>>>> diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
>>>>>>>> index ad29df00b454..109d7f047385 100644
>>>>>>>> --- a/net/sunrpc/svc.c
>>>>>>>> +++ b/net/sunrpc/svc.c
>>>>>>>> @@ -507,8 +507,8 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
>>>>>>>> 
>>>>>>>> 		pool->sp_id = i;
>>>>>>>> 		INIT_LIST_HEAD(&pool->sp_sockets);
>>>>>>>> -		INIT_LIST_HEAD(&pool->sp_all_threads);
>>>>>>>> 		spin_lock_init(&pool->sp_lock);
>>>>>>>> +		xa_init_flags(&pool->sp_thread_xa, XA_FLAGS_ALLOC);
>>>>>>>> 
>>>>>>>> 		percpu_counter_init(&pool->sp_messages_arrived, 0, GFP_KERNEL);
>>>>>>>> 		percpu_counter_init(&pool->sp_sockets_queued, 0, GFP_KERNEL);
>>>>>>>> @@ -596,6 +596,8 @@ svc_destroy(struct kref *ref)
>>>>>>>> 		percpu_counter_destroy(&pool->sp_threads_timedout);
>>>>>>>> 		percpu_counter_destroy(&pool->sp_threads_starved);
>>>>>>>> 		percpu_counter_destroy(&pool->sp_threads_no_work);
>>>>>>>> +
>>>>>>>> +		xa_destroy(&pool->sp_thread_xa);
>>>>>>>> 	}
>>>>>>>> 	kfree(serv->sv_pools);
>>>>>>>> 	kfree(serv);
>>>>>>>> @@ -676,7 +678,11 @@ EXPORT_SYMBOL_GPL(svc_rqst_alloc);
>>>>>>>>  static struct svc_rqst *
>>>>>>>>  svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
>>>>>>>>  {
>>>>>>>> +	struct xa_limit limit = {
>>>>>>>> +		.max = U32_MAX,
>>>>>>>> +	};
>>>>>>>> 	struct svc_rqst	*rqstp;
>>>>>>>> +	int ret;
>>>>>>>> 
>>>>>>>> 	rqstp = svc_rqst_alloc(serv, pool, node);
>>>>>>>> 	if (!rqstp)
>>>>>>>> @@ -687,11 +693,21 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
>>>>>>>> 	serv->sv_nrthreads += 1;
>>>>>>>> 	spin_unlock_bh(&serv->sv_lock);
>>>>>>>> 
>>>>>>>> -	spin_lock_bh(&pool->sp_lock);
>>>>>>>> +	xa_lock(&pool->sp_thread_xa);
>>>>>>>> +	ret = __xa_alloc(&pool->sp_thread_xa, &rqstp->rq_thread_id, rqstp,
>>>>>>>> +			 limit, GFP_KERNEL);
>>>>>>>> +	if (ret) {
>>>>>>>> +		xa_unlock(&pool->sp_thread_xa);
>>>>>>>> +		goto out_free;
>>>>>>>> +	}
>>>>>>>> 	pool->sp_nrthreads++;
>>>>>>>> -	list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads);
>>>>>>>> -	spin_unlock_bh(&pool->sp_lock);
>>>>>>>> +	xa_unlock(&pool->sp_thread_xa);
>>>>>>>> +	trace_svc_pool_thread_init(serv, pool, rqstp);
>>>>>>>> 	return rqstp;
>>>>>>>> +
>>>>>>>> +out_free:
>>>>>>>> +	svc_rqst_free(rqstp);
>>>>>>>> +	return ERR_PTR(ret);
>>>>>>>>  }
>>>>>>>> 
>>>>>>>>  /**
>>>>>>>> @@ -708,19 +724,17 @@ struct svc_rqst *svc_pool_wake_idle_thread(struct svc_serv *serv,
>>>>>>>> 					   struct svc_pool *pool)
>>>>>>>>  {
>>>>>>>> 	struct svc_rqst	*rqstp;
>>>>>>>> +	unsigned long	index;
>>>>>>>> 
>>>>>>>> -	rcu_read_lock();
>>>>>>> 
>>>>>>> 
>>>>>>> While it does do its own locking, the resulting object that xa_for_each
>>>>>>> returns needs some protection too. Between xa_for_each returning a rqstp
>>>>>>> and calling test_and_set_bit, could the rqstp be freed? I suspect so,
>>>>>>> and I think you probably need to keep the rcu_read_lock() call above.
>>>>>> 
>>>>>> Should I keep the rcu_read_lock() even with the bitmap/xa_load
>>>>>> version of svc_pool_wake_idle_thread() in 9/9 ?
>>>>>> 
>>>>> 
>>>>> Yeah, I think you have to. We're not doing real locking on the search or
>>>>> taking references, so nothing else will ensure that the rqstp will stick
>>>>> around once you've found it. I think you have to hold it until after
>>>>> wake_up_process (at least).
>>>> 
>>>> I can keep the RCU read lock around the search and xa_load(). But
>>>> I notice that the code we're replacing releases the RCU read lock
>>>> before calling wake_up_process(). Not saying that's right, but we
>>>> haven't had a problem reported.
>>>> 
>>>> 
>>> 
>>> Understood. Given that we're not sleeping in that section, it's quite
>>> possible that the RCU callbacks just never have a chance to run before
>>> we wake the thing up and so you never hit the problem.
>>> 
>>> Still, I think it'd be best to just keep the rcu_read_lock around that
>>> whole block.
>>> It's relatively cheap and safe to take it recursively, and
>>> that makes it explicit that the found rqst mustn't vanish before the
>>> wakeup is done.
>> 
>> My point is that since the existing code doesn't hold the
>> RCU read lock for the wake_up_process() call, either it's
>> unnecessary or the existing code is broken and needs a
>> back-portable fix.
>> 
>> Do you think the existing code is broken?
>> 
> 
> Actually, it varies. svc_xprt_enqueue calls wake_up_process with the
> rcu_read_lock held. svc_wake_up doesn't though.

Good catch. I'll change 1/9 to hold the RCU read lock until after the
wake_up_process() call, and then leave that pattern in place through
the rest of the series. 1/9 will be straightforward to backport if
that should be necessary.

> So yes, I think svc_wake_up is broken, though I imagine this is hard to
> hit outside of some very specific circumstances. Here is the existing
> svc_wake_up block:
> 
> 	rcu_read_lock();
> 	list_for_each_entry_rcu(rqstp, &pool->sp_all_threads, rq_all) {
> 		/* skip any that aren't queued */
> 		if (test_bit(RQ_BUSY, &rqstp->rq_flags))
> 			continue;
> 		rcu_read_unlock();
> 		wake_up_process(rqstp->rq_task);
> 		trace_svc_wake_up(rqstp->rq_task->pid);
> 		return;
> 	}
> 	rcu_read_unlock();
> 
> Once rcu_read_unlock is called, rqstp could technically be freed before
> we go to dereference the pointer. I don't see anything that prevents
> that from occurring, though you'd have to be doing this while the
> service's thread count is being reduced (which would cause entries to be
> freed).
> 
> Worth fixing, IMO, but probably not worth sending to stable.
> -- 
> Jeff Layton <jlayton@xxxxxxxxxx>

-- 
Chuck Lever
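
[Illustrative sketch only, not the actual 1/9 patch: one way to apply the
pattern discussed above to the svc_wake_up() block Jeff quoted is simply
to delay rcu_read_unlock() until after the wakeup, so the rqstp found
during the lockless list walk cannot be freed before it is dereferenced.]

	rcu_read_lock();
	list_for_each_entry_rcu(rqstp, &pool->sp_all_threads, rq_all) {
		/* skip any that aren't queued */
		if (test_bit(RQ_BUSY, &rqstp->rq_flags))
			continue;
		/* Keep the RCU read lock held across the wakeup so that
		 * rqstp and rqstp->rq_task cannot be freed by a concurrent
		 * reduction of the pool's thread count. */
		wake_up_process(rqstp->rq_task);
		trace_svc_wake_up(rqstp->rq_task->pid);
		rcu_read_unlock();
		return;
	}
	rcu_read_unlock();

The same consideration would apply to the xarray-based
svc_pool_wake_idle_thread() in this series: take rcu_read_lock() before
the xa_for_each()/xa_load() lookup and drop it only after
wake_up_process() has been called on the thread that was found.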