> On Jul 11, 2023, at 11:05 AM, Jeff Layton <jlayton@xxxxxxxxxx> wrote:
> 
> On Tue, 2023-07-11 at 12:39 +0000, Chuck Lever III wrote:
>> 
>>> On Jul 11, 2023, at 7:57 AM, Jeff Layton <jlayton@xxxxxxxxxx> wrote:
>>> 
>>> On Mon, 2023-07-10 at 22:24 -0400, Chuck Lever wrote:
>>>> On Mon, Jul 10, 2023 at 09:06:02PM -0400, Jeff Layton wrote:
>>>>> On Tue, 2023-07-11 at 00:58 +0000, Chuck Lever III wrote:
>>>>>> 
>>>>>>> On Jul 10, 2023, at 2:24 PM, Jeff Layton <jlayton@xxxxxxxxxx> wrote:
>>>>>>> 
>>>>>>> On Mon, 2023-07-10 at 12:42 -0400, Chuck Lever wrote:
>>>>>>>> From: Chuck Lever <chuck.lever@xxxxxxxxxx>
>>>>>>>> 
>>>>>>>> We want a thread lookup operation that can be done with RCU only,
>>>>>>>> but also we want to avoid the linked-list walk, which does not scale
>>>>>>>> well in the number of pool threads.
>>>>>>>> 
>>>>>>>> This patch splits out the use of the sp_lock to protect the set
>>>>>>>> of threads. Svc thread information is now protected by the xarray's
>>>>>>>> lock (when making thread count changes) and the RCU read lock (when
>>>>>>>> only looking up a thread).
>>>>>>>> 
>>>>>>>> Since thread count changes are done only via nfsd filesystem API,
>>>>>>>> which runs only in process context, we can safely dispense with the
>>>>>>>> use of a bottom-half-disabled lock.
>>>>>>>> 
>>>>>>>> Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx>
>>>>>>>> ---
>>>>>>>>  fs/nfsd/nfssvc.c              |  3 +-
>>>>>>>>  include/linux/sunrpc/svc.h    | 11 +++----
>>>>>>>>  include/trace/events/sunrpc.h | 47 ++++++++++++++++++++++++++++-
>>>>>>>>  net/sunrpc/svc.c              | 67 +++++++++++++++++++++++++----------------
>>>>>>>>  net/sunrpc/svc_xprt.c         |  2 +
>>>>>>>>  5 files changed, 94 insertions(+), 36 deletions(-)
>>>>>>>> 
>>>>>>>> diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
>>>>>>>> index 2154fa63c5f2..d42b2a40c93c 100644
>>>>>>>> --- a/fs/nfsd/nfssvc.c
>>>>>>>> +++ b/fs/nfsd/nfssvc.c
>>>>>>>> @@ -62,8 +62,7 @@ static __be32 nfsd_init_request(struct svc_rqst *,
>>>>>>>>   * If (out side the lock) nn->nfsd_serv is non-NULL, then it must point to a
>>>>>>>>   * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0 (unless
>>>>>>>>   * nn->keep_active is set). That number of nfsd threads must
>>>>>>>> - * exist and each must be listed in ->sp_all_threads in some entry of
>>>>>>>> - * ->sv_pools[].
>>>>>>>> + * exist and each must be listed in some entry of ->sv_pools[].
>>>>>>>>   *
>>>>>>>>   * Each active thread holds a counted reference on nn->nfsd_serv, as does
>>>>>>>>   * the nn->keep_active flag and various transient calls to svc_get().
>>>>>>>> diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
>>>>>>>> index 9dd3b16cc4c2..86377506a514 100644
>>>>>>>> --- a/include/linux/sunrpc/svc.h
>>>>>>>> +++ b/include/linux/sunrpc/svc.h
>>>>>>>> @@ -32,10 +32,10 @@
>>>>>>>>   */
>>>>>>>>  struct svc_pool {
>>>>>>>> 	unsigned int		sp_id;		/* pool id; also node id on NUMA */
>>>>>>>> -	spinlock_t		sp_lock;	/* protects all fields */
>>>>>>>> +	spinlock_t		sp_lock;	/* protects sp_sockets */
>>>>>>>> 	struct list_head	sp_sockets;	/* pending sockets */
>>>>>>>> 	unsigned int		sp_nrthreads;	/* # of threads in pool */
>>>>>>>> -	struct list_head	sp_all_threads;	/* all server threads */
>>>>>>>> +	struct xarray		sp_thread_xa;
>>>>>>>> 
>>>>>>>> 	/* statistics on pool operation */
>>>>>>>> 	struct percpu_counter	sp_messages_arrived;
>>>>>>>> @@ -196,7 +196,6 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp);
>>>>>>>>   * processed.
>>>>>>>>   */
>>>>>>>>  struct svc_rqst {
>>>>>>>> -	struct list_head	rq_all;		/* all threads list */
>>>>>>>> 	struct rcu_head		rq_rcu_head;	/* for RCU deferred kfree */
>>>>>>>> 	struct svc_xprt *	rq_xprt;	/* transport ptr */
>>>>>>>> 
>>>>>>>> @@ -241,10 +240,10 @@ struct svc_rqst {
>>>>>>>>  #define RQ_SPLICE_OK	(4)			/* turned off in gss privacy
>>>>>>>> 						 * to prevent encrypting page
>>>>>>>> 						 * cache pages */
>>>>>>>> -#define RQ_VICTIM	(5)			/* about to be shut down */
>>>>>>>> -#define RQ_BUSY		(6)			/* request is busy */
>>>>>>>> -#define RQ_DATA		(7)			/* request has data */
>>>>>>>> +#define RQ_BUSY		(5)			/* request is busy */
>>>>>>>> +#define RQ_DATA		(6)			/* request has data */
>>>>>>>> 	unsigned long		rq_flags;	/* flags field */
>>>>>>>> +	u32			rq_thread_id;	/* xarray index */
>>>>>>>> 	ktime_t			rq_qtime;	/* enqueue time */
>>>>>>>> 
>>>>>>>> 	void *			rq_argp;	/* decoded arguments */
>>>>>>>> diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
>>>>>>>> index 60c8e03268d4..ea43c6059bdb 100644
>>>>>>>> --- a/include/trace/events/sunrpc.h
>>>>>>>> +++ b/include/trace/events/sunrpc.h
>>>>>>>> @@ -1676,7 +1676,6 @@ DEFINE_SVCXDRBUF_EVENT(sendto);
>>>>>>>> 	svc_rqst_flag(USEDEFERRAL)					\
>>>>>>>> 	svc_rqst_flag(DROPME)						\
>>>>>>>> 	svc_rqst_flag(SPLICE_OK)					\
>>>>>>>> -	svc_rqst_flag(VICTIM)						\
>>>>>>>> 	svc_rqst_flag(BUSY)						\
>>>>>>>> 	svc_rqst_flag_end(DATA)
>>>>>>>> 
>>>>>>>> @@ -2118,6 +2117,52 @@ TRACE_EVENT(svc_pool_starved,
>>>>>>>> 	)
>>>>>>>>  );
>>>>>>>> 
>>>>>>>> +DECLARE_EVENT_CLASS(svc_thread_lifetime_class,
>>>>>>>> +	TP_PROTO(
>>>>>>>> +		const struct svc_serv *serv,
>>>>>>>> +		const struct svc_pool *pool,
>>>>>>>> +		const struct svc_rqst *rqstp
>>>>>>>> +	),
>>>>>>>> +
>>>>>>>> +	TP_ARGS(serv, pool, rqstp),
>>>>>>>> +
>>>>>>>> +	TP_STRUCT__entry(
>>>>>>>> +		__string(name, serv->sv_name)
>>>>>>>> +		__field(int, pool_id)
>>>>>>>> +		__field(unsigned int, nrthreads)
>>>>>>>> +		__field(unsigned long, pool_flags)
>>>>>>>> +		__field(u32, thread_id)
>>>>>>>> +		__field(const void *, rqstp)
>>>>>>>> +	),
>>>>>>>> +
>>>>>>>> +	TP_fast_assign(
>>>>>>>> +		__assign_str(name, serv->sv_name);
>>>>>>>> +		__entry->pool_id = pool->sp_id;
>>>>>>>> +		__entry->nrthreads = pool->sp_nrthreads;
>>>>>>>> +		__entry->pool_flags = pool->sp_flags;
>>>>>>>> +		__entry->thread_id = rqstp->rq_thread_id;
>>>>>>>> +		__entry->rqstp = rqstp;
>>>>>>>> +	),
>>>>>>>> +
>>>>>>>> +	TP_printk("service=%s pool=%d pool_flags=%s nrthreads=%u thread_id=%u",
>>>>>>>> +		__get_str(name), __entry->pool_id,
>>>>>>>> +		show_svc_pool_flags(__entry->pool_flags),
>>>>>>>> +		__entry->nrthreads, __entry->thread_id
>>>>>>>> +	)
>>>>>>>> +);
>>>>>>>> +
>>>>>>>> +#define DEFINE_SVC_THREAD_LIFETIME_EVENT(name) \
>>>>>>>> +	DEFINE_EVENT(svc_thread_lifetime_class, svc_pool_##name, \
>>>>>>>> +	TP_PROTO( \
>>>>>>>> +		const struct svc_serv *serv, \
>>>>>>>> +		const struct svc_pool *pool, \
>>>>>>>> +		const struct svc_rqst *rqstp \
>>>>>>>> +	), \
>>>>>>>> +	TP_ARGS(serv, pool, rqstp))
>>>>>>>> +
>>>>>>>> +DEFINE_SVC_THREAD_LIFETIME_EVENT(thread_init);
>>>>>>>> +DEFINE_SVC_THREAD_LIFETIME_EVENT(thread_exit);
>>>>>>>> +
>>>>>>>>  DECLARE_EVENT_CLASS(svc_xprt_event,
>>>>>>>> 	TP_PROTO(
>>>>>>>> 		const struct svc_xprt *xprt
>>>>>>>> diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
>>>>>>>> index ad29df00b454..109d7f047385 100644
>>>>>>>> --- a/net/sunrpc/svc.c
>>>>>>>> +++ b/net/sunrpc/svc.c
>>>>>>>> @@ -507,8 +507,8 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
>>>>>>>> 
>>>>>>>> 		pool->sp_id = i;
>>>>>>>> 		INIT_LIST_HEAD(&pool->sp_sockets);
>>>>>>>> -		INIT_LIST_HEAD(&pool->sp_all_threads);
>>>>>>>> 		spin_lock_init(&pool->sp_lock);
>>>>>>>> +		xa_init_flags(&pool->sp_thread_xa, XA_FLAGS_ALLOC);
>>>>>>>> 
>>>>>>>> 		percpu_counter_init(&pool->sp_messages_arrived, 0, GFP_KERNEL);
>>>>>>>> 		percpu_counter_init(&pool->sp_sockets_queued, 0, GFP_KERNEL);
>>>>>>>> @@ -596,6 +596,8 @@ svc_destroy(struct kref *ref)
>>>>>>>> 		percpu_counter_destroy(&pool->sp_threads_timedout);
>>>>>>>> 		percpu_counter_destroy(&pool->sp_threads_starved);
>>>>>>>> 		percpu_counter_destroy(&pool->sp_threads_no_work);
>>>>>>>> +
>>>>>>>> +		xa_destroy(&pool->sp_thread_xa);
>>>>>>>> 	}
>>>>>>>> 	kfree(serv->sv_pools);
>>>>>>>> 	kfree(serv);
>>>>>>>> @@ -676,7 +678,11 @@ EXPORT_SYMBOL_GPL(svc_rqst_alloc);
>>>>>>>>  static struct svc_rqst *
>>>>>>>>  svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
>>>>>>>>  {
>>>>>>>> +	struct xa_limit limit = {
>>>>>>>> +		.max = U32_MAX,
>>>>>>>> +	};
>>>>>>>> 	struct svc_rqst	*rqstp;
>>>>>>>> +	int ret;
>>>>>>>> 
>>>>>>>> 	rqstp = svc_rqst_alloc(serv, pool, node);
>>>>>>>> 	if (!rqstp)
>>>>>>>> @@ -687,11 +693,21 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
>>>>>>>> 	serv->sv_nrthreads += 1;
>>>>>>>> 	spin_unlock_bh(&serv->sv_lock);
>>>>>>>> 
>>>>>>>> -	spin_lock_bh(&pool->sp_lock);
>>>>>>>> +	xa_lock(&pool->sp_thread_xa);
>>>>>>>> +	ret = __xa_alloc(&pool->sp_thread_xa, &rqstp->rq_thread_id, rqstp,
>>>>>>>> +			 limit, GFP_KERNEL);
>>>>>>>> +	if (ret) {
>>>>>>>> +		xa_unlock(&pool->sp_thread_xa);
>>>>>>>> +		goto out_free;
>>>>>>>> +	}
>>>>>>>> 	pool->sp_nrthreads++;
>>>>>>>> -	list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads);
>>>>>>>> -	spin_unlock_bh(&pool->sp_lock);
>>>>>>>> +	xa_unlock(&pool->sp_thread_xa);
>>>>>>>> +	trace_svc_pool_thread_init(serv, pool, rqstp);
>>>>>>>> 	return rqstp;
>>>>>>>> +
>>>>>>>> +out_free:
>>>>>>>> +	svc_rqst_free(rqstp);
>>>>>>>> +	return ERR_PTR(ret);
>>>>>>>>  }
>>>>>>>> 
>>>>>>>>  /**
>>>>>>>> @@ -708,19 +724,17 @@ struct svc_rqst *svc_pool_wake_idle_thread(struct svc_serv *serv,
>>>>>>>> 					   struct svc_pool *pool)
>>>>>>>>  {
>>>>>>>> 	struct svc_rqst	*rqstp;
>>>>>>>> +	unsigned long	index;
>>>>>>>> 
>>>>>>>> -	rcu_read_lock();
>>>>>>> 
>>>>>>> 
>>>>>>> While it does do its own locking, the resulting object that xa_for_each
>>>>>>> returns needs some protection too. Between xa_for_each returning a rqstp
>>>>>>> and calling test_and_set_bit, could the rqstp be freed? I suspect so,
>>>>>>> and I think you probably need to keep the rcu_read_lock() call above.
>>>>>> 
>>>>>> Should I keep the rcu_read_lock() even with the bitmap/xa_load
>>>>>> version of svc_pool_wake_idle_thread() in 9/9 ?
>>>>>> 
>>>>> 
>>>>> Yeah, I think you have to. We're not doing real locking on the search or
>>>>> taking references, so nothing else will ensure that the rqstp will stick
>>>>> around once you've found it. I think you have to hold it until after
>>>>> wake_up_process (at least).
>>>> 
>>>> I can keep the RCU read lock around the search and xa_load(). But
>>>> I notice that the code we're replacing releases the RCU read lock
>>>> before calling wake_up_process(). Not saying that's right, but we
>>>> haven't had a problem reported.
>>>> 
>>>> 
>>> 
>>> Understood. Given that we're not sleeping in that section, it's quite
>>> possible that the RCU callbacks just never have a chance to run before
>>> we wake the thing up and so you never hit the problem.
>>> 
>>> Still, I think it'd be best to just keep the rcu_read_lock around that
>>> whole block.
>>> It's relatively cheap and safe to take it recursively, and
>>> that makes it explicit that the found rqst mustn't vanish before the
>>> wakeup is done.
>> 
>> My point is that since the existing code doesn't hold the
>> RCU read lock for the wake_up_process() call, either it's
>> unnecessary or the existing code is broken and needs a
>> back-portable fix.
>> 
>> Do you think the existing code is broken?
>> 
> 
> Actually, it varies. svc_xprt_enqueue calls wake_up_process with the
> rcu_read_lock held. svc_wake_up doesn't though.

Good catch. I'll change 1/9 to hold the RCU read lock until after the
wake_up_process() call, and then leave that pattern in place through
the rest of the series. 1/9 will be straightforward to backport if
that should be necessary.

> So yes, I think svc_wake_up is broken, though I imagine this is hard to
> hit outside of some very specific circumstances. Here is the existing
> svc_wake_up block:
> 
> 	rcu_read_lock();
> 	list_for_each_entry_rcu(rqstp, &pool->sp_all_threads, rq_all) {
> 		/* skip any that aren't queued */
> 		if (test_bit(RQ_BUSY, &rqstp->rq_flags))
> 			continue;
> 		rcu_read_unlock();
> 		wake_up_process(rqstp->rq_task);
> 		trace_svc_wake_up(rqstp->rq_task->pid);
> 		return;
> 	}
> 	rcu_read_unlock();
> 
> Once rcu_read_unlock is called, rqstp could technically be freed before
> we go to dereference the pointer. I don't see anything that prevents
> that from occurring, though you'd have to be doing this while the
> service's thread count is being reduced (which would cause entries to be
> freed).
> 
> Worth fixing, IMO, but probably not worth sending to stable.
> -- 
> Jeff Layton <jlayton@xxxxxxxxxx>

-- 
Chuck Lever
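
[Illustrative sketch only, not the actual 1/9 patch: one way to apply the
pattern discussed above to the svc_wake_up() block Jeff quoted is simply
to delay rcu_read_unlock() until after the wakeup, so the rqstp found
during the lockless list walk cannot be freed before it is dereferenced.]

	rcu_read_lock();
	list_for_each_entry_rcu(rqstp, &pool->sp_all_threads, rq_all) {
		/* skip any that aren't queued */
		if (test_bit(RQ_BUSY, &rqstp->rq_flags))
			continue;
		/* Keep the RCU read lock held across the wakeup so that
		 * rqstp and rqstp->rq_task cannot be freed by a concurrent
		 * reduction of the pool's thread count. */
		wake_up_process(rqstp->rq_task);
		trace_svc_wake_up(rqstp->rq_task->pid);
		rcu_read_unlock();
		return;
	}
	rcu_read_unlock();

The same consideration would apply to the xarray-based
svc_pool_wake_idle_thread() in this series: take rcu_read_lock() before
the xa_for_each()/xa_load() lookup and drop it only after
wake_up_process() has been called on the thread that was found.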