From: Chuck Lever <chuck.lever@xxxxxxxxxx> We want a thread lookup operation that can be done with RCU only, but also to avoid the linked-list walk, which does not scale well in the number of svc threads. BH-disabled locking is no longer necessary because we're no longer sharing the pool's sp_lock to protect either the xarray or the pool's thread count. sp_lock also protects transport activity. As far as I can tell, there are no callers of svc_set_num_threads() that run outside of process context. Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx> --- fs/nfsd/nfssvc.c | 3 +- include/linux/sunrpc/svc.h | 11 +++---- include/trace/events/sunrpc.h | 47 +++++++++++++++++++++++++++++- net/sunrpc/svc.c | 65 +++++++++++++++++++++++++---------------- net/sunrpc/svc_xprt.c | 2 + 5 files changed, 92 insertions(+), 36 deletions(-) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 2154fa63c5f2..d42b2a40c93c 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -62,8 +62,7 @@ static __be32 nfsd_init_request(struct svc_rqst *, * If (out side the lock) nn->nfsd_serv is non-NULL, then it must point to a * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0 (unless * nn->keep_active is set). That number of nfsd threads must - * exist and each must be listed in ->sp_all_threads in some entry of - * ->sv_pools[]. + * exist and each must be listed in some entry of ->sv_pools[]. * * Each active thread holds a counted reference on nn->nfsd_serv, as does * the nn->keep_active flag and various transient calls to svc_get(). 
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index fbfe6ea737c8..45aa7648dca6 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -32,10 +32,10 @@ */ struct svc_pool { unsigned int sp_id; /* pool id; also node id on NUMA */ - spinlock_t sp_lock; /* protects all fields */ + spinlock_t sp_lock; /* protects sp_sockets */ struct list_head sp_sockets; /* pending sockets */ unsigned int sp_nrthreads; /* # of threads in pool */ - struct list_head sp_all_threads; /* all server threads */ + struct xarray sp_thread_xa; /* statistics on pool operation */ struct percpu_counter sp_sockets_queued; @@ -194,7 +194,6 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp); * processed. */ struct svc_rqst { - struct list_head rq_all; /* all threads list */ struct rcu_head rq_rcu_head; /* for RCU deferred kfree */ struct svc_xprt * rq_xprt; /* transport ptr */ @@ -239,10 +238,10 @@ struct svc_rqst { #define RQ_SPLICE_OK (4) /* turned off in gss privacy * to prevent encrypting page * cache pages */ -#define RQ_VICTIM (5) /* about to be shut down */ -#define RQ_BUSY (6) /* request is busy */ -#define RQ_DATA (7) /* request has data */ +#define RQ_BUSY (5) /* request is busy */ +#define RQ_DATA (6) /* request has data */ unsigned long rq_flags; /* flags field */ + u32 rq_thread_id; /* xarray index */ ktime_t rq_qtime; /* enqueue time */ void * rq_argp; /* decoded arguments */ diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 70f3bc22c429..4ec746048f15 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1600,7 +1600,6 @@ DEFINE_SVCXDRBUF_EVENT(sendto); svc_rqst_flag(USEDEFERRAL) \ svc_rqst_flag(DROPME) \ svc_rqst_flag(SPLICE_OK) \ - svc_rqst_flag(VICTIM) \ svc_rqst_flag(BUSY) \ svc_rqst_flag_end(DATA) @@ -2043,6 +2042,52 @@ TRACE_EVENT(svc_pool_starved, ) ); +DECLARE_EVENT_CLASS(svc_thread_lifetime_class, + TP_PROTO( + const struct svc_serv *serv, + const struct svc_pool 
*pool, + const struct svc_rqst *rqstp + ), + + TP_ARGS(serv, pool, rqstp), + + TP_STRUCT__entry( + __string(name, serv->sv_name) + __field(int, pool_id) + __field(unsigned int, nrthreads) + __field(unsigned long, pool_flags) + __field(u32, thread_id) + __field(const void *, rqstp) + ), + + TP_fast_assign( + __assign_str(name, serv->sv_name); + __entry->pool_id = pool->sp_id; + __entry->nrthreads = pool->sp_nrthreads; + __entry->pool_flags = pool->sp_flags; + __entry->thread_id = rqstp->rq_thread_id; + __entry->rqstp = rqstp; + ), + + TP_printk("service=%s pool=%d pool_flags=%s nrthreads=%u thread_id=%u", + __get_str(name), __entry->pool_id, + show_svc_pool_flags(__entry->pool_flags), + __entry->nrthreads, __entry->thread_id + ) +); + +#define DEFINE_SVC_THREAD_LIFETIME_EVENT(name) \ + DEFINE_EVENT(svc_thread_lifetime_class, svc_pool_##name, \ + TP_PROTO( \ + const struct svc_serv *serv, \ + const struct svc_pool *pool, \ + const struct svc_rqst *rqstp \ + ), \ + TP_ARGS(serv, pool, rqstp)) + +DEFINE_SVC_THREAD_LIFETIME_EVENT(thread_init); +DEFINE_SVC_THREAD_LIFETIME_EVENT(thread_exit); + DECLARE_EVENT_CLASS(svc_xprt_event, TP_PROTO( const struct svc_xprt *xprt diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 828d28883ea8..18fbb98895ea 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -507,8 +507,8 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, pool->sp_id = i; INIT_LIST_HEAD(&pool->sp_sockets); - INIT_LIST_HEAD(&pool->sp_all_threads); spin_lock_init(&pool->sp_lock); + xa_init_flags(&pool->sp_thread_xa, XA_FLAGS_ALLOC); percpu_counter_init(&pool->sp_sockets_queued, 0, GFP_KERNEL); percpu_counter_init(&pool->sp_threads_woken, 0, GFP_KERNEL); @@ -592,6 +592,8 @@ svc_destroy(struct kref *ref) percpu_counter_destroy(&pool->sp_threads_woken); percpu_counter_destroy(&pool->sp_threads_timedout); percpu_counter_destroy(&pool->sp_threads_starved); + + xa_destroy(&pool->sp_thread_xa); } kfree(serv->sv_pools); kfree(serv); @@ -672,7 
+674,11 @@ EXPORT_SYMBOL_GPL(svc_rqst_alloc); static struct svc_rqst * svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) { + static const struct xa_limit limit = { + .max = UINT_MAX, + }; struct svc_rqst *rqstp; + int ret; rqstp = svc_rqst_alloc(serv, pool, node); if (!rqstp) @@ -683,11 +689,21 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) serv->sv_nrthreads += 1; spin_unlock_bh(&serv->sv_lock); - spin_lock_bh(&pool->sp_lock); + xa_lock(&pool->sp_thread_xa); + ret = __xa_alloc(&pool->sp_thread_xa, &rqstp->rq_thread_id, rqstp, + limit, GFP_KERNEL); + if (ret) { + xa_unlock(&pool->sp_thread_xa); + goto out_free; + } pool->sp_nrthreads++; - list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads); - spin_unlock_bh(&pool->sp_lock); + xa_unlock(&pool->sp_thread_xa); + trace_svc_pool_thread_init(serv, pool, rqstp); return rqstp; + +out_free: + svc_rqst_free(rqstp); + return ERR_PTR(ret); } /** @@ -704,19 +720,17 @@ struct svc_rqst *svc_pool_wake_idle_thread(struct svc_serv *serv, struct svc_pool *pool) { struct svc_rqst *rqstp; + unsigned long index; - rcu_read_lock(); - list_for_each_entry_rcu(rqstp, &pool->sp_all_threads, rq_all) { + xa_for_each(&pool->sp_thread_xa, index, rqstp) { if (test_and_set_bit(RQ_BUSY, &rqstp->rq_flags)) continue; - rcu_read_unlock(); WRITE_ONCE(rqstp->rq_qtime, ktime_get()); wake_up_process(rqstp->rq_task); percpu_counter_inc(&pool->sp_threads_woken); return rqstp; } - rcu_read_unlock(); trace_svc_pool_starved(serv, pool); percpu_counter_inc(&pool->sp_threads_starved); @@ -732,32 +746,31 @@ svc_pool_next(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) static struct task_struct * svc_pool_victim(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) { - unsigned int i; struct task_struct *task = NULL; + struct svc_rqst *rqstp; + unsigned long zero = 0; + unsigned int i; if (pool != NULL) { - spin_lock_bh(&pool->sp_lock); + xa_lock(&pool->sp_thread_xa); } else { 
for (i = 0; i < serv->sv_nrpools; i++) { pool = &serv->sv_pools[--(*state) % serv->sv_nrpools]; - spin_lock_bh(&pool->sp_lock); - if (!list_empty(&pool->sp_all_threads)) + xa_lock(&pool->sp_thread_xa); + if (!xa_empty(&pool->sp_thread_xa)) goto found_pool; - spin_unlock_bh(&pool->sp_lock); + xa_unlock(&pool->sp_thread_xa); } return NULL; } found_pool: - if (!list_empty(&pool->sp_all_threads)) { - struct svc_rqst *rqstp; - - rqstp = list_entry(pool->sp_all_threads.next, struct svc_rqst, rq_all); - set_bit(RQ_VICTIM, &rqstp->rq_flags); - list_del_rcu(&rqstp->rq_all); + rqstp = xa_find(&pool->sp_thread_xa, &zero, U32_MAX, XA_PRESENT); + if (rqstp) { + __xa_erase(&pool->sp_thread_xa, rqstp->rq_thread_id); task = rqstp->rq_task; } - spin_unlock_bh(&pool->sp_lock); + xa_unlock(&pool->sp_thread_xa); return task; } @@ -839,9 +852,9 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) if (pool == NULL) { nrservs -= serv->sv_nrthreads; } else { - spin_lock_bh(&pool->sp_lock); + xa_lock(&pool->sp_thread_xa); nrservs -= pool->sp_nrthreads; - spin_unlock_bh(&pool->sp_lock); + xa_unlock(&pool->sp_thread_xa); } if (nrservs > 0) @@ -928,11 +941,11 @@ svc_exit_thread(struct svc_rqst *rqstp) struct svc_serv *serv = rqstp->rq_server; struct svc_pool *pool = rqstp->rq_pool; - spin_lock_bh(&pool->sp_lock); + xa_lock(&pool->sp_thread_xa); pool->sp_nrthreads--; - if (!test_and_set_bit(RQ_VICTIM, &rqstp->rq_flags)) - list_del_rcu(&rqstp->rq_all); - spin_unlock_bh(&pool->sp_lock); + __xa_erase(&pool->sp_thread_xa, rqstp->rq_thread_id); + xa_unlock(&pool->sp_thread_xa); + trace_svc_pool_thread_exit(serv, pool, rqstp); spin_lock_bh(&serv->sv_lock); serv->sv_nrthreads -= 1; diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 7d5aed4d1766..77fc20b2181d 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -46,7 +46,7 @@ static LIST_HEAD(svc_xprt_class_list); /* SMP locking strategy: * - * svc_pool->sp_lock protects most of the fields of 
that pool. + * svc_pool->sp_lock protects sp_sockets. * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt. * when both need to be taken (rare), svc_serv->sv_lock is first. * The "service mutex" protects svc_serv->sv_nrthread.