On Mon, 2024-07-15 at 17:14 +1000, NeilBrown wrote: > A future patch will allow the number of threads in each nfsd pool to > vary dynamically. > The lower bound will be the number explicit requested via > /proc/fs/nfsd/threads or /proc/fs/nfsd/pool_threads > > The upper bound can be set in each net-namespace by writing > /proc/fs/nfsd/max_threads. This upper bound applies across all pools, > there is no per-pool upper limit. > > If no upper bound is set, then one is calculated. A global upper limit > is chosen based on amount of memory. This limit only affects dynamic > changes. Static configuration can always over-ride it. > > We track how many threads are configured in each net namespace, with the > max or the min. We also track how many net namespaces have nfsd > configured with only a min, not a max. > > The difference between the calculated max and the total allocation is > available to be shared among those namespaces which don't have a maximum > configured. Within a namespace, the available share is distributed > equally across all pools. > > In the common case there is one namespace and one pool. A small number > of threads are configured as a minimum and no maximum is set. In this > case the effective maximum will be directly based on total memory. > Approximately 8 per gigabyte. > Some of this may come across as bikeshedding, but I'd probably prefer that this work a bit differently: 1/ I don't think we should enable this universally -- at least not initially. What I'd prefer to see is a new pool_mode for the dynamic threadpools (maybe call it "dynamic"). That gives us a clear opt-in mechanism. Later once we're convinced it's safe, we can make "dynamic" the default instead of "global". 2/ Rather than specifying a max_threads value separately, why not allow the old threads/pool_threads interface to set the max and just have a reasonable minimum setting (like the current default of 8). Since we're growing the threadpool dynamically, I don't see why we need to have a real configurable minimum. 3/ the dynamic pool-mode should probably be layered on top of the pernode pool mode. IOW, in a NUMA configuration, we should split the threads across NUMA nodes. > Signed-off-by: NeilBrown <neilb@xxxxxxx> > --- > fs/nfsd/netns.h | 6 +++++ > fs/nfsd/nfsctl.c | 45 +++++++++++++++++++++++++++++++++++ > fs/nfsd/nfsd.h | 4 ++++ > fs/nfsd/nfssvc.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++ > fs/nfsd/trace.h | 19 +++++++++++++++ > 5 files changed, 135 insertions(+) > > diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h > index 0d2ac15a5003..329484696a42 100644 > --- a/fs/nfsd/netns.h > +++ b/fs/nfsd/netns.h > @@ -133,6 +133,12 @@ struct nfsd_net { > */ > unsigned int max_connections; > > + /* > + * Maximum number of threads to auto-adjust up to. If 0 then a > + * share of nfsd_max_threads will be used. > + */ > + unsigned int max_threads; > + > u32 clientid_base; > u32 clientid_counter; > u32 clverifier_counter; > diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c > index d85b6d1fa31f..37e9936567e9 100644 > --- a/fs/nfsd/nfsctl.c > +++ b/fs/nfsd/nfsctl.c > @@ -48,6 +48,7 @@ enum { > NFSD_Ports, > NFSD_MaxBlkSize, > NFSD_MaxConnections, > + NFSD_MaxThreads, > NFSD_Filecache, > NFSD_Leasetime, > NFSD_Gracetime, > @@ -68,6 +69,7 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size); > static ssize_t write_ports(struct file *file, char *buf, size_t size); > static ssize_t write_maxblksize(struct file *file, char *buf, size_t size); > static ssize_t write_maxconn(struct file *file, char *buf, size_t size); > +static ssize_t write_maxthreads(struct file *file, char *buf, size_t size); > #ifdef CONFIG_NFSD_V4 > static ssize_t write_leasetime(struct file *file, char *buf, size_t size); > static ssize_t write_gracetime(struct file *file, char *buf, size_t size); > @@ -87,6 +89,7 @@ static ssize_t (*const write_op[])(struct file *, char *, size_t) = { > [NFSD_Ports] = write_ports, > [NFSD_MaxBlkSize] = write_maxblksize, > [NFSD_MaxConnections] = write_maxconn, > + [NFSD_MaxThreads] = write_maxthreads, > #ifdef CONFIG_NFSD_V4 > [NFSD_Leasetime] = write_leasetime, > [NFSD_Gracetime] = write_gracetime, > @@ -939,6 +942,47 @@ static ssize_t write_maxconn(struct file *file, char *buf, size_t size) > return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%u\n", maxconn); > } > > +/* > + * write_maxthreads - Set or report the current max number threads > + * > + * Input: > + * buf: ignored > + * size: zero > + * OR > + * > + * Input: > + * buf: C string containing an unsigned > + * integer value representing the new > + * max number of threads > + * size: non-zero length of C string in @buf > + * Output: > + * On success: passed-in buffer filled with '\n'-terminated C string > + * containing numeric value of max_threads setting > + * for this net namespace; > + * return code is the size in bytes of the string > + * On error: return code is zero or a negative errno value > + */ > +static ssize_t write_maxthreads(struct file *file, char *buf, size_t size) > +{ > + char *mesg = buf; > + struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); > + unsigned int maxthreads = nn->max_threads; > + > + if (size > 0) { > + int rv = get_uint(&mesg, &maxthreads); > + > + if (rv) > + return rv; > + trace_nfsd_ctl_maxthreads(netns(file), maxthreads); > + mutex_lock(&nfsd_mutex); > + nn->max_threads = maxthreads; > + nfsd_update_nets(); > + mutex_unlock(&nfsd_mutex); > + } > + > + return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%u\n", maxthreads); > +} > + > #ifdef CONFIG_NFSD_V4 > static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, > time64_t *time, struct nfsd_net *nn) > @@ -1372,6 +1416,7 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc) > [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, > [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, > [NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO}, > + [NFSD_MaxThreads] = {"max_threads", &transaction_ops, S_IWUSR|S_IRUGO}, > [NFSD_Filecache] = {"filecache", &nfsd_file_cache_stats_fops, S_IRUGO}, > #ifdef CONFIG_NFSD_V4 > [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, > diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h > index e4c643255dc9..6874c2de670b 100644 > --- a/fs/nfsd/nfsd.h > +++ b/fs/nfsd/nfsd.h > @@ -156,6 +156,10 @@ int nfsd_create_serv(struct net *net); > void nfsd_destroy_serv(struct net *net); > > extern int nfsd_max_blksize; > +void nfsd_update_nets(void); > +extern unsigned int nfsd_max_threads; > +extern unsigned long nfsd_net_used; > +extern unsigned int nfsd_net_cnt; > > static inline int nfsd_v4client(struct svc_rqst *rq) > { > diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c > index b005b2e2e6ad..75d78c17756f 100644 > --- a/fs/nfsd/nfssvc.c > +++ b/fs/nfsd/nfssvc.c > @@ -80,6 +80,21 @@ DEFINE_SPINLOCK(nfsd_drc_lock); > unsigned long nfsd_drc_max_mem; > unsigned long nfsd_drc_slotsize_sum; > > +/* > + * nfsd_max_threads is auto-configured based on available ram. > + * Each network namespace can configure a minimum number of threads > + * and optionally a maximum. > + * nfsd_net_used is the number of the max or min from each net namespace. > + * nfsd_new_cnt is the number of net namespaces with a configured minimum > + * but no configured maximum. > + * When nfsd_max_threads exceeds nfsd_net_used, the different is divided > + * by nfsd_net_cnt and this number gives the excess above the configured minimum > + * for all net namespaces without a configured maximum. > + */ > +unsigned int nfsd_max_threads; > +unsigned long nfsd_net_used; > +unsigned int nfsd_net_cnt; > + > #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) > static const struct svc_version *nfsd_acl_version[] = { > # if defined(CONFIG_NFSD_V2_ACL) > @@ -130,6 +145,47 @@ struct svc_program nfsd_program = { > .pg_rpcbind_set = nfsd_rpcbind_set, > }; > > +void nfsd_update_nets(void) > +{ > + struct net *net; > + > + if (nfsd_max_threads == 0) { > + nfsd_max_threads = (nr_free_buffer_pages() >> 7) / > + (NFSSVC_MAXBLKSIZE >> PAGE_SHIFT); > + } > + nfsd_net_used = 0; > + nfsd_net_cnt = 0; > + down_read(&net_rwsem); > + for_each_net(net) { > + struct nfsd_net *nn = net_generic(net, nfsd_net_id); > + > + if (!nn->nfsd_serv) > + continue; > + if (nn->max_threads > 0) { > + nfsd_net_used += nn->max_threads; > + } else { > + nfsd_net_used += nn->nfsd_serv->sv_nrthreads; > + nfsd_net_cnt += 1; > + } > + } > + up_read(&net_rwsem); > +} > + > +static inline int nfsd_max_pool_threads(struct svc_pool *p, struct nfsd_net *nn) > +{ > + int svthreads = nn->nfsd_serv->sv_nrthreads; > + > + if (nn->max_threads > 0) > + return nn->max_threads; > + if (nfsd_net_cnt == 0 || svthreads == 0) > + return 0; > + if (nfsd_max_threads < nfsd_net_cnt) > + return p->sp_nrthreads; > + /* Share nfsd_max_threads among all net, then among pools in this net. */ > + return p->sp_nrthreads + > + nfsd_max_threads / nfsd_net_cnt * p->sp_nrthreads / svthreads; > +} > + > bool nfsd_support_version(int vers) > { > if (vers >= NFSD_MINVERS && vers <= NFSD_MAXVERS) > @@ -474,6 +530,7 @@ void nfsd_destroy_serv(struct net *net) > spin_lock(&nfsd_notifier_lock); > nn->nfsd_serv = NULL; > spin_unlock(&nfsd_notifier_lock); > + nfsd_update_nets(); > > /* check if the notifier still has clients */ > if (atomic_dec_return(&nfsd_notifier_refcount) == 0) { > @@ -614,6 +671,8 @@ int nfsd_create_serv(struct net *net) > nn->nfsd_serv = serv; > spin_unlock(&nfsd_notifier_lock); > > + nfsd_update_nets(); > + > set_max_drc(); > /* check if the notifier is already set */ > if (atomic_inc_return(&nfsd_notifier_refcount) == 1) { > @@ -720,6 +779,7 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) > goto out; > } > out: > + nfsd_update_nets(); > return err; > } > > @@ -759,6 +819,7 @@ nfsd_svc(int n, int *nthreads, struct net *net, const struct cred *cred, const c > error = nfsd_set_nrthreads(n, nthreads, net); > if (error) > goto out_put; > + nfsd_update_nets(); > error = serv->sv_nrthreads; > out_put: > if (serv->sv_nrthreads == 0) > diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h > index 77bbd23aa150..92b888e178e8 100644 > --- a/fs/nfsd/trace.h > +++ b/fs/nfsd/trace.h > @@ -2054,6 +2054,25 @@ TRACE_EVENT(nfsd_ctl_maxconn, > ) > ); > > +TRACE_EVENT(nfsd_ctl_maxthreads, > + TP_PROTO( > + const struct net *net, > + int maxthreads > + ), > + TP_ARGS(net, maxthreads), > + TP_STRUCT__entry( > + __field(unsigned int, netns_ino) > + __field(int, maxthreads) > + ), > + TP_fast_assign( > + __entry->netns_ino = net->ns.inum; > + __entry->maxthreads = maxthreads > + ), > + TP_printk("maxthreads=%d", > + __entry->maxthreads > + ) > +); > + > TRACE_EVENT(nfsd_ctl_time, > TP_PROTO( > const struct net *net, -- Jeff Layton <jlayton@xxxxxxxxxx>