	Hello,

On Wed, 18 Aug 2021, Dust Li wrote:

> fix #31562403
>
> estimation_timer will iterate the est_list to do estimation
> for each ipvs stats entry. When there are lots of services, the
> list can be very large.
> We observed estimation_timer() run for more than 200ms on
> a machine with 104 CPUs and 50K services.
>
> yunhong-cgl jiang reported the same phenomenon before:
> https://www.spinics.net/lists/lvs-devel/msg05426.html
>
> In some cases (for example, a large K8S cluster with many ipvs
> services), ipvs estimation may not be needed. So add a sysctl knob
> to allow users to disable this completely.
>
> Default is: 1 (enabled)

	I guess this patch should target the net-next tree.

> Signed-off-by: Dust Li <dust.li@xxxxxxxxxxxxxxxxx>
> ---
>  Documentation/networking/ipvs-sysctl.rst | 17 ++++++
>  include/net/ip_vs.h                      |  3 +
>  net/netfilter/ipvs/ip_vs_est.c           | 73 ++++++++++++++++++++++++
>  3 files changed, 93 insertions(+)
>
> diff --git a/Documentation/networking/ipvs-sysctl.rst b/Documentation/networking/ipvs-sysctl.rst
> index 2afccc63856e..e20f7a27fc85 100644
> --- a/Documentation/networking/ipvs-sysctl.rst
> +++ b/Documentation/networking/ipvs-sysctl.rst
> @@ -300,3 +300,20 @@ sync_version - INTEGER
>
>  	Kernels with this sync_version entry are able to receive messages
>  	of both version 1 and version 2 of the synchronisation protocol.
> +
> +run_estimation - BOOLEAN
> +	0 - disabled
> +	not 0 - enabled (default)
> +
> +	If disabled, the estimation will be stopped, and you can't see
> +	any update on speed estimation data.
> +
> +	For example, the
> +	'Conns/s Pkts/s Pkts/s Bytes/s Bytes/s'
> +	data in /proc/net/ip_vs_stats will always be zero.
> +	Note, this only affects the speed estimation; the total data
> +	will still be updated.
> +
> +	You can always re-enable estimation by setting this value to 1.
> +	But be careful, the first estimation after re-enabling is not
> +	accurate.
> diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
> index d609e957a3ec..6cbb3a08b176 100644
> --- a/include/net/ip_vs.h
> +++ b/include/net/ip_vs.h
> @@ -941,6 +941,9 @@ struct netns_ipvs {
>  	struct ctl_table_header	*lblcr_ctl_header;
>  	struct ctl_table	*lblcr_ctl_table;
>  	/* ip_vs_est */
> +	int			sysctl_run_estimation;

	We should keep such sysctl vars in a CONFIG_SYSCTL block, and we
have corresponding inline functions for the two cases depending on
CONFIG_SYSCTL. Still, many of the sysctl_* fields are not moved into
such an ifdef. But you can take sysctl_pmtu_disc as an example:

# grep -r sysctl_pmtu_disc include/net/ip_vs.h net/netfilter/ipvs/

	Add the var in ip_vs_ctl.c as the other vars and implement it
just like sysctl_pmtu_disc.
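	For reference, an untested sketch of that accessor pair for
include/net/ip_vs.h, modeled on sysctl_pmtu_disc (the function name
simply follows the field from your patch):

#ifdef CONFIG_SYSCTL
static inline int sysctl_run_estimation(struct netns_ipvs *ipvs)
{
	/* Read the per-netns value that the sysctl handler updates */
	return ipvs->sysctl_run_estimation;
}
#else
static inline int sysctl_run_estimation(struct netns_ipvs *ipvs)
{
	/* Without CONFIG_SYSCTL the estimator is always enabled */
	return 1;
}
#endif

	Returning 1 in the !CONFIG_SYSCTL case keeps the estimator
enabled when sysctls are compiled out, as the other sysctl_* helpers do.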
> +	struct ctl_table_header	*est_ctl_header;
> +	struct ctl_table	*est_ctl_table;

	As a result, we will not need a specific ctl table.

>  	struct list_head	est_list;	/* estimator list */
>  	spinlock_t		est_lock;
>  	struct timer_list	est_timer;	/* Estimation timer */
> diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
> index 05b8112ffb37..6709796b8621 100644
> --- a/net/netfilter/ipvs/ip_vs_est.c
> +++ b/net/netfilter/ipvs/ip_vs_est.c
> @@ -100,6 +100,9 @@ static void estimation_timer(struct timer_list *t)
>  	u64 rate;
>  	struct netns_ipvs *ipvs = from_timer(ipvs, t, est_timer);
>
> +	if (!ipvs->sysctl_run_estimation)
> +		goto skip;

	Some day this var will not be present if !CONFIG_SYSCTL, so we
should call the new inline function:

	if (!sysctl_run_estimation(ipvs))
		goto skip;

> +
>  	spin_lock(&ipvs->est_lock);
>  	list_for_each_entry(e, &ipvs->est_list, list) {
>  		s = container_of(e, struct ip_vs_stats, est);
> @@ -131,6 +134,8 @@ static void estimation_timer(struct timer_list *t)
>  		spin_unlock(&s->lock);
>  	}
>  	spin_unlock(&ipvs->est_lock);
> +
> +skip:
>  	mod_timer(&ipvs->est_timer, jiffies + 2*HZ);
>  }
>
> @@ -184,10 +189,77 @@ void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats)
>  	dst->outbps = (e->outbps + 0xF) >> 5;
>  }
>
> +#ifdef CONFIG_SYSCTL
> +/* IPVS ESTIMATION sysctl table */
> +static struct ctl_table vs_vars_table[] = {
> +	{
> +		.procname	= "run_estimation",
> +		.data		= NULL,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec,
> +	},
> +	{ }
> +};
> +
> +static int ip_vs_est_sysctl_init(struct netns_ipvs *ipvs)
> +{
> +	struct net *net = ipvs->net;
> +
> +	if (!net_eq(net, &init_net)) {
> +		ipvs->est_ctl_table = kmemdup(vs_vars_table,
> +					      sizeof(vs_vars_table),
> +					      GFP_KERNEL);
> +		if (!ipvs->est_ctl_table)
> +			return -ENOMEM;
> +
> +		/* Don't export sysctls to unprivileged users */
> +		if (net->user_ns != &init_user_ns)
> +			ipvs->est_ctl_table[0].procname = NULL;
> +
> +	} else {
> +		ipvs->est_ctl_table = vs_vars_table;
> +	}
> +
> +	ipvs->sysctl_run_estimation = 1;
> +	ipvs->est_ctl_table[0].data = &ipvs->sysctl_run_estimation;
> +
> +	ipvs->est_ctl_header =
> +		register_net_sysctl(net, "net/ipv4/vs", ipvs->est_ctl_table);
> +	if (!ipvs->est_ctl_header) {
> +		if (!net_eq(net, &init_net))
> +			kfree(ipvs->est_ctl_table);
> +		return -ENOMEM;
> +	}
> +
> +	return 0;
> +}
> +
> +static void ip_vs_est_sysctl_cleanup(struct netns_ipvs *ipvs)
> +{
> +	unregister_net_sysctl_table(ipvs->est_ctl_header);
> +
> +	if (!net_eq(ipvs->net, &init_net))
> +		kfree(ipvs->est_ctl_table);
> +}
> +
> +#else
> +
> +static int ip_vs_est_sysctl_init(struct netns_ipvs *ipvs)
> +{
> +	ipvs->sysctl_run_estimation = 1;
> +	return 0;
> +}
> +
> +static void ip_vs_est_sysctl_cleanup(struct netns_ipvs *ipvs) { }
> +
> +#endif

	Drop this ctl table. Just reset sysctl_run_estimation in
ip_vs_control_net_init_sysctl() (untested sketch at the end of this
mail).

> +
>  int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs)
>  {
>  	INIT_LIST_HEAD(&ipvs->est_list);
>  	spin_lock_init(&ipvs->est_lock);
> +	ip_vs_est_sysctl_init(ipvs);
>  	timer_setup(&ipvs->est_timer, estimation_timer, 0);
>  	mod_timer(&ipvs->est_timer, jiffies + 2 * HZ);
>  	return 0;
> @@ -196,4 +268,5 @@ int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs)
>  void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs)
>  {
>  	del_timer_sync(&ipvs->est_timer);
> +	ip_vs_est_sysctl_cleanup(ipvs);
>  }
> --
> 2.19.1.3.ge56e4f7

Regards

--
Julian Anastasov <ja@xxxxxx>
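	PS: untested sketch of the suggested ip_vs_ctl.c changes; the
entry name follows your patch, while vs_vars[] and
ip_vs_control_net_init_sysctl() are the existing table and per-netns
init function there:

	/* New entry appended to the existing vs_vars[] ctl table */
	{
		.procname	= "run_estimation",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},

	/* In ip_vs_control_net_init_sysctl(), next to the other
	 * defaults; tbl is the kmemdup'ed copy of vs_vars and idx
	 * walks its entries:
	 */
	ipvs->sysctl_run_estimation = 1;
	tbl[idx++].data = &ipvs->sysctl_run_estimation;

	This way the knob is registered, duplicated per netns and freed
together with the rest of vs_vars, so the extra est_ctl_header and
est_ctl_table fields are not needed.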