On Sat, Aug 27, 2022 at 08:41:52PM +0300, Julian Anastasov wrote:
>Estimating all entries in single list in timer context
>causes large latency with multiple rules.
>
>Spread the estimator structures in multiple chains and
>use kthread(s) for the estimation. Every chain is
>processed under RCU lock. If RCU preemption is not
>enabled, we add code for rescheduling by delaying
>the removal of the currently estimated entry.
>
>We also add delayed work est_reload_work that will
>make sure the kthread tasks are properly started.
>
>Signed-off-by: Julian Anastasov <ja@xxxxxx>
>---
> include/net/ip_vs.h | 84 ++++++-
> net/netfilter/ipvs/ip_vs_ctl.c | 55 ++++-
> net/netfilter/ipvs/ip_vs_est.c | 403 +++++++++++++++++++++++++++------
> 3 files changed, 468 insertions(+), 74 deletions(-)
>
>diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
>index bd8ae137e43b..8171d845520c 100644
>--- a/include/net/ip_vs.h
>+++ b/include/net/ip_vs.h
>@@ -363,9 +363,14 @@ struct ip_vs_cpu_stats {
> struct u64_stats_sync syncp;
> };
>
>+/* resched during estimation, the defines should match cond_resched_rcu */
>+#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
>+#define IPVS_EST_RESCHED_RCU 1
>+#endif
>+
> /* IPVS statistics objects */
> struct ip_vs_estimator {
>- struct list_head list;
>+ struct hlist_node list;
>
> u64 last_inbytes;
> u64 last_outbytes;
>@@ -378,6 +383,31 @@ struct ip_vs_estimator {
> u64 outpps;
> u64 inbps;
> u64 outbps;
>+
>+#ifdef IPVS_EST_RESCHED_RCU
>+ refcount_t refcnt;
>+#endif
>+ u32 ktid:16, /* kthread ID */
>+ ktrow:16; /* row ID for kthread */
>+};
>+
>+/* Spread estimator states in multiple chains */
>+#define IPVS_EST_NCHAINS 50
>+#define IPVS_EST_TICK ((2 * HZ) / IPVS_EST_NCHAINS)
>+
>+/* Context for estimation kthread */
>+struct ip_vs_est_kt_data {
>+ struct netns_ipvs *ipvs;
>+ struct task_struct *task; /* task if running */
>+ struct mutex mutex; /* held during resched */
>+ int id; /* ktid per netns */
>+ int est_count; /* attached ests to kthread */
>+ int est_max_count; /* max ests per kthread */
>+ int add_row; /* row for new ests */
>+ int est_row; /* estimated row */
>+ unsigned long est_timer; /* estimation timer (jiffies) */
>+ struct hlist_head chains[IPVS_EST_NCHAINS];
>+ int chain_len[IPVS_EST_NCHAINS];
> };
>
> /*
>@@ -948,9 +978,13 @@ struct netns_ipvs {
> struct ctl_table_header *lblcr_ctl_header;
> struct ctl_table *lblcr_ctl_table;
> /* ip_vs_est */
>- struct list_head est_list; /* estimator list */
>- spinlock_t est_lock;
>- struct timer_list est_timer; /* Estimation timer */
>+ struct delayed_work est_reload_work;/* Reload kthread tasks */
>+ struct mutex est_mutex; /* protect kthread tasks */
>+ struct ip_vs_est_kt_data **est_kt_arr; /* Array of kthread data ptrs */
>+ int est_kt_count; /* Allocated ptrs */
>+ int est_add_ktid; /* ktid where to add ests */
>+ atomic_t est_genid; /* kthreads reload genid */
>+ atomic_t est_genid_done; /* applied genid */
> /* ip_vs_sync */
> spinlock_t sync_lock;
> struct ipvs_master_sync_state *ms;
>@@ -1485,6 +1519,48 @@ void ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats);
> void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats);
> void ip_vs_zero_estimator(struct ip_vs_stats *stats);
> void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats);
>+void ip_vs_est_reload_start(struct netns_ipvs *ipvs, bool bump);
>+int ip_vs_est_kthread_start(struct netns_ipvs *ipvs,
>+ struct ip_vs_est_kt_data *kd);
>+void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd);
>+
>+extern struct mutex ip_vs_est_mutex;
>+
>+static inline void ip_vs_est_init_resched_rcu(struct ip_vs_estimator *e)
>+{
>+#ifdef IPVS_EST_RESCHED_RCU
>+ refcount_set(&e->refcnt, 1);
>+#endif
>+}
>+
>+static inline void ip_vs_est_cond_resched_rcu(struct ip_vs_est_kt_data *kd,
>+ struct ip_vs_estimator *e)
>+{
>+#ifdef IPVS_EST_RESCHED_RCU
>+ if (mutex_trylock(&kd->mutex)) {
>+ /* Block removal during reschedule */
>+ if (refcount_inc_not_zero(&e->refcnt)) {
>+ cond_resched_rcu();
>+ refcount_dec(&e->refcnt);
>+ }
>+ mutex_unlock(&kd->mutex);
>+ }
>+#endif
>+}
>+
>+static inline void ip_vs_est_wait_resched(struct netns_ipvs *ipvs,
>+ struct ip_vs_estimator *est)
>+{
>+#ifdef IPVS_EST_RESCHED_RCU
>+ /* Estimator kthread is rescheduling on deleted est? Wait it! */
>+ if (!refcount_dec_and_test(&est->refcnt)) {
>+ struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[est->ktid];
>+
>+ mutex_lock(&kd->mutex);
>+ mutex_unlock(&kd->mutex);

IIUC, this mutex_lock/unlock() pair is only used to wait for the ipvs-e
kthread to be scheduled back in if it was scheduled out inside
cond_resched_rcu(), and not to protect any data? If so, I am wondering
whether we could remove the mutex_trylock/unlock() in
ip_vs_est_cond_resched_rcu() and use some wait/wakeup mechanism instead,
because when I run perf on the 'ipvs-e' kthreads I see a lot of CPU
cycles spent in mutex_trylock.

Thanks
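
Just to illustrate the direction I mean, a rough and completely untested
sketch. It assumes struct ip_vs_estimator gains a hypothetical
"struct completion done" field next to refcnt (plus <linux/completion.h>);
none of this is in the patch as posted:

	static inline void ip_vs_est_init_resched_rcu(struct ip_vs_estimator *e)
	{
	#ifdef IPVS_EST_RESCHED_RCU
		refcount_set(&e->refcnt, 1);
		init_completion(&e->done);	/* hypothetical field */
	#endif
	}

	static inline void ip_vs_est_cond_resched_rcu(struct ip_vs_est_kt_data *kd,
						      struct ip_vs_estimator *e)
	{
	#ifdef IPVS_EST_RESCHED_RCU
		/* Pin the entry while we may sleep, no kd->mutex needed */
		if (refcount_inc_not_zero(&e->refcnt)) {
			cond_resched_rcu();
			/* Removal dropped its reference meanwhile? Wake the waiter */
			if (refcount_dec_and_test(&e->refcnt))
				complete(&e->done);
		}
	#endif
	}

	static inline void ip_vs_est_wait_resched(struct netns_ipvs *ipvs,
						  struct ip_vs_estimator *est)
	{
	#ifdef IPVS_EST_RESCHED_RCU
		/* Kthread still holds a reference on the removed est? Wait for it */
		if (!refcount_dec_and_test(&est->refcnt))
			wait_for_completion(&est->done);
	#endif
	}

That would keep the estimation loop down to the two refcount operations and
avoid touching kd->mutex there at all, at the cost of one completion per
estimator (and kd would become unused in ip_vs_est_cond_resched_rcu()).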
>+ }
>+#endif
>+}
>
> /* Various IPVS packet transmitters (from ip_vs_xmit.c) */
> int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
>diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
>index 44c79fd1779c..e9f61eba3b8e 100644
>--- a/net/netfilter/ipvs/ip_vs_ctl.c
>+++ b/net/netfilter/ipvs/ip_vs_ctl.c
>@@ -239,8 +239,49 @@ static void defense_work_handler(struct work_struct *work)
> queue_delayed_work(system_long_wq, &ipvs->defense_work,
> DEFENSE_TIMER_PERIOD);
> }
>+
> #endif
>
>+static void est_reload_work_handler(struct work_struct *work)
>+{
>+ struct netns_ipvs *ipvs =
>+ container_of(work, struct netns_ipvs, est_reload_work.work);
>+ int genid = atomic_read(&ipvs->est_genid);
>+ int genid_done = atomic_read(&ipvs->est_genid_done);
>+ unsigned long delay = HZ / 10; /* repeat startups after failure */
>+ bool repeat = false;
>+ int id;
>+
>+ mutex_lock(&ipvs->est_mutex);
>+ for (id = 0; id < ipvs->est_kt_count; id++) {
>+ struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id];
>+
>+ /* netns clean up started, abort delayed work */
>+ if (!ipvs->enable)
>+ goto unlock;
>+ if (!kd)
>+ continue;
>+ /* New config ? Stop kthread tasks */
>+ if (genid != genid_done)
>+ ip_vs_est_kthread_stop(kd);
>+ if (!kd->task && ip_vs_est_kthread_start(ipvs, kd) < 0)
>+ repeat = true;
>+ }
>+
>+ atomic_set(&ipvs->est_genid_done, genid);
>+
>+unlock:
>+ mutex_unlock(&ipvs->est_mutex);
>+
>+ if (!ipvs->enable)
>+ return;
>+ if (genid != atomic_read(&ipvs->est_genid))
>+ delay = 1;
>+ else if (!repeat)
>+ return;
>+ queue_delayed_work(system_long_wq, &ipvs->est_reload_work, delay);
>+}
>+
> int
> ip_vs_use_count_inc(void)
> {
>@@ -1421,8 +1462,15 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
> ip_vs_svc_hash(svc);
>
> *svc_p = svc;
>- /* Now there is a service - full throttle */
>- ipvs->enable = 1;
>+
>+ if (!ipvs->enable) {
>+ /* Now there is a service - full throttle */
>+ ipvs->enable = 1;
>+
>+ /* Start estimation for first time */
>+ ip_vs_est_reload_start(ipvs, true);
>+ }
>+
> return 0;
>
>
>@@ -4178,6 +4226,8 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
> atomic_set(&ipvs->nullsvc_counter, 0);
> atomic_set(&ipvs->conn_out_counter, 0);
>
>+ INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler);
>+
> /* procfs stats */
> ipvs->tot_stats = kzalloc(sizeof(*ipvs->tot_stats), GFP_KERNEL);
> if (!ipvs->tot_stats)
>@@ -4235,6 +4285,7 @@ void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs)
> {
> ip_vs_trash_cleanup(ipvs);
> ip_vs_control_net_cleanup_sysctl(ipvs);
>+ cancel_delayed_work_sync(&ipvs->est_reload_work);
> #ifdef CONFIG_PROC_FS
> remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
> remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
>diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
>index 9a1a7af6a186..b2dd6f1c284a 100644
>--- a/net/netfilter/ipvs/ip_vs_est.c
>+++ b/net/netfilter/ipvs/ip_vs_est.c
>@@ -30,9 +30,6 @@
> long interval, it is easy to implement a user level daemon which
> periodically reads those statistical counters and measure rate.
>
>- Currently, the measurement is activated by slow timer handler. Hope
>- this measurement will not introduce too much load.
>-
> We measure rate during the last 8 seconds every 2 seconds:
>
> avgrate = avgrate*(1-W) + rate*W
>
>@@ -47,68 +44,75 @@
> to 32-bit values for conns, packets, bps, cps and pps.
>
> * A lot of code is taken from net/core/gen_estimator.c
>- */
>-
>
>-/*
>- * Make a summary from each cpu
>+ KEY POINTS:
>+ - cpustats counters are updated per-cpu in SoftIRQ context with BH disabled
>+ - kthreads read the cpustats to update the estimators (svcs, dests, total)
>+ - the states of estimators can be read (get stats) or modified (zero stats)
>+ from processes
>+
>+ KTHREADS:
>+ - kthread contexts are created and attached to array
>+ - the kthread tasks are created when first service is added, before that
>+ the total stats are not estimated
>+ - the kthread context holds lists with estimators (chains) which are
>+ processed every 2 seconds
>+ - as estimators can be added dynamically and in bursts, we try to spread
>+ them to multiple chains which are estimated at different time
> */
>-static void ip_vs_read_cpu_stats(struct ip_vs_kstats *sum,
>- struct ip_vs_cpu_stats __percpu *stats)
>-{
>- int i;
>- bool add = false;
>
>- for_each_possible_cpu(i) {
>- struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i);
>- unsigned int start;
>- u64 conns, inpkts, outpkts, inbytes, outbytes;
>-
>- if (add) {
>- do {
>- start = u64_stats_fetch_begin(&s->syncp);
>- conns = s->cnt.conns;
>- inpkts = s->cnt.inpkts;
>- outpkts = s->cnt.outpkts;
>- inbytes = s->cnt.inbytes;
>- outbytes = s->cnt.outbytes;
>- } while (u64_stats_fetch_retry(&s->syncp, start));
>- sum->conns += conns;
>- sum->inpkts += inpkts;
>- sum->outpkts += outpkts;
>- sum->inbytes += inbytes;
>- sum->outbytes += outbytes;
>- } else {
>- add = true;
>- do {
>- start = u64_stats_fetch_begin(&s->syncp);
>- sum->conns = s->cnt.conns;
>- sum->inpkts = s->cnt.inpkts;
>- sum->outpkts = s->cnt.outpkts;
>- sum->inbytes = s->cnt.inbytes;
>- sum->outbytes = s->cnt.outbytes;
>- } while (u64_stats_fetch_retry(&s->syncp, start));
>- }
>- }
>-}
>+/* Optimal chain length used to spread bursts of newly added ests */
>+#define IPVS_EST_BURST_LEN BIT(6)
>+/* Max number of ests per kthread (recommended) */
>+#define IPVS_EST_MAX_COUNT (32 * 1024)
>
>+static struct lock_class_key __ipvs_est_key;
>
>-static void estimation_timer(struct timer_list *t)
>+static void ip_vs_estimation_chain(struct ip_vs_est_kt_data *kd, int row)
> {
>+ struct hlist_head *chain = &kd->chains[row];
> struct ip_vs_estimator *e;
>+ struct ip_vs_cpu_stats *c;
> struct ip_vs_stats *s;
> u64 rate;
>- struct netns_ipvs *ipvs = from_timer(ipvs, t, est_timer);
>
>- if (!sysctl_run_estimation(ipvs))
>- goto skip;
>+ rcu_read_lock();
>+ hlist_for_each_entry_rcu(e, chain, list) {
>+ u64 conns, inpkts, outpkts, inbytes, outbytes;
>+ u64 kconns = 0, kinpkts = 0, koutpkts = 0;
>+ u64 kinbytes = 0, koutbytes = 0;
>+ unsigned int start;
>+ int i;
>+
>+ if (kthread_should_stop())
>+ break;
>+ ip_vs_est_cond_resched_rcu(kd, e);
>
>- spin_lock(&ipvs->est_lock);
>- list_for_each_entry(e, &ipvs->est_list, list) {
> s = container_of(e, struct ip_vs_stats, est);
>+ for_each_possible_cpu(i) {
>+ c = per_cpu_ptr(s->cpustats, i);
>+ do {
>+ start = u64_stats_fetch_begin(&c->syncp);
>+ conns = c->cnt.conns;
>+ inpkts = c->cnt.inpkts;
>+ outpkts = c->cnt.outpkts;
>+ inbytes = c->cnt.inbytes;
>+ outbytes = c->cnt.outbytes;
>+ } while (u64_stats_fetch_retry(&c->syncp, start));
>+ kconns += conns;
>+ kinpkts += inpkts;
>+ koutpkts += outpkts;
>+ kinbytes += inbytes;
>+ koutbytes += outbytes;
>+ }
>+
>+ spin_lock_bh(&s->lock);
>
>- spin_lock(&s->lock);
>- ip_vs_read_cpu_stats(&s->kstats, s->cpustats);
>+ s->kstats.conns = kconns;
>+ s->kstats.inpkts = kinpkts;
>+ s->kstats.outpkts = koutpkts;
>+ s->kstats.inbytes = kinbytes;
>+ s->kstats.outbytes = koutbytes;
>
> /* scaled by 2^10, but divided 2 seconds */
> rate = (s->kstats.conns - e->last_conns) << 9;
>@@ -131,32 +135,288 @@ static void estimation_timer(struct timer_list *t)
> rate = (s->kstats.outbytes - e->last_outbytes) << 4;
> e->last_outbytes = s->kstats.outbytes;
> e->outbps += ((s64)rate - (s64)e->outbps) >> 2;
>- spin_unlock(&s->lock);
>+ spin_unlock_bh(&s->lock);
>+ }
>+ rcu_read_unlock();
>+}
>+
>+static int ip_vs_estimation_kthread(void *data)
>+{
>+ struct ip_vs_est_kt_data *kd = data;
>+ struct netns_ipvs *ipvs = kd->ipvs;
>+ int row = kd->est_row;
>+ unsigned long now;
>+ long gap;
>+
>+ while (1) {
>+ set_current_state(TASK_IDLE);
>+ if (kthread_should_stop())
>+ break;
>+
>+ /* before estimation, check if we should sleep */
>+ now = READ_ONCE(jiffies);
>+ gap = kd->est_timer - now;
>+ if (gap > 0) {
>+ if (gap > IPVS_EST_TICK) {
>+ kd->est_timer = now - IPVS_EST_TICK;
>+ gap = IPVS_EST_TICK;
>+ }
>+ schedule_timeout(gap);
>+ } else {
>+ __set_current_state(TASK_RUNNING);
>+ if (gap < -8 * IPVS_EST_TICK)
>+ kd->est_timer = now;
>+ }
>+
>+ if (sysctl_run_estimation(ipvs) &&
>+ !hlist_empty(&kd->chains[row]))
>+ ip_vs_estimation_chain(kd, row);
>+
>+ row++;
>+ if (row >= IPVS_EST_NCHAINS)
>+ row = 0;
>+ kd->est_row = row;
>+ /* add_row best to point after the just estimated row */
>+ WRITE_ONCE(kd->add_row, row);
>+ kd->est_timer += IPVS_EST_TICK;
>+ }
>+ __set_current_state(TASK_RUNNING);
>+
>+ return 0;
>+}
>+
>+/* Stop (bump=true)/start kthread tasks */
>+void ip_vs_est_reload_start(struct netns_ipvs *ipvs, bool bump)
>+{
>+ /* Ignore reloads before first service is added */
>+ if (!ipvs->enable)
>+ return;
>+ /* Bump the kthread configuration genid */
>+ if (bump)
>+ atomic_inc(&ipvs->est_genid);
>+ queue_delayed_work(system_long_wq, &ipvs->est_reload_work,
>+ bump ? 0 : 1);
>+}
>+
>+/* Start kthread task with current configuration */
>+int ip_vs_est_kthread_start(struct netns_ipvs *ipvs,
>+ struct ip_vs_est_kt_data *kd)
>+{
>+ unsigned long now;
>+ int ret = 0;
>+ long gap;
>+
>+ lockdep_assert_held(&ipvs->est_mutex);
>+
>+ if (kd->task)
>+ goto out;
>+ now = READ_ONCE(jiffies);
>+ gap = kd->est_timer - now;
>+ /* Sync est_timer if task is starting later */
>+ if (abs(gap) > 4 * IPVS_EST_TICK)
>+ kd->est_timer = now;
>+ kd->task = kthread_create(ip_vs_estimation_kthread, kd, "ipvs-e:%d:%d",
>+ ipvs->gen, kd->id);
>+ if (IS_ERR(kd->task)) {
>+ ret = PTR_ERR(kd->task);
>+ kd->task = NULL;
>+ goto out;
> }
>- spin_unlock(&ipvs->est_lock);
>
>-skip:
>- mod_timer(&ipvs->est_timer, jiffies + 2*HZ);
>+ pr_info("starting estimator thread %d...\n", kd->id);
>+ wake_up_process(kd->task);
>+
>+out:
>+ return ret;
>+}
>+
>+void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd)
>+{
>+ if (kd->task) {
>+ pr_info("stopping estimator thread %d...\n", kd->id);
>+ kthread_stop(kd->task);
>+ kd->task = NULL;
>+ }
> }
>
>+/* Create and start estimation kthread in a free or new array slot */
>+static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
>+{
>+ struct ip_vs_est_kt_data *kd = NULL;
>+ int id = ipvs->est_kt_count;
>+ int err = -ENOMEM;
>+ void *arr = NULL;
>+ int i;
>+
>+ mutex_lock(&ipvs->est_mutex);
>+
>+ for (i = 0; i < id; i++) {
>+ if (!ipvs->est_kt_arr[i])
>+ break;
>+ }
>+ if (i >= id) {
>+ arr = krealloc_array(ipvs->est_kt_arr, id + 1,
>+ sizeof(struct ip_vs_est_kt_data *),
>+ GFP_KERNEL);
>+ if (!arr)
>+ goto out;
>+ ipvs->est_kt_arr = arr;
>+ } else {
>+ id = i;
>+ }
>+ kd = kmalloc(sizeof(*kd), GFP_KERNEL);
>+ if (!kd)
>+ goto out;
>+ kd->ipvs = ipvs;
>+ mutex_init(&kd->mutex);
>+ kd->id = id;
>+ kd->est_count = 0;
>+ kd->est_max_count = IPVS_EST_MAX_COUNT;
>+ kd->add_row = 0;
>+ kd->est_row = 0;
>+ kd->est_timer = jiffies;
>+ for (i = 0; i < ARRAY_SIZE(kd->chains); i++)
>+ INIT_HLIST_HEAD(&kd->chains[i]);
>+ memset(kd->chain_len, 0, sizeof(kd->chain_len));
>+ kd->task = NULL;
>+ /* Start kthread tasks only when services are present */
>+ if (ipvs->enable) {
>+ /* On failure, try to start the task again later */
>+ if (ip_vs_est_kthread_start(ipvs, kd) < 0)
>+ ip_vs_est_reload_start(ipvs, false);
>+ }
>+
>+ if (arr)
>+ ipvs->est_kt_count++;
>+ ipvs->est_kt_arr[id] = kd;
>+ /* Use most recent kthread for new ests */
>+ ipvs->est_add_ktid = id;
>+
>+ mutex_unlock(&ipvs->est_mutex);
>+
>+ return 0;
>+
>+out:
>+ mutex_unlock(&ipvs->est_mutex);
>+ if (kd) {
>+ mutex_destroy(&kd->mutex);
>+ kfree(kd);
>+ }
>+ return err;
>+}
>+
>+/* Add estimator to current kthread (est_add_ktid) */
> void ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
> {
> struct ip_vs_estimator *est = &stats->est;
>+ struct ip_vs_est_kt_data *kd = NULL;
>+ int ktid, row;
>+
>+ INIT_HLIST_NODE(&est->list);
>+ ip_vs_est_init_resched_rcu(est);
>+
>+ if (ipvs->est_add_ktid < ipvs->est_kt_count) {
>+ kd = ipvs->est_kt_arr[ipvs->est_add_ktid];
>+ if (!kd)
>+ goto add_kt;
>+ if (kd->est_count < kd->est_max_count)
>+ goto add_est;
>+ }
>
>- INIT_LIST_HEAD(&est->list);
>+add_kt:
>+ /* Create new kthread but we can exceed est_max_count on failure */
>+ if (ip_vs_est_add_kthread(ipvs) < 0) {
>+ if (!kd || kd->est_count >= INT_MAX / 2)
>+ goto out;
>+ }
>+ kd = ipvs->est_kt_arr[ipvs->est_add_ktid];
>+ if (!kd)
>+ goto out;
>+
>+add_est:
>+ ktid = kd->id;
>+ /* add_row points after the row we should use */
>+ row = READ_ONCE(kd->add_row) - 1;
>+ if (row < 0)
>+ row = IPVS_EST_NCHAINS - 1;
>+
>+ kd->est_count++;
>+ kd->chain_len[row]++;
>+ /* Multiple ests added together? Fill chains one by one. */
>+ if (!(kd->chain_len[row] & (IPVS_EST_BURST_LEN - 1)))
>+ kd->add_row = row;
>+ est->ktid = ktid;
>+ est->ktrow = row;
>+ hlist_add_head_rcu(&est->list, &kd->chains[row]);
>+
>+out:
>+ ;
>+}
>
>- spin_lock_bh(&ipvs->est_lock);
>- list_add(&est->list, &ipvs->est_list);
>- spin_unlock_bh(&ipvs->est_lock);
>+static void ip_vs_est_kthread_destroy(struct ip_vs_est_kt_data *kd)
>+{
>+ if (kd) {
>+ if (kd->task)
>+ kthread_stop(kd->task);
>+ mutex_destroy(&kd->mutex);
>+ kfree(kd);
>+ }
> }
>
>+/* Unlink estimator from list */
> void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
> {
> struct ip_vs_estimator *est = &stats->est;
>+ struct ip_vs_est_kt_data *kd;
>+ int ktid = est->ktid;
>+
>+ /* Failed to add to chain ? */
>+ if (hlist_unhashed(&est->list))
>+ goto out;
>+
>+ hlist_del_rcu(&est->list);
>+ ip_vs_est_wait_resched(ipvs, est);
>+
>+ kd = ipvs->est_kt_arr[ktid];
>+ kd->chain_len[est->ktrow]--;
>+ kd->est_count--;
>+ if (kd->est_count)
>+ goto out;
>+ pr_info("stop unused estimator thread %d...\n", ktid);
>+
>+ mutex_lock(&ipvs->est_mutex);
>+
>+ ip_vs_est_kthread_destroy(kd);
>+ ipvs->est_kt_arr[ktid] = NULL;
>+ if (ktid == ipvs->est_kt_count - 1)
>+ ipvs->est_kt_count--;
>+
>+ mutex_unlock(&ipvs->est_mutex);
>+
>+ if (ktid == ipvs->est_add_ktid) {
>+ int count = ipvs->est_kt_count;
>+ int best = -1;
>+
>+ while (count-- > 0) {
>+ if (!ipvs->est_add_ktid)
>+ ipvs->est_add_ktid = ipvs->est_kt_count;
>+ ipvs->est_add_ktid--;
>+ kd = ipvs->est_kt_arr[ipvs->est_add_ktid];
>+ if (!kd)
>+ continue;
>+ if (kd->est_count < kd->est_max_count) {
>+ best = ipvs->est_add_ktid;
>+ break;
>+ }
>+ if (best < 0)
>+ best = ipvs->est_add_ktid;
>+ }
>+ if (best >= 0)
>+ ipvs->est_add_ktid = best;
>+ }
>
>- spin_lock_bh(&ipvs->est_lock);
>- list_del(&est->list);
>- spin_unlock_bh(&ipvs->est_lock);
>+out:
>+ ;
> }
>
> void ip_vs_zero_estimator(struct ip_vs_stats *stats)
>@@ -191,14 +451,21 @@ void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats)
>
> int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs)
> {
>- INIT_LIST_HEAD(&ipvs->est_list);
>- spin_lock_init(&ipvs->est_lock);
>- timer_setup(&ipvs->est_timer, estimation_timer, 0);
>- mod_timer(&ipvs->est_timer, jiffies + 2 * HZ);
>+ ipvs->est_kt_arr = NULL;
>+ ipvs->est_kt_count = 0;
>+ ipvs->est_add_ktid = 0;
>+ atomic_set(&ipvs->est_genid, 0);
>+ atomic_set(&ipvs->est_genid_done, 0);
>+ __mutex_init(&ipvs->est_mutex, "ipvs->est_mutex", &__ipvs_est_key);
> return 0;
> }
>
> void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs)
> {
>- del_timer_sync(&ipvs->est_timer);
>+ int i;
>+
>+ for (i = 0; i < ipvs->est_kt_count; i++)
>+ ip_vs_est_kthread_destroy(ipvs->est_kt_arr[i]);
>+ kfree(ipvs->est_kt_arr);
>+ mutex_destroy(&ipvs->est_mutex);
> }
>--
>2.37.2