Hi Julian,

Thanks for getting back to me. This is my first patch to the kernel, so I
really appreciate your patience reviewing it. I will update the patch based
on your comments shortly.

Regards,

Andrew Sy Kim

On Tue, May 26, 2020 at 5:25 PM Julian Anastasov <ja@xxxxxx> wrote:
>
>
> 	Hello,
>
> 	Long CC list trimmed...
>
> On Sun, 24 May 2020, Andrew Sy Kim wrote:
>
> > If expire_nodest_conn=1 and a destination is deleted, IPVS should
> > also expire all matching connections immediately instead of waiting for
> > the next matching packet. This is particularly useful when there are a
> > lot of packets coming from a small number of clients. Those clients are
> > likely to match against existing entries if a source port in the
> > connection hash is reused. When the number of entries in the connection
> > tracker is large, we can significantly reduce the number of dropped
> > packets by expiring all connections upon deletion in a kthread.
> >
> > Signed-off-by: Andrew Sy Kim <kim.andrewsy@xxxxxxxxx>
> > ---
> >  include/net/ip_vs.h             | 12 +++++++++
> >  net/netfilter/ipvs/ip_vs_conn.c | 48 +++++++++++++++++++++++++++++++++
> >  net/netfilter/ipvs/ip_vs_core.c | 45 +++++++++++++------------------
> >  net/netfilter/ipvs/ip_vs_ctl.c  | 16 +++++++++++
> >  4 files changed, 95 insertions(+), 26 deletions(-)
> >
>
> >  /* Get reference to gain full access to conn.
> >   * By default, RCU read-side critical sections have access only to
> >   * conn fields and its PE data, see ip_vs_conn_rcu_free() for reference.
> > diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
> > index 02f2f636798d..111fa0e287a2 100644
> > --- a/net/netfilter/ipvs/ip_vs_conn.c
> > +++ b/net/netfilter/ipvs/ip_vs_conn.c
> > @@ -1366,6 +1366,54 @@ static void ip_vs_conn_flush(struct netns_ipvs *ipvs)
> >  			goto flush_again;
> >  		}
> >  	}
> > +
> > +/* Thread function to flush all the connection entries in the
> > + * ip_vs_conn_tab with a matching destination.
> > + */
> > +int ip_vs_conn_flush_dest(void *data)
> > +{
> > +	struct ip_vs_conn_flush_dest_tinfo *tinfo = data;
> > +	struct netns_ipvs *ipvs = tinfo->ipvs;
> > +	struct ip_vs_dest *dest = tinfo->dest;
>
> 	Starting a kthread just for a single dest can cause storms
> when many dests are used. IMHO, we should work this way:
>
> - do not use kthreads: they are hard to manage, and they start only from
> process context (we cannot retry from a timer if creating a kthread
> fails for some reason).
>
> - use delayed_work, similar to our defense_work, but this time
> we should use queue_delayed_work(system_long_wq,...) instead of
> schedule_delayed_work(). Just cancel_delayed_work_sync() is needed
> to stop the work. The callback function will not start the
> work timer.
>
> - we will use one work for the netns (struct netns_ipvs *ipvs):
> __ip_vs_del_dest() will start it for the next jiffie (delay=1) to
> catch more dests for flushing. As a result, the first destination
> will start the work timer, and other dests will do nothing while the
> timer is pending. When the timer expires, the work is queued to a
> worker, so subsequent dests will start the timer again, even while
> the work is executing the callback function.
>
> > +
> > +	int idx;
> > +	struct ip_vs_conn *cp, *cp_c;
> > +
> > +	IP_VS_DBG_BUF(4, "flushing all connections with destination %s:%d",
> > +		      IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));
>
> 	We will not provide the current dest. Still, the above was not
> safe because we do not hold a reference to dest.
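
Understood, I'll drop the kthread and the per-dest flush entirely. To make
sure I follow the delayed_work plan, here is a rough sketch of the per-netns
plumbing I have in mind, modeled on how defense_work is handled (the
expire_nodest_work field name is taken from your snippet below; the handler
and flush function names are just placeholders):

#include <linux/workqueue.h>

/* include/net/ip_vs.h: one work per netns, not per dest */
struct netns_ipvs {
	/* ... existing fields ... */
	struct delayed_work	expire_nodest_work; /* flush conns to removed dests */
};

/* ip_vs_control_net_init(), next to the defense_work setup: only initialize,
 * the work is first queued from __ip_vs_del_dest()
 */
	INIT_DELAYED_WORK(&ipvs->expire_nodest_work, expire_nodest_conn_handler);

/* ip_vs_control_net_cleanup(): make sure the work can no longer run once the
 * netns goes away, mirroring how defense_work is stopped
 */
	cancel_delayed_work_sync(&ipvs->expire_nodest_work);
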
>
> > +
> > +	rcu_read_lock();
> > +	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
> > +		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
> > +			if (cp->ipvs != ipvs)
> > +				continue;
> > +
> > +			if (cp->dest != dest)
> > +				continue;
>
> 	struct ip_vs_dest *dest = cp->dest;
>
> 	if (!dest || (dest->flags & IP_VS_DEST_F_AVAILABLE))
> 		continue;
>
> 	We can access dest because under RCU grace period
> we have access to the cp->dest fields. But we
> should read cp->dest once as done above.
>
> > +
> > +			/* As timers are expired in LIFO order, restart
> > +			 * the timer of controlling connection first, so
> > +			 * that it is expired after us.
> > +			 */
> > +			cp_c = cp->control;
> > +			/* cp->control is valid only with reference to cp */
> > +			if (cp_c && __ip_vs_conn_get(cp)) {
> > +				IP_VS_DBG(4, "del controlling connection\n");
> > +				ip_vs_conn_expire_now(cp_c);
> > +				__ip_vs_conn_put(cp);
> > +			}
> > +
> > +			IP_VS_DBG(4, "del connection\n");
> > +			ip_vs_conn_expire_now(cp);
> > +		}
> > +		cond_resched_rcu();
>
> 	if (!ipvs->enable)
> 		break;
>
> 	Abort immediately if netns cleanup is started.
>
> > +	}
> > +	rcu_read_unlock();
> > +
> > +	return 0;
> > +}
> > +EXPORT_SYMBOL_GPL(ip_vs_conn_flush_dest);
> > +
> >  /*
> >   *	per netns init and exit
> >   */
> > diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
> > index aa6a603a2425..ff052e57e054 100644
> > --- a/net/netfilter/ipvs/ip_vs_core.c
> > +++ b/net/netfilter/ipvs/ip_vs_core.c
> > @@ -24,6 +24,7 @@
> >
> >  #include <linux/module.h>
> >  #include <linux/kernel.h>
> > +#include <linux/kthread.h>
> >  #include <linux/ip.h>
> >  #include <linux/tcp.h>
> >  #include <linux/sctp.h>
> > @@ -694,11 +695,6 @@ static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs)
> >  	return ipvs->sysctl_nat_icmp_send;
> >  }
> >
> > -static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs)
> > -{
> > -	return ipvs->sysctl_expire_nodest_conn;
> > -}
> > -
> >  #else
> >
> >  static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return 0; }
> > @@ -2095,36 +2091,33 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
> >  		}
> >  	}
> >
> > -	if (unlikely(!cp)) {
> > -		int v;
> > -
> > -		if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph))
> > -			return v;
> > -	}
> > -
> > -	IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet");
> > -
> >  	/* Check the server status */
> > -	if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
> > +	if (cp && cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
> >  		/* the destination server is not available */
> >
> > -		__u32 flags = cp->flags;
> > -
> > -		/* when timer already started, silently drop the packet.*/
> > -		if (timer_pending(&cp->timer))
> > -			__ip_vs_conn_put(cp);
> > -		else
> > -			ip_vs_conn_put(cp);
> > +		if (sysctl_expire_nodest_conn(ipvs)) {
> > +			bool uses_ct = ip_vs_conn_uses_conntrack(cp, skb);
> >
> > -		if (sysctl_expire_nodest_conn(ipvs) &&
> > -		    !(flags & IP_VS_CONN_F_ONE_PACKET)) {
> > -			/* try to expire the connection immediately */
> >  			ip_vs_conn_expire_now(cp);
> > +			__ip_vs_conn_put(cp);
> > +			if (uses_ct)
> > +				return NF_DROP;
> > +			cp = NULL;
> > +		} else {
> > +			__ip_vs_conn_put(cp);
> > +			return NF_DROP;
> >  		}
> > +	}
> >
> > -		return NF_DROP;
> > +	if (unlikely(!cp)) {
> > +		int v;
> > +
> > +		if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph))
> > +			return v;
> >  	}
> >
> > +	IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet");
> > +
>
> 	The above code in ip_vs_in() is correct.
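
Good, I'll keep the ip_vs_in() changes as they are then. For the flush
itself, is something along these lines in ip_vs_conn.c what you have in mind
once it becomes the body of the work callback? It keys off
IP_VS_DEST_F_AVAILABLE instead of a specific dest, reads cp->dest only once,
and bails out when the netns is being cleaned up (the function name is just
a placeholder):

/* Expire all connections whose destination is no longer available (sketch) */
void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs)
{
	int idx;
	struct ip_vs_conn *cp, *cp_c;
	struct ip_vs_dest *dest;

	rcu_read_lock();
	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
			if (cp->ipvs != ipvs)
				continue;

			/* Read cp->dest once; we only have RCU access to it */
			dest = cp->dest;
			if (!dest || (dest->flags & IP_VS_DEST_F_AVAILABLE))
				continue;

			/* As timers are expired in LIFO order, restart
			 * the timer of controlling connection first, so
			 * that it is expired after us.
			 */
			cp_c = cp->control;
			/* cp->control is valid only with reference to cp */
			if (cp_c && __ip_vs_conn_get(cp)) {
				IP_VS_DBG(4, "del controlling connection\n");
				ip_vs_conn_expire_now(cp_c);
				__ip_vs_conn_put(cp);
			}
			ip_vs_conn_expire_now(cp);
		}
		cond_resched_rcu();

		/* Netns cleanup started, abort immediately */
		if (!ipvs->enable)
			break;
	}
	rcu_read_unlock();
}
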
>
> >  	ip_vs_in_stats(cp, skb);
> >  	ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
> >  	if (cp->packet_xmit)
> > diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
> > index 8d14a1acbc37..fa48268368fc 100644
> > --- a/net/netfilter/ipvs/ip_vs_ctl.c
> > +++ b/net/netfilter/ipvs/ip_vs_ctl.c
> > @@ -1163,6 +1163,22 @@ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest,
> >  	list_add(&dest->t_list, &ipvs->dest_trash);
> >  	dest->idle_start = 0;
> >  	spin_unlock_bh(&ipvs->dest_trash_lock);
> > +
> > +	/* If expire_nodest_conn is enabled, expire all connections
> > +	 * immediately in a kthread.
> > +	 */
> > +	if (sysctl_expire_nodest_conn(ipvs)) {
>
> 	Looks like we should not start the work when 'cleanup' is true; it
> indicates that we are doing a final release of all resources.
>
> 	if (sysctl_expire_nodest_conn(ipvs) && !cleanup)
> 		queue_delayed_work(system_long_wq, &ipvs->expire_nodest_work, 1);
>
> > +		struct ip_vs_conn_flush_dest_tinfo *tinfo = NULL;
> > +
> > +		tinfo = kcalloc(1, sizeof(struct ip_vs_conn_flush_dest_tinfo),
> > +				GFP_KERNEL);
> > +		tinfo->ipvs = ipvs;
> > +		tinfo->dest = dest;
> > +
> > +		IP_VS_DBG(3, "flushing connections in kthread\n");
> > +		kthread_run(ip_vs_conn_flush_dest,
> > +			    tinfo, "ipvs-flush-dest-conn");
> > +	}
> >  }
>
> 	Regards
>
> --
> Julian Anastasov <ja@xxxxxx>
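
And on the ip_vs_ctl.c side, I would wire it up roughly like this, following
your snippet above (again just a sketch; the handler and flush function names
are placeholders):

/* Delayed work handler: does a single flush pass and does not re-arm
 * itself; __ip_vs_del_dest() queues the work again whenever more dests
 * are removed.
 */
static void expire_nodest_conn_handler(struct work_struct *work)
{
	struct netns_ipvs *ipvs;

	ipvs = container_of(work, struct netns_ipvs,
			    expire_nodest_work.work);
	ip_vs_expire_nodest_conn_flush(ipvs);
}

/* In __ip_vs_del_dest(), replacing the kthread_run() block: skip the work
 * during final cleanup, and use delay=1 so several dests removed back to
 * back are handled by one flush pass.
 */
	if (sysctl_expire_nodest_conn(ipvs) && !cleanup)
		queue_delayed_work(system_long_wq,
				   &ipvs->expire_nodest_work, 1);

Does this match what you had in mind?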