Hello, On Fri, 19 Aug 2022, longguang.yue wrote: > Netfilter's rules are matched in sequence, more rules worse performance. > IPVS is a special system, its traffic is clear and definite, for better > performance, should better not be interfered heavily. This patch adds a > sysctl switch and enable ipvs to control traffic to pass netfilter > OUTPUT chain or not. > > Signed-off-by: longguang.yue <bigclouds@xxxxxxx> > --- > include/net/ip_vs.h | 11 +++++++++++ > net/netfilter/ipvs/ip_vs_ctl.c | 8 ++++++++ > net/netfilter/ipvs/ip_vs_xmit.c | 31 ++++++++++++++++++++++++------- > 3 files changed, 43 insertions(+), 7 deletions(-) > > diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h > index ff1804a0c469..c1232ef3a1b5 100644 > --- a/include/net/ip_vs.h > +++ b/include/net/ip_vs.h > @@ -932,6 +932,7 @@ struct netns_ipvs { > int sysctl_schedule_icmp; > int sysctl_ignore_tunneled; > int sysctl_run_estimation; > + int sysctl_output_bypass; > > /* ip_vs_lblc */ > int sysctl_lblc_expiration; > @@ -1077,6 +1078,11 @@ static inline int sysctl_run_estimation(struct netns_ipvs *ipvs) > return ipvs->sysctl_run_estimation; > } > > +static inline int sysctl_output_bypass(struct netns_ipvs *ipvs) > +{ > + return ipvs->sysctl_output_bypass; > +} > + > #else > > static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs) > @@ -1174,6 +1180,11 @@ static inline int sysctl_run_estimation(struct netns_ipvs *ipvs) > return 1; > } > > +static inline int sysctl_output_bypass(struct netns_ipvs *ipvs) > +{ > + return 0; > +} > + > #endif > > /* IPVS core functions > diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c > index efab2b06d373..8a08a783e85e 100644 > --- a/net/netfilter/ipvs/ip_vs_ctl.c > +++ b/net/netfilter/ipvs/ip_vs_ctl.c > @@ -2019,6 +2019,12 @@ static struct ctl_table vs_vars[] = { > .mode = 0644, > .proc_handler = proc_dointvec, > }, > + { > + .procname = "output_bypass", > + .maxlen = sizeof(int), > + .mode = 0644, > + .proc_handler = proc_dointvec, > + }, > #ifdef CONFIG_IP_VS_DEBUG > { > .procname = "debug_level", > @@ -4094,6 +4100,8 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) > tbl[idx++].data = &ipvs->sysctl_ignore_tunneled; > ipvs->sysctl_run_estimation = 1; > tbl[idx++].data = &ipvs->sysctl_run_estimation; > + ipvs->sysctl_output_bypass = 1; Default should be 0, so without above line. > + tbl[idx++].data = &ipvs->sysctl_output_bypass; > #ifdef CONFIG_IP_VS_DEBUG > /* Global sysctls must be ro in non-init netns */ > if (!net_eq(net, &init_net)) > diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c > index 029171379884..46a34dd2555e 100644 > --- a/net/netfilter/ipvs/ip_vs_xmit.c > +++ b/net/netfilter/ipvs/ip_vs_xmit.c > @@ -653,8 +653,12 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, > skb_forward_csum(skb); > if (skb->dev) > skb_clear_tstamp(skb); > - NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, > - NULL, skb_dst(skb)->dev, dst_output); > + if (sysctl_output_bypass(cp->ipvs)) { > + dst_output(cp->ipvs->net, NULL, skb); > + } else { > + NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, > + NULL, skb_dst(skb)->dev, dst_output); > + } > } else > ret = NF_ACCEPT; > > @@ -675,8 +679,12 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, > skb_forward_csum(skb); > if (skb->dev) > skb_clear_tstamp(skb); > - NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, > - NULL, skb_dst(skb)->dev, dst_output); > + if (sysctl_output_bypass(cp->ipvs)) { > + dst_output(cp->ipvs->net, NULL, skb); > + } else { > + NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, > + NULL, skb_dst(skb)->dev, dst_output); > + } > } else > ret = NF_ACCEPT; > return ret; > @@ -1262,10 +1270,19 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, > skb->ignore_df = 1; > > ret = ip_vs_tunnel_xmit_prepare(skb, cp); > - if (ret == NF_ACCEPT) > - ip_local_out(net, skb->sk, skb); > - else if (ret == NF_DROP) > + if (ret == NF_ACCEPT) { > + if (sysctl_output_bypass(cp->ipvs)) { > + struct iphdr *iph = ip_hdr(skb); > + > + iph->tot_len = htons(skb->len); > + ip_send_check(iph); > + dst_output(cp->ipvs->net, NULL, skb); After our last discussion I still think this is a hack. If you expand also __ip6_local_out you'll see what maintenance cost adds such flag. Your patch also warns how much ip_local_out changed in the years and our NF_HOOK calls look as hacks too. They were used to avoid duplicate calls like ip_send_check which we should not have anymore, ip_send_check was added to the ip_defrag flow in 2015 with commit 0848f6428ba3. So, I think we should be moving to ip_local_out/ip6_local_out calls which are risky to duplicate here in the IPVS code. > + } else { > + ip_local_out(net, skb->sk, skb); > + } > + } else if (ret == NF_DROP) { > kfree_skb(skb); > + } > > LeaveFunction(10); Regards -- Julian Anastasov <ja@xxxxxx>