On Mon, Feb 23, 2015 at 10:07:21PM +0200, Julian Anastasov wrote: > > Hello, > > On Mon, 23 Feb 2015, Marcelo Ricardo Leitner wrote: > > > Currently, when TCP/SCTP port reusing happens, IPVS will find the old > > entry and use it for the new one, behaving like a forced persistence. > > But if you consider a cluster with a heavy load of small connections, > > such reuse will happen often and may lead to a not optimal load > > balancing and might prevent a new node from getting a fair load. > > > > This patch introduces a new sysctl, conn_reuse_mode, that allows > > controlling how to proceed when port reuse is detected. The default > > value will allow rescheduling of new connections only if the old entry > > was in TIME_WAIT state for TCP or CLOSED for SCTP. > > > > Signed-off-by: Marcelo Ricardo Leitner <mleitner@xxxxxxxxxx> > > Thanks, looks good. Simon, please apply to ipvs-next. > > Signed-off-by: Julian Anastasov <ja@xxxxxx> Thanks, applied to ipvs-next. > > --- > > > > Notes: > > v1->v2: > > updated to add kfree(param->pe_data) in ip_vs_proc_conn() chunk > > > > Documentation/networking/ipvs-sysctl.txt | 21 ++++++++++++++++++++ > > include/net/ip_vs.h | 11 +++++++++++ > > net/netfilter/ipvs/ip_vs_core.c | 33 ++++++++++++++++++++++++++++---- > > net/netfilter/ipvs/ip_vs_ctl.c | 8 ++++++++ > > net/netfilter/ipvs/ip_vs_sync.c | 21 ++++++++++++++++++-- > > 5 files changed, 88 insertions(+), 6 deletions(-) > > > > diff --git a/Documentation/networking/ipvs-sysctl.txt b/Documentation/networking/ipvs-sysctl.txt > > index 7a3c047295914cbc8c4273506a9b6d35246a1750..3ba709531adba970595251fa73d6d471ed14c5c1 100644 > > --- a/Documentation/networking/ipvs-sysctl.txt > > +++ b/Documentation/networking/ipvs-sysctl.txt > > @@ -22,6 +22,27 @@ backup_only - BOOLEAN > > If set, disable the director function while the server is > > in backup mode to avoid packet loops for DR/TUN methods. 
> > > > +conn_reuse_mode - INTEGER > > + 1 - default > > + > > + Controls how ipvs will deal with connections for which > > + port reuse is detected. It is a bitmap, with the values being: > > + > > + 0: disable any special handling on port reuse. The new > > + connection will be delivered to the same real server that was > > + servicing the previous connection. This will effectively > > + disable expire_nodest_conn. > > + > > + bit 1: enable rescheduling of new connections when it is safe. > > + That is, whenever expire_nodest_conn applies and, for TCP sockets, when > > + the connection is in TIME_WAIT state (which is only possible if > > + you use NAT mode). > > + > > + bit 2: it is bit 1 plus, for TCP connections, when connections > > + are in FIN_WAIT state, as this is the last state seen by the load > > + balancer in Direct Routing mode. This bit helps when adding new > > + real servers to a very busy cluster. > > + > > conntrack - BOOLEAN > > 0 - disabled (default) > > not 0 - enabled > > diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h > > index 615b20b585452111a25085890d8fa875657dbe76..6c7ee0ae7ef1694671e4b6af0906b2fa077f5c7c 100644 > > --- a/include/net/ip_vs.h > > +++ b/include/net/ip_vs.h > > @@ -924,6 +924,7 @@ struct netns_ipvs { > > int sysctl_nat_icmp_send; > > int sysctl_pmtu_disc; > > int sysctl_backup_only; > > + int sysctl_conn_reuse_mode; > > > > /* ip_vs_lblc */ > > int sysctl_lblc_expiration; > > @@ -1042,6 +1043,11 @@ static inline int sysctl_backup_only(struct netns_ipvs *ipvs) > > ipvs->sysctl_backup_only; > > } > > > > +static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs) > > +{ > > + return ipvs->sysctl_conn_reuse_mode; > > +} > > + > > #else > > > > static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs) > > @@ -1109,6 +1115,11 @@ static inline int sysctl_backup_only(struct netns_ipvs *ipvs) > > return 0; > > } > > > > +static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs) > > +{ > > + return 1; > > +} > > + > > 
#endif > > > > /* IPVS core functions > > diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c > > index b87ca32efa0b4e6edc7f251c2c32c4ba3b55659c..3ec9b1a54024fa421f330cf1d0eeb67da9683127 100644 > > --- a/net/netfilter/ipvs/ip_vs_core.c > > +++ b/net/netfilter/ipvs/ip_vs_core.c > > @@ -1046,6 +1046,26 @@ static inline bool is_new_conn(const struct sk_buff *skb, > > } > > } > > > > +static inline bool is_new_conn_expected(const struct ip_vs_conn *cp, > > + int conn_reuse_mode) > > +{ > > + /* Controlled (FTP DATA or persistence)? */ > > + if (cp->control) > > + return false; > > + > > + switch (cp->protocol) { > > + case IPPROTO_TCP: > > + return (cp->state == IP_VS_TCP_S_TIME_WAIT) || > > + ((conn_reuse_mode & 2) && > > + (cp->state == IP_VS_TCP_S_FIN_WAIT) && > > + (cp->flags & IP_VS_CONN_F_NOOUTPUT)); > > + case IPPROTO_SCTP: > > + return cp->state == IP_VS_SCTP_S_CLOSED; > > + default: > > + return false; > > + } > > +} > > + > > /* Handle response packets: rewrite addresses and send away... > > */ > > static unsigned int > > @@ -1585,6 +1605,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) > > struct ip_vs_conn *cp; > > int ret, pkts; > > struct netns_ipvs *ipvs; > > + int conn_reuse_mode; > > > > /* Already marked as IPVS request or reply? 
*/ > > if (skb->ipvs_property) > > @@ -1653,10 +1674,14 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) > > */ > > cp = pp->conn_in_get(af, skb, &iph, 0); > > > > - if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp && cp->dest && > > - unlikely(!atomic_read(&cp->dest->weight)) && !iph.fragoffs && > > - is_new_conn(skb, &iph)) { > > - ip_vs_conn_expire_now(cp); > > + conn_reuse_mode = sysctl_conn_reuse_mode(ipvs); > > + if (conn_reuse_mode && !iph.fragoffs && > > + is_new_conn(skb, &iph) && cp && > > + ((unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest && > > + unlikely(!atomic_read(&cp->dest->weight))) || > > + unlikely(is_new_conn_expected(cp, conn_reuse_mode)))) { > > + if (!atomic_read(&cp->n_control)) > > + ip_vs_conn_expire_now(cp); > > __ip_vs_conn_put(cp); > > cp = NULL; > > } > > diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c > > index e55759056361c47ed1fcfa5c656541ba39bfd260..ec7f6f1e07cee1d15a6f839defc86aec8abd821e 100644 > > --- a/net/netfilter/ipvs/ip_vs_ctl.c > > +++ b/net/netfilter/ipvs/ip_vs_ctl.c > > @@ -1808,6 +1808,12 @@ static struct ctl_table vs_vars[] = { > > .mode = 0644, > > .proc_handler = proc_dointvec, > > }, > > + { > > + .procname = "conn_reuse_mode", > > + .maxlen = sizeof(int), > > + .mode = 0644, > > + .proc_handler = proc_dointvec, > > + }, > > #ifdef CONFIG_IP_VS_DEBUG > > { > > .procname = "debug_level", > > @@ -3732,6 +3738,8 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net) > > ipvs->sysctl_pmtu_disc = 1; > > tbl[idx++].data = &ipvs->sysctl_pmtu_disc; > > tbl[idx++].data = &ipvs->sysctl_backup_only; > > + ipvs->sysctl_conn_reuse_mode = 1; > > + tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode; > > > > > > ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); > > diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c > > index c47ffd7a0a709cb73834c84652f251960f25db79..f96229cdb6e184543b6b958575c08c5a3c1b4b72 100644 > > --- 
a/net/netfilter/ipvs/ip_vs_sync.c > > +++ b/net/netfilter/ipvs/ip_vs_sync.c > > @@ -845,10 +845,27 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, > > struct ip_vs_conn *cp; > > struct netns_ipvs *ipvs = net_ipvs(net); > > > > - if (!(flags & IP_VS_CONN_F_TEMPLATE)) > > + if (!(flags & IP_VS_CONN_F_TEMPLATE)) { > > cp = ip_vs_conn_in_get(param); > > - else > > + if (cp && ((cp->dport != dport) || > > + !ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) { > > + if (!(flags & IP_VS_CONN_F_INACTIVE)) { > > + ip_vs_conn_expire_now(cp); > > + __ip_vs_conn_put(cp); > > + cp = NULL; > > + } else { > > + /* This is the expiration message for the > > + * connection that was already replaced, so we > > + * just ignore it. > > + */ > > + __ip_vs_conn_put(cp); > > + kfree(param->pe_data); > > + return; > > + } > > + } > > + } else { > > cp = ip_vs_ct_in_get(param); > > + } > > > > if (cp) { > > /* Free pe_data */ > > -- > > 1.9.3 > > Regards > > -- > Julian Anastasov <ja@xxxxxx> > -- To unsubscribe from this list: send the line "unsubscribe lvs-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html