Hi, > > I just don't understand why rejecting a client connection when there are > > servers available is desirable behaviour. > The problem is that every move leads to problems: > - add/remove destination => mapping is changed for all dests > - set weight to 0 and allow fallback => mapping is changed for > two connections from same IP Fair enough, although I would guess two connections from the same IP going to different servers wouldn't be an issue in many cases. > - persistence implemented with SH => fallback is risky. Usually, > we use expire_quiescent_template for such cases when persistence > is used. Can you elaborate on what you mean by "risky" here? > - same mapping for many directors => fallback is desired when > config is same on all directors and persistence behaviour is > not desired. Indeed. > Not sure if we can apply the expire_quiescent_template flag to > the SH scheduler to control fallback. But then it's controlled by a sysctl, not per virtual server, which is something we didn't want, I believe. Are you happy for me to go ahead and add the per-service scheduler flags (IP_VS_SVC_F_SCHED1, etc.), like you suggested previously? At the moment, the patch looks like this (pending a decision on how to enable the features): diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index 0df269d..abd8ed6 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -48,6 +48,10 @@ #include <net/ip_vs.h> +#include <net/tcp.h> +#include <linux/udp.h> +#include <linux/sctp.h> + /* * IPVS SH bucket @@ -74,7 +78,9 @@ struct ip_vs_sh_state { /* * Returns hash value for IPVS SH entry */ -static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr) +static inline unsigned int ip_vs_sh_hashkey(int af, + const union nf_inet_addr *addr, __be16 port, + unsigned int offset) { __be32 addr_fold = addr->ip; @@ -83,7 +89,8 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - return (ntohl(addr_fold)*2654435761UL) & IP_VS_SH_TAB_MASK; + return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) & + IP_VS_SH_TAB_MASK; } @@ -91,9 +98,11 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad * Get ip_vs_dest associated with supplied parameters. */ static inline struct ip_vs_dest * -ip_vs_sh_get(int af, struct ip_vs_sh_state *s, const union nf_inet_addr *addr) +ip_vs_sh_get(int af, struct ip_vs_sh_state *s, const union nf_inet_addr *addr, + __be16 port, unsigned int offset) { - return rcu_dereference(s->buckets[ip_vs_sh_hashkey(af, addr)].dest); + return rcu_dereference( + s->buckets[ip_vs_sh_hashkey(af, addr, port, offset)].dest); } @@ -232,17 +241,43 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) struct ip_vs_dest *dest; struct ip_vs_sh_state *s; struct ip_vs_iphdr iph; + unsigned int offset; + unsigned int found; ip_vs_fill_iph_addr_only(svc->af, skb, &iph); IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); + /* XXX if L4 hash */ + if (0) + port = ip_vs_sh_get_port(svc, skb, iph); + else + port = 0; + s = (struct ip_vs_sh_state *) svc->sched_data; - dest = ip_vs_sh_get(svc->af, s, &iph.saddr); - if (!dest - || !(dest->flags & IP_VS_DEST_F_AVAILABLE) - || atomic_read(&dest->weight) <= 0 - || is_overloaded(dest)) { + /* XXX if fallback */ + if (0) { + found = 0; + for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) { + dest = ip_vs_sh_get(svc->af, s, &iph.saddr, + port, offset); + if (!is_available(dest)) { + IP_VS_DBG_BUF(6, "SH: selected unavailable" + "server %s:%d, retrying with offset" + "%d\n", + IP_VS_DBG_ADDR(svc->af, &dest->addr), + ntohs(dest->port), + offset); + } else { + found = 1; + break; + } + } + } else { + dest = ip_vs_sh_get(svc->af, s, &iph.saddr, port, 0); + found = 1; + } + if (!found || !is_available(dest)) { ip_vs_scheduler_err(svc, "no destination available"); return NULL; } @@ -255,6 +290,50 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) return dest; } +/* + * Helper function to determine if server is available + */ +static inline int +is_available(struct ip_vs_dest *dest) +{ + return (!dest || + !(dest->flags & IP_VS_DEST_F_AVAILABLE) || + atomic_read(&dest->weight) <= 0 || + is_overloaded(dest)) +} + +/* + * Helper function to get port number + */ +static inline __be16 +ip_vs_sh_get_port(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr iph) +{ + __be16 port; + struct tcphdr _tcph, *th; + struct udphdr _udph, *uh; + sctp_sctphdr_t _sctph, *sh; + + switch (svc->protocol) { + case IPPROTO_TCP: + th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph); + port = th->source; + break; + case IPPROTO_UDP: + uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph); + port = uh->source; + break; + case IPPROTO_SCTP: + sh = skb_header_pointer(skb, iph.len, sizeof(_sctph), &_sctph); + port = sh->source; + break; + default: + port = 0; + } + + return port; +} + /* * IPVS SH Scheduler structure Alex -- To unsubscribe from this list: send the line "unsubscribe lvs-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html