Based on Google's Maglev algorithm [1][2], this scheduler builds a lookup table in a way that minimizes disruption when a change occurs. This helps in the case of an active/active setup without synchronization. As with classic source hashing, this lookup table is used to assign connections to a real server. Both source address and port are used to compute the hash (unlike sh, where the port is optional). Weights are correctly handled. Unlike sh, servers with a weight of 0 are considered absent. Also, unlike sh, when a server becomes unavailable due to a threshold, no fallback is possible: doing so would seriously impair the usefulness of using a consistent hash. There is a small hack to detect when all real servers have a weight of 0. It relies on the fact that it is not possible for the weight of a real server to change during the execution of the assignment. I believe this is the case, as modifications through netlink are subject to a mutex, but the use of atomic_read() is unsettling. The value of 65537 for the hash table size is currently not modifiable at compile-time. This is the value suggested in the Maglev paper. Other possible values are 257 (for small tests) and 655373 (for very large setups). 
[1]: https://research.google.com/pubs/pub44824.html [2]: https://blog.acolyer.org/2016/03/21/maglev-a-fast-and-reliable-software-network-load-balancer/ Signed-off-by: Vincent Bernat <vincent@xxxxxxxxx> --- include/net/ip_vs.h | 27 ++++ net/netfilter/ipvs/Kconfig | 13 ++ net/netfilter/ipvs/Makefile | 1 + net/netfilter/ipvs/ip_vs_csh.c | 339 +++++++++++++++++++++++++++++++++++++++++ net/netfilter/ipvs/ip_vs_sh.c | 32 +--- 5 files changed, 381 insertions(+), 31 deletions(-) create mode 100644 net/netfilter/ipvs/ip_vs_csh.c diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index eb0bec043c96..2184b43b7320 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -195,6 +195,33 @@ static inline int ip_vs_addr_equal(int af, const union nf_inet_addr *a, return a->ip == b->ip; } +static inline __be16 ip_vs_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph) +{ + __be16 _ports[2], *ports; + + /* At this point we know that we have a valid packet of some kind. + * Because ICMP packets are only guaranteed to have the first 8 + * bytes, let's just grab the ports. Fortunately they're in the + * same position for all three of the protocols we care about. + */ + switch (iph->protocol) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_SCTP: + ports = skb_header_pointer(skb, iph->len, sizeof(_ports), + &_ports); + if (unlikely(!ports)) + return 0; + + if (likely(!ip_vs_iph_inverse(iph))) + return ports[0]; + else + return ports[1]; + default: + return 0; + } +} + #ifdef CONFIG_IP_VS_DEBUG #include <linux/net.h> diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index b32fb0dbe237..bfd091e020af 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -225,6 +225,19 @@ config IP_VS_SH If you want to compile it in kernel, say Y. To compile it as a module, choose M here. If unsure, say N. 
+config IP_VS_CSH + tristate "consistent source hashing scheduling" + ---help--- + The consistent source hashing scheduling algorithm assigns + network connections to the servers through looking up a + statically assigned hash table by their source IP addresses + and ports. The hash table is built to minimize disruption + when a server becomes unavailable. It relies on the Maglev + algorithm. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + config IP_VS_SED tristate "shortest expected delay scheduling" ---help--- diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile index c552993fa4b9..7d0badf86dfe 100644 --- a/net/netfilter/ipvs/Makefile +++ b/net/netfilter/ipvs/Makefile @@ -33,6 +33,7 @@ obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o +obj-$(CONFIG_IP_VS_CSH) += ip_vs_csh.o obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o diff --git a/net/netfilter/ipvs/ip_vs_csh.c b/net/netfilter/ipvs/ip_vs_csh.c new file mode 100644 index 000000000000..c70db61ba934 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_csh.c @@ -0,0 +1,339 @@ +/* + * IPVS: Consistent Hashing scheduling module using Google's Maglev + * + * Authors: Vincent Bernat <vincent@xxxxxxxxx> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The Maglev algorithm is a consistent hashing algorithm described in + * section 3.4 of "Maglev: A Fast and Reliable Software Network Load + * Balancer" (https://research.google.com/pubs/pub44824.html). + * + * The following pseudo-code from listing in page 6 is implemented + * using M = 65537. 
Weight is implemented by allowing servers to push + * their candidates several times at each turn. Currently, thresholds + * are ignored. + * + * Both source address and port are used for the hash. IPVS runs after + * fragment reassembly, so source port is always available. + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/ip.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> + +#include <net/ip_vs.h> + +#include <net/tcp.h> +#include <linux/udp.h> +#include <linux/sctp.h> + +#define IP_VS_CSH_TAB_SIZE 65537 + +/* + * IPVS CSH bucket + */ +struct ip_vs_csh_bucket { + struct ip_vs_dest __rcu *dest; /* real server (cache) */ + bool assigned; +}; + + +struct ip_vs_csh_state { + struct rcu_head rcu_head; + struct ip_vs_csh_bucket buckets[IP_VS_CSH_TAB_SIZE]; +}; + + +/* Helper function to determine if server is unavailable */ +static inline bool is_unavailable(struct ip_vs_dest *dest) +{ + return dest->flags & IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Returns hash value for IPVS CSH entry + */ +static inline unsigned int +ip_vs_csh_hashkey(int af, const union nf_inet_addr *addr, + __be16 port) +{ + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + return hash_32(ntohs(port) + ntohl(addr_fold), 32) % IP_VS_CSH_TAB_SIZE; +} + + +/* + * Get ip_vs_dest associated with supplied parameters. + */ +static inline struct ip_vs_dest * +ip_vs_csh_get(struct ip_vs_service *svc, struct ip_vs_csh_state *s, + const union nf_inet_addr *addr, __be16 port) +{ + unsigned int hash = ip_vs_csh_hashkey(svc->af, addr, port); + struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest); + + return (!dest || is_unavailable(dest)) ? NULL : dest; +} + +/* + * For provided destination, return the "j"th element of its permutation. 
+ */ +static inline u32 +ip_vs_csh_permutation(struct ip_vs_dest *d, int j) +{ + u32 offset, skip; + __be32 addr_fold = d->addr.ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (d->af == AF_INET6) + addr_fold = d->addr.ip6[0]^d->addr.ip6[1]^ + d->addr.ip6[2]^d->addr.ip6[3]; +#endif + addr_fold = ntohl(addr_fold) + ntohs(d->port); + offset = hash_32(addr_fold, 32) % IP_VS_CSH_TAB_SIZE; + skip = (hash_32(addr_fold + 1, 32) % (IP_VS_CSH_TAB_SIZE - 1)) + 1; + return (offset + j * skip) % IP_VS_CSH_TAB_SIZE; +} + + +/* + * Flush all the hash buckets of the specified table. + */ +static void ip_vs_csh_flush(struct ip_vs_csh_state *s) +{ + int i; + struct ip_vs_csh_bucket *b; + struct ip_vs_dest *dest; + + b = &s->buckets[0]; + for (i=0; i<IP_VS_CSH_TAB_SIZE; i++) { + dest = rcu_dereference_protected(b->dest, 1); + if (dest) { + ip_vs_dest_put(dest); + RCU_INIT_POINTER(b->dest, NULL); + } + b++; + } +} + + +/* + * Assign all the hash buckets of the specified table with the service. + */ +static int +ip_vs_csh_reassign(struct ip_vs_csh_state *s, struct ip_vs_service *svc) +{ + int n, c, i, j; + struct ip_vs_csh_bucket *b; + struct list_head *p = &svc->destinations; + struct ip_vs_dest *dest, *olddest; + int num_dests = svc->num_dests; + int d_count, weight; + int *next = NULL; + + /* Special case: no real servers */ + if (list_empty(p)) { + ip_vs_csh_flush(s); + return 0; + } + + /* For each destination, reset the position in the permutation + * list. */ + next = kzalloc(sizeof(int) * num_dests, GFP_KERNEL); + if (next == NULL) + return -ENOMEM; + + /* For each bucket, flip the assigned bit: the destination has + * not been set. 
*/ + for (n=0, b = &s->buckets[0]; + n<IP_VS_CSH_TAB_SIZE; + n++, b++) { + b->assigned = false; + } + + d_count = 0; + i = 0; + j = 0; + n = 0; + while (true) { + if (p == &svc->destinations) + p = p->next; + dest = list_entry(p, struct ip_vs_dest, n_list); + weight = atomic_read(&dest->weight); + + if (weight > 0) { + /* Find the next preferred bucket for the destination. */ + ip_vs_dest_hold(dest); + do { + c = ip_vs_csh_permutation(dest, next[i]); + b = &s->buckets[c]; + next[i]++; + } while (b->assigned); + + /* Assign the bucket. */ + b->assigned = 1; + olddest = rcu_dereference_protected(b->dest, 1); + if (olddest) + ip_vs_dest_put(olddest); + RCU_INIT_POINTER(b->dest, dest); + + IP_VS_DBG_BUF(6, "CSH: assigned c: %d dest: %s:%d weight: %d\n", + c, IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), + atomic_read(&dest->weight)); + if (++n == IP_VS_CSH_TAB_SIZE) break; + } + + if (++j == num_dests && n == 0) { + IP_VS_DBG(6, "CSH: all servers have 0 weight\n"); + ip_vs_csh_flush(s); + break; + } + + /* Don't move to next dest until filling weight */ + if (++d_count >= weight) { + p = p->next; + i = (i + 1) % num_dests; + d_count = 0; + } + } + + kfree(next); + return 0; +} + + +static int ip_vs_csh_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_csh_state *s; + + /* allocate the SH table for this service */ + s = kzalloc(sizeof(struct ip_vs_csh_state), GFP_KERNEL); + if (s == NULL) + return -ENOMEM; + + svc->sched_data = s; + IP_VS_DBG(6, "CSH: hash table (memory=%zdbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_csh_bucket)*IP_VS_CSH_TAB_SIZE); + + /* assign the hash buckets with current dests */ + ip_vs_csh_reassign(s, svc); + + return 0; +} + + +static void ip_vs_csh_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_csh_state *s = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_csh_flush(s); + + /* release the table itself */ + kfree_rcu(s, rcu_head); + IP_VS_DBG(6, "CSH: hash table (memory=%zdbytes) 
released\n", + sizeof(struct ip_vs_csh_bucket)*IP_VS_CSH_TAB_SIZE); +} + + +static int ip_vs_csh_dest_changed(struct ip_vs_service *svc, + struct ip_vs_dest *dest) +{ + struct ip_vs_csh_state *s = svc->sched_data; + + /* assign the hash buckets with the updated service */ + ip_vs_csh_reassign(s, svc); + + return 0; +} + +/* + * Consistent Source Hashing scheduling with Maglev + */ +static struct ip_vs_dest * +ip_vs_csh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest; + struct ip_vs_csh_state *s; + __be16 port = 0; + const union nf_inet_addr *hash_addr; + + hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr; + port = ip_vs_get_port(skb, iph); + + s = (struct ip_vs_csh_state *) svc->sched_data; + dest = ip_vs_csh_get(svc, s, hash_addr, port); + + if (!dest) { + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + } + + IP_VS_DBG_BUF(6, "CSH: source IP address %s:%d --> server %s:%d\n", + IP_VS_DBG_ADDR(svc->af, hash_addr), + ntohs(port), + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS CSH Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_csh_scheduler = +{ + .name = "csh", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_csh_scheduler.n_list), + .init_service = ip_vs_csh_init_svc, + .done_service = ip_vs_csh_done_svc, + .add_dest = ip_vs_csh_dest_changed, + .del_dest = ip_vs_csh_dest_changed, + .upd_dest = ip_vs_csh_dest_changed, + .schedule = ip_vs_csh_schedule, +}; + + +static int __init ip_vs_csh_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_csh_scheduler); +} + + +static void __exit ip_vs_csh_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_csh_scheduler); + synchronize_rcu(); +} + + +module_init(ip_vs_csh_init); +module_exit(ip_vs_csh_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index 
16aaac6eedc9..b67565d12b6a 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -276,36 +276,6 @@ static int ip_vs_sh_dest_changed(struct ip_vs_service *svc, } -/* Helper function to get port number */ -static inline __be16 -ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph) -{ - __be16 _ports[2], *ports; - - /* At this point we know that we have a valid packet of some kind. - * Because ICMP packets are only guaranteed to have the first 8 - * bytes, let's just grab the ports. Fortunately they're in the - * same position for all three of the protocols we care about. - */ - switch (iph->protocol) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_SCTP: - ports = skb_header_pointer(skb, iph->len, sizeof(_ports), - &_ports); - if (unlikely(!ports)) - return 0; - - if (likely(!ip_vs_iph_inverse(iph))) - return ports[0]; - else - return ports[1]; - default: - return 0; - } -} - - /* * Source Hashing scheduling */ @@ -323,7 +293,7 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT) - port = ip_vs_sh_get_port(skb, iph); + port = ip_vs_get_port(skb, iph); s = (struct ip_vs_sh_state *) svc->sched_data; -- 2.16.3 -- To unsubscribe from this list: send the line "unsubscribe lvs-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html