1) Change table size to primes array 2) Add to check dest number > IP_VS_MH_TAB_SIZE 3) Replace IP_VS_MH_LOOKUP_SIZE with IP_VS_MH_TAB_SIZE 4) Replace large permutation array with dest_setup 5) Add gcd, rshift and calculate ds->turns Signed-off-by: Inju Song <inju.song@xxxxxxxxxxxxx> --- net/netfilter/ipvs/ip_vs_mh.c | 291 ++++++++++++++++++++++++++---------------- 1 file changed, 178 insertions(+), 113 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c index d1bf50a..c868778 100644 --- a/net/netfilter/ipvs/ip_vs_mh.c +++ b/net/netfilter/ipvs/ip_vs_mh.c @@ -5,12 +5,13 @@ #include <linux/slab.h> #include <linux/module.h> #include <linux/kernel.h> -#include <linux/vmalloc.h> #include <linux/skbuff.h> #include <net/ip_vs.h> #include <linux/siphash.h> +#include <linux/bitops.h> +#include <linux/gcd.h> #define IP_VS_SVC_F_SCHED_MH_FALLBACK IP_VS_SVC_F_SCHED1 /* MH fallback */ #define IP_VS_SVC_F_SCHED_MH_PORT IP_VS_SVC_F_SCHED2 /* MH use port */ @@ -19,23 +20,36 @@ struct ip_vs_mh_lookup { struct ip_vs_dest __rcu *dest; /* real server (cache) */ }; -/* for IPVS MH entry hash table */ -#ifndef CONFIG_IP_VS_MH_TAB_BITS -#define CONFIG_IP_VS_MH_TAB_BITS 8 +struct ip_vs_mh_dest_setup { + int offset; /* starting offset */ + int skip; /* skip */ + int perm; /* next_offset */ + int turns; /* weight / gcd() */ +}; + +/* Available prime numbers for MH table */ +static int primes[] = {251, 509, 1021, 2039, 4093, + 8191, 16381, 32749, 65521, 131071}; + +/* For IPVS MH entry hash table */ +#ifndef CONFIG_IP_VS_MH_TAB_INDEX +#define CONFIG_IP_VS_MH_TAB_INDEX 12 #endif -#define IP_VS_MH_TAB_BITS CONFIG_IP_VS_MH_TAB_BITS -#define IP_VS_MH_TAB_SIZE BIT(IP_VS_MH_TAB_BITS) -#define IP_VS_MH_LOOKUP_SIZE 65537 /* Must be prime number */ +#define IP_VS_MH_TAB_BITS (CONFIG_IP_VS_MH_TAB_INDEX / 2) +#define IP_VS_MH_TAB_INDEX (CONFIG_IP_VS_MH_TAB_INDEX - 8) +#define IP_VS_MH_TAB_SIZE primes[IP_VS_MH_TAB_INDEX] struct ip_vs_mh_state { - struct rcu_head rcu_head; - struct ip_vs_mh_lookup lookup[IP_VS_MH_LOOKUP_SIZE]; - unsigned int **permutation; - hsiphash_key_t hash1, hash2; + struct rcu_head rcu_head; + struct ip_vs_mh_lookup *lookup; + struct ip_vs_mh_dest_setup *dest_setup; + hsiphash_key_t hash1, hash2; + int gcd; + int rshift; }; -static inline void -ip_vs_mh_generate_hash_secret(hsiphash_key_t *hash1, hsiphash_key_t *hash2) +static inline void generate_hash_secret(hsiphash_key_t *hash1, + hsiphash_key_t *hash2) { hash1->key[0] = 2654435761UL; hash1->key[1] = 2654435761UL; @@ -68,13 +82,13 @@ static inline bool is_unavailable(struct ip_vs_dest *dest) return hsiphash(&v, sizeof(v), key); } -static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s, - struct ip_vs_service *svc) +static int ip_vs_mh_permutate(struct ip_vs_mh_state *s, + struct ip_vs_service *svc) { - int i, j; struct list_head *p; + struct ip_vs_mh_dest_setup *ds; struct ip_vs_dest *dest; - unsigned int offset, skip; + int lw; bool empty; p = &svc->destinations; @@ -82,56 +96,50 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s, if (empty) return 0; - /* extending permutation table to 2d arrays */ - for (i = 1; i < svc->num_dests; i++) - s->permutation[i] = s->permutation[i - 1] + - IP_VS_MH_LOOKUP_SIZE; + /* If gcd is smaller then 1, all last_weights of dest are zero. + * so skip permutation for the dests. + */ + if (s->gcd < 1) + return 0; - i = 0; + /* Set dest_setup for the dests permutation */ + ds = &s->dest_setup[0]; while ((p = p->next) != &svc->destinations) { dest = list_entry(p, struct ip_vs_dest, n_list); - offset = ip_vs_mh_hashkey(svc->af, &dest->addr, dest->port, - &s->hash1, 0) % IP_VS_MH_LOOKUP_SIZE; - skip = ip_vs_mh_hashkey(svc->af, &dest->addr, - dest->port, &s->hash2, 0) % - (IP_VS_MH_LOOKUP_SIZE - 1) + 1; - - for (j = 0; j < IP_VS_MH_LOOKUP_SIZE; j++) { - s->permutation[i][j] = (offset + (j * skip)) % - IP_VS_MH_LOOKUP_SIZE; - } - i++; + + ds->offset = ip_vs_mh_hashkey(svc->af, &dest->addr, + dest->port, &s->hash1, 0) % + IP_VS_MH_TAB_SIZE; + ds->skip = ip_vs_mh_hashkey(svc->af, &dest->addr, + dest->port, &s->hash2, 0) % + (IP_VS_MH_TAB_SIZE - 1) + 1; + ds->perm = ds->offset % IP_VS_MH_TAB_SIZE; + + lw = atomic_read(&dest->last_weight); + ds->turns = (lw > 1) ? lw / s->gcd : lw; + ds++; } return 0; } -static inline int -ip_vs_mh_populate(struct ip_vs_mh_state *s, struct ip_vs_service *svc) +static int ip_vs_mh_populate(struct ip_vs_mh_state *s, + struct ip_vs_service *svc) { - int i, ret, dw_count; - unsigned int *next; - unsigned int **pmt; + int i, n; struct ip_vs_mh_lookup *entry, *l; struct list_head *p; struct ip_vs_dest *dest; - unsigned int n, c; + struct ip_vs_mh_dest_setup *ds; + int c, rs, dt_count; bool empty; - ret = 0; - next = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(unsigned int), - GFP_KERNEL); - if (!next) - return -ENOMEM; - - entry = kcalloc(IP_VS_MH_LOOKUP_SIZE, sizeof(*entry), + entry = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(*entry), GFP_KERNEL); - if (!entry) { - ret = -ENOMEM; - goto err_alloc; - } + if (!entry) + return -ENOMEM; - for (i = 0; i < IP_VS_MH_LOOKUP_SIZE; i++) + for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) RCU_INIT_POINTER(entry[i].dest, NULL); p = &svc->destinations; @@ -139,47 +147,61 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s, if (empty) goto out; + /* If gcd is smaller then 1, the weight is zero for all dests. + * so skip population for MH table and fills the table with NULLs. + */ + if (s->gcd < 1) + goto out; + n = 0; - dw_count = 0; - pmt = s->permutation; - while (n < IP_VS_MH_LOOKUP_SIZE) { + rs = s->rshift; + dt_count = 0; + while (n < IP_VS_MH_TAB_SIZE) { if (p == &svc->destinations) p = p->next; - i = 0; + ds = &s->dest_setup[0]; while (p != &svc->destinations) { - c = pmt[i][next[i]]; + /* ignore added server with zero weight */ + if (ds->turns < 1) { + p = p->next; + ds++; + continue; + } + c = ds->perm; while (entry[c].dest) { - next[i] = next[i] + 1; - c = pmt[i][next[i]]; + /* Add skip, mod IP_VS_MH_TAB_SIZE */ + ds->perm += ds->skip; + if (ds->perm >= IP_VS_MH_TAB_SIZE) + ds->perm -= IP_VS_MH_TAB_SIZE; + c = ds->perm; } dest = list_entry(p, struct ip_vs_dest, n_list); RCU_INIT_POINTER(entry[c].dest, dest); - next[i] = next[i] + 1; - n++; - if (n == IP_VS_MH_LOOKUP_SIZE) - break; + if (++n == IP_VS_MH_TAB_SIZE) + goto out; - if (++dw_count >= atomic_read(&dest->weight)) { + if (++dt_count >= (ds->turns >> rs)) { + dt_count = 0; p = p->next; - dw_count = 0; - i++; + ds++; } } } out: l = &s->lookup[0]; - for (i = 0; i < IP_VS_MH_LOOKUP_SIZE; i++) { - dest = rcu_dereference_protected(entry[i].dest, 1); + for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) { + dest = rcu_dereference_protected(l->dest, 1); if (dest) ip_vs_dest_put(dest); if (empty) { RCU_INIT_POINTER(l->dest, NULL); } else { + dest = rcu_dereference_protected(entry[i].dest, 1); ip_vs_dest_hold(dest); RCU_INIT_POINTER(l->dest, dest); @@ -193,9 +215,7 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s, } kfree(entry); -err_alloc: - kfree(next); - return ret; + return 0; } /* Get ip_vs_dest associated with supplied parameters. */ @@ -204,14 +224,13 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s, const union nf_inet_addr *addr, __be16 port) { unsigned int hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0) - % IP_VS_MH_LOOKUP_SIZE; + % IP_VS_MH_TAB_SIZE; struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest); return (!dest || is_unavailable(dest)) ? NULL : dest; } -/* As ip_vs_mh_get, but with fallback if selected server is unavailable - */ +/* As ip_vs_mh_get, but with fallback if selected server is unavailable */ static inline struct ip_vs_dest * ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s, const union nf_inet_addr *addr, __be16 port) @@ -222,7 +241,7 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s, /* first try the dest it's supposed to go to */ ihash = ip_vs_mh_hashkey(svc->af, addr, port, - &s->hash1, 0) % IP_VS_MH_LOOKUP_SIZE; + &s->hash1, 0) % IP_VS_MH_TAB_SIZE; dest = rcu_dereference(s->lookup[ihash].dest); if (!dest) return NULL; @@ -235,10 +254,10 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s, /* if the original dest is unavailable, loop around the table * starting from ihash to find a new dest */ - for (offset = 0; offset < IP_VS_MH_LOOKUP_SIZE; offset++) { - roffset = (offset + ihash) % IP_VS_MH_LOOKUP_SIZE; + for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) { + roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE; hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, - roffset) % IP_VS_MH_LOOKUP_SIZE; + roffset) % IP_VS_MH_TAB_SIZE; dest = rcu_dereference(s->lookup[hash].dest); if (!dest) break; @@ -261,7 +280,7 @@ static void ip_vs_mh_flush(struct ip_vs_mh_state *s) struct ip_vs_dest *dest; l = &s->lookup[0]; - for (i = 0; i < IP_VS_MH_LOOKUP_SIZE; i++) { + for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) { dest = rcu_dereference_protected(l->dest, 1); if (dest) { ip_vs_dest_put(dest); @@ -271,46 +290,85 @@ static void ip_vs_mh_flush(struct ip_vs_mh_state *s) } } -/* Assign all the hash buckets of the specified table with the service. - */ -static int -ip_vs_mh_reassign(struct ip_vs_mh_state *s, struct ip_vs_service *svc) +/* Assign all the hash buckets of the specified table with the service. */ +static int ip_vs_mh_reassign(struct ip_vs_mh_state *s, + struct ip_vs_service *svc) { int ret; - /* ip_vs_mh_reassign is responsible for assigning - * and releasing s->permutation table - */ - s->permutation = vzalloc(IP_VS_MH_TAB_SIZE * sizeof(unsigned int *)); - if (!s->permutation) - return -ENOMEM; + if (svc->num_dests > IP_VS_MH_TAB_SIZE) + return -EINVAL; - s->permutation[0] = vzalloc(IP_VS_MH_TAB_SIZE * IP_VS_MH_LOOKUP_SIZE * - sizeof(unsigned int)); - if (!s->permutation[0]) { - ret = -ENOMEM; - goto err_alloc; - } + s->dest_setup = kcalloc(IP_VS_MH_TAB_SIZE, + sizeof(struct ip_vs_mh_dest_setup), + GFP_KERNEL); + if (!s->dest_setup) + return -ENOMEM; - ret = ip_vs_mh_permutate(s, svc); - if (ret < 0) - goto err_out; + ip_vs_mh_permutate(s, svc); ret = ip_vs_mh_populate(s, svc); if (ret < 0) - goto err_out; + goto out; IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%d\n", IP_VS_DBG_ADDR(svc->af, &svc->addr), ntohs(svc->port)); -err_out: - vfree(s->permutation[0]); -err_alloc: - vfree(s->permutation); +out: + kfree(s->dest_setup); return ret; } +static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + int weight; + int g = 0; + + list_for_each_entry(dest, &svc->destinations, n_list) { + weight = atomic_read(&dest->last_weight); + if (weight > 0) { + if (g > 0) + g = gcd(weight, g); + else + g = weight; + } + } + return g; +} + +/* To avoid assigning huge weight for the MH table, + * calculate shift value with gcd. + */ +static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd) +{ + struct ip_vs_dest *dest; + int new_weight, weight = 0; + int mw, shift; + + /* If gcd is smaller then 1, the weight is zero for all dests. + * skip calculation rshift. + */ + if (gcd < 1) + return 0; + + list_for_each_entry(dest, &svc->destinations, n_list) { + new_weight = atomic_read(&dest->last_weight); + if (new_weight > weight) + weight = new_weight; + } + + /* Because gcd is greater than zero, + * the maximum weight and gcd are always greater than zero + */ + mw = weight / gcd; + + /* shift = occupied bits of weight/gcd - MH highest bits */ + shift = fls(mw) - IP_VS_MH_TAB_BITS; + return (shift >= 0) ? shift : 0; +} + static int ip_vs_mh_init_svc(struct ip_vs_service *svc) { struct ip_vs_mh_state *s; @@ -320,18 +378,24 @@ static int ip_vs_mh_init_svc(struct ip_vs_service *svc) if (!s) return -ENOMEM; - svc->sched_data = s; + s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup), + GFP_KERNEL); + if (!s->lookup) { + kfree_rcu(s, rcu_head); + return -ENOMEM; + } + + generate_hash_secret(&s->hash1, &s->hash2); + s->gcd = ip_vs_mh_gcd_weight(svc); + s->rshift = ip_vs_mh_shift_weight(svc, s->gcd); + svc->sched_data = s; IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) allocated for current service\n", - sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_LOOKUP_SIZE); - - ip_vs_mh_generate_hash_secret(&s->hash1, &s->hash2); + sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE); /* permutate & populate with current dests */ - ip_vs_mh_reassign(s, svc); - - return 0; + return ip_vs_mh_reassign(s, svc); } static void ip_vs_mh_done_svc(struct ip_vs_service *svc) @@ -344,7 +408,7 @@ static void ip_vs_mh_done_svc(struct ip_vs_service *svc) /* release the table itself */ kfree_rcu(s, rcu_head); IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n", - sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_LOOKUP_SIZE); + sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE); } static int ip_vs_mh_dest_changed(struct ip_vs_service *svc, @@ -352,10 +416,11 @@ static int ip_vs_mh_dest_changed(struct ip_vs_service *svc, { struct ip_vs_mh_state *s = svc->sched_data; - /* assign the hash buckets with the updated service */ - ip_vs_mh_reassign(s, svc); + s->gcd = ip_vs_mh_gcd_weight(svc); + s->rshift = ip_vs_mh_shift_weight(svc, s->gcd); - return 0; + /* assign the hash buckets with the updated service */ + return ip_vs_mh_reassign(s, svc); } /* Helper function to get port number */ -- 1.8.3.1 -- Inju Song NAVER Corporation -- To unsubscribe from this list: send the line "unsubscribe lvs-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html