        Hello,

On Sun, 25 Feb 2018, Inju Song wrote:

> 1) Change table size to primes array
>
> 2) Add to check dest number > IP_VS_MH_TAB_SIZE
>
> 3) Replace IP_VS_MH_LOOKUP_SIZE with IP_VS_MH_TAB_SIZE
>
> 4) Replace large permutation array with dest_setup
>
> 5) Add gcd, rshift and calculate ds->turns

        The Makefile and Kconfig changes should come as a last patch;
otherwise the build will fail while bisecting.

> Signed-off-by: Inju Song <inju.song@xxxxxxxxxxxxx>
> ---
>  net/netfilter/ipvs/ip_vs_mh.c | 291 ++++++++++++++++++++++++++----------------
>  1 file changed, 178 insertions(+), 113 deletions(-)
>
> diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c
> index d1bf50a..c868778 100644
> --- a/net/netfilter/ipvs/ip_vs_mh.c
> +++ b/net/netfilter/ipvs/ip_vs_mh.c

        There must be license info at the beginning of the file.

> @@ -5,12 +5,13 @@
>  #include <linux/slab.h>
>  #include <linux/module.h>
>  #include <linux/kernel.h>
> -#include <linux/vmalloc.h>
>  #include <linux/skbuff.h>
>
>  #include <net/ip_vs.h>
>
>  #include <linux/siphash.h>
> +#include <linux/bitops.h>
> +#include <linux/gcd.h>
>
>  #define IP_VS_SVC_F_SCHED_MH_FALLBACK  IP_VS_SVC_F_SCHED1 /* MH fallback */
>  #define IP_VS_SVC_F_SCHED_MH_PORT      IP_VS_SVC_F_SCHED2 /* MH use port */
> @@ -19,23 +20,36 @@ struct ip_vs_mh_lookup {
>         struct ip_vs_dest __rcu *dest;  /* real server (cache) */
>  };
>
> -/* for IPVS MH entry hash table */
> -#ifndef CONFIG_IP_VS_MH_TAB_BITS
> -#define CONFIG_IP_VS_MH_TAB_BITS       8
> +struct ip_vs_mh_dest_setup {
> +        int offset;     /* starting offset */
> +        int skip;       /* skip */
> +        int perm;       /* next_offset */
> +        int turns;      /* weight / gcd() */
> +};
> +
> +/* Available prime numbers for MH table */
> +static int primes[] = {251, 509, 1021, 2039, 4093,
> +                       8191, 16381, 32749, 65521, 131071};
> +
> +/* For IPVS MH entry hash table */
> +#ifndef CONFIG_IP_VS_MH_TAB_INDEX
> +#define CONFIG_IP_VS_MH_TAB_INDEX      12
>  #endif
> -#define IP_VS_MH_TAB_BITS              CONFIG_IP_VS_MH_TAB_BITS
> -#define IP_VS_MH_TAB_SIZE              BIT(IP_VS_MH_TAB_BITS)
> -#define IP_VS_MH_LOOKUP_SIZE           65537 /* Must be prime number */
> +#define IP_VS_MH_TAB_BITS              (CONFIG_IP_VS_MH_TAB_INDEX / 2)
> +#define IP_VS_MH_TAB_INDEX             (CONFIG_IP_VS_MH_TAB_INDEX - 8)
> +#define IP_VS_MH_TAB_SIZE              primes[IP_VS_MH_TAB_INDEX]
>
>  struct ip_vs_mh_state {
> -        struct rcu_head rcu_head;
> -        struct ip_vs_mh_lookup lookup[IP_VS_MH_LOOKUP_SIZE];
> -        unsigned int **permutation;
> -        hsiphash_key_t hash1, hash2;
> +        struct rcu_head rcu_head;
> +        struct ip_vs_mh_lookup *lookup;
> +        struct ip_vs_mh_dest_setup *dest_setup;
> +        hsiphash_key_t hash1, hash2;
> +        int gcd;
> +        int rshift;
>  };
>
> -static inline void
> -ip_vs_mh_generate_hash_secret(hsiphash_key_t *hash1, hsiphash_key_t *hash2)
> +static inline void generate_hash_secret(hsiphash_key_t *hash1,
> +                                        hsiphash_key_t *hash2)
>  {
>         hash1->key[0] = 2654435761UL;
>         hash1->key[1] = 2654435761UL;
> @@ -68,13 +82,13 @@ static inline bool is_unavailable(struct ip_vs_dest *dest)
>         return hsiphash(&v, sizeof(v), key);
>  }
>
> -static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
> -                                     struct ip_vs_service *svc)
> +static int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
> +                              struct ip_vs_service *svc)
>  {
> -        int i, j;
>         struct list_head *p;
> +        struct ip_vs_mh_dest_setup *ds;
>         struct ip_vs_dest *dest;
> -        unsigned int offset, skip;
> +        int lw;
>         bool empty;
>
>         p = &svc->destinations;
> @@ -82,56 +96,50 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
>         if (empty)
>                 return 0;

        The 'empty' flag and the list_empty() check above can be
removed; we can rely on the gcd < 1 check below.
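Just as illustration, an untested sketch of how the function could
then start (declarations taken from the patch above, with 'empty'
dropped and only the gcd test remaining):

        static int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
                                      struct ip_vs_service *svc)
        {
                struct list_head *p;
                struct ip_vs_mh_dest_setup *ds;
                struct ip_vs_dest *dest;
                int lw;

                /* If gcd is smaller than 1, all last_weights are zero,
                 * so there are no dests to permutate.
                 */
                if (s->gcd < 1)
                        return 0;

                p = &svc->destinations;
                ...
        }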
>
> -        /* extending permutation table to 2d arrays */
> -        for (i = 1; i < svc->num_dests; i++)
> -                s->permutation[i] = s->permutation[i - 1] +
> -                                    IP_VS_MH_LOOKUP_SIZE;
> +        /* If gcd is smaller then 1, all last_weights of dest are zero.
> +         * so skip permutation for the dests.
> +         */
> +        if (s->gcd < 1)
> +                return 0;
>
> -        i = 0;
> +        /* Set dest_setup for the dests permutation */
> +        ds = &s->dest_setup[0];
>         while ((p = p->next) != &svc->destinations) {
>                 dest = list_entry(p, struct ip_vs_dest, n_list);
> -                offset = ip_vs_mh_hashkey(svc->af, &dest->addr, dest->port,
> -                                          &s->hash1, 0) % IP_VS_MH_LOOKUP_SIZE;
> -                skip = ip_vs_mh_hashkey(svc->af, &dest->addr,
> -                                        dest->port, &s->hash2, 0) %
> -                                        (IP_VS_MH_LOOKUP_SIZE - 1) + 1;
> -
> -                for (j = 0; j < IP_VS_MH_LOOKUP_SIZE; j++) {
> -                        s->permutation[i][j] = (offset + (j * skip)) %
> -                                               IP_VS_MH_LOOKUP_SIZE;
> -                }
> -                i++;
> +
> +                ds->offset = ip_vs_mh_hashkey(svc->af, &dest->addr,
> +                                              dest->port, &s->hash1, 0) %
> +                                              IP_VS_MH_TAB_SIZE;
> +                ds->skip = ip_vs_mh_hashkey(svc->af, &dest->addr,
> +                                            dest->port, &s->hash2, 0) %
> +                                            (IP_VS_MH_TAB_SIZE - 1) + 1;
> +                ds->perm = ds->offset % IP_VS_MH_TAB_SIZE;

        offset is already < IP_VS_MH_TAB_SIZE, no need to mod.

> +
> +                lw = atomic_read(&dest->last_weight);
> +                ds->turns = (lw > 1) ? lw / s->gcd : lw;

        Maybe we can move the rshift into the calculation here:

        ds->turns = ((lw / s->gcd) >> s->rshift) ? : (lw != 0);

or optimize it in a different way, but we must be sure that rshift
does not clear turns when lw > 0.
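For example, with made-up weights and the default
CONFIG_IP_VS_MH_TAB_INDEX of 12 (so IP_VS_MH_TAB_BITS = 6):

        /* Two dests with last_weight 3 and 65536 => gcd = 1,
         * mw = 65536 / 1, fls(mw) = 17, rshift = 17 - 6 = 11.
         *
         * Plain shift:      (3 / 1) >> 11 == 0, dest wrongly skipped
         * With ?: fallback: ((3 / 1) >> 11) ? : (3 != 0) == 1, 1 turn
         */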
> +                ds++;
>         }
>
>         return 0;
>  }
>
> -static inline int
> -ip_vs_mh_populate(struct ip_vs_mh_state *s, struct ip_vs_service *svc)
> +static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
> +                             struct ip_vs_service *svc)
>  {
> -        int i, ret, dw_count;
> -        unsigned int *next;
> -        unsigned int **pmt;
> +        int i, n;
>         struct ip_vs_mh_lookup *entry, *l;
>         struct list_head *p;
>         struct ip_vs_dest *dest;
> -        unsigned int n, c;
> +        struct ip_vs_mh_dest_setup *ds;
> +        int c, rs, dt_count;
>         bool empty;
>
> -        ret = 0;
> -        next = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(unsigned int),
> -                       GFP_KERNEL);
> -        if (!next)
> -                return -ENOMEM;
> -
> -        entry = kcalloc(IP_VS_MH_LOOKUP_SIZE, sizeof(*entry),
> +        entry = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(*entry),
>                         GFP_KERNEL);

        We can avoid this allocation if s->gcd < 1 because entry is
unused in this case.

> -        if (!entry) {
> -                ret = -ENOMEM;
> -                goto err_alloc;
> -        }
> +        if (!entry)
> +                return -ENOMEM;
>
> -        for (i = 0; i < IP_VS_MH_LOOKUP_SIZE; i++)
> +        for (i = 0; i < IP_VS_MH_TAB_SIZE; i++)
>                 RCU_INIT_POINTER(entry[i].dest, NULL);
>
>         p = &svc->destinations;
> @@ -139,47 +147,61 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
>         if (empty)
>                 goto out;

        We can avoid the above list_empty and use
empty = (s->gcd < 1) instead. Or maybe the 'empty' var is not needed
at all if we use a bitmap and populate the dests one by one, see
below...

>
> +        /* If gcd is smaller then 1, the weight is zero for all dests.
> +         * so skip population for MH table and fills the table with NULLs.
> +         */
> +        if (s->gcd < 1)
> +                goto out;

        if (s->gcd < 1)
                goto reset;

> +
>         n = 0;
> -        dw_count = 0;
> -        pmt = s->permutation;
> -        while (n < IP_VS_MH_LOOKUP_SIZE) {
> +        rs = s->rshift;
> +        dt_count = 0;
> +        while (n < IP_VS_MH_TAB_SIZE) {
>                 if (p == &svc->destinations)
>                         p = p->next;
>
> -                i = 0;
> +                ds = &s->dest_setup[0];
>                 while (p != &svc->destinations) {
> -                        c = pmt[i][next[i]];
> +                        /* ignore added server with zero weight */
> +                        if (ds->turns < 1) {
> +                                p = p->next;
> +                                ds++;
> +                                continue;
> +                        }
>
> +                        c = ds->perm;
>                         while (entry[c].dest) {
> -                                next[i] = next[i] + 1;
> -                                c = pmt[i][next[i]];
> +                                /* Add skip, mod IP_VS_MH_TAB_SIZE */
> +                                ds->perm += ds->skip;
> +                                if (ds->perm >= IP_VS_MH_TAB_SIZE)
> +                                        ds->perm -= IP_VS_MH_TAB_SIZE;
> +                                c = ds->perm;
>                         }
>
>                         dest = list_entry(p, struct ip_vs_dest, n_list);
>                         RCU_INIT_POINTER(entry[c].dest, dest);

        entry is not visible to RCU readers, so a normal access is
enough. It can even be a bitmap, to reduce the memory usage 32 or 64
times, e.g. we can use 'table' instead of 'entry':

        unsigned long *table;

        table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE),
                        sizeof(unsigned long), GFP_KERNEL);

Then 'while (entry[c].dest)' becomes 'while (test_bit(c, table))'
and 'entry[c].dest = dest;' becomes '__set_bit(c, table);'. And we
can directly populate the entry:

        dest = rcu_dereference_protected(s->lookup[c].dest, 1);
        dest2 = list_entry(p, struct ip_vs_dest, n_list);
        if (dest != dest2) {
                if (dest)
                        ip_vs_dest_put(dest);
                ip_vs_dest_hold(dest2);
                RCU_INIT_POINTER(s->lookup[c].dest, dest2);
        }

>
> -                        next[i] = next[i] + 1;
> -                        n++;
> -                        if (n == IP_VS_MH_LOOKUP_SIZE)
> -                                break;
> +                        if (++n == IP_VS_MH_TAB_SIZE)
> +                                goto out;
>
> -                        if (++dw_count >= atomic_read(&dest->weight)) {
> +                        if (++dt_count >= (ds->turns >> rs)) {

        turns can be already rshift-ed in ip_vs_mh_permutate.

> +                                dt_count = 0;
>                                 p = p->next;
> -                                dw_count = 0;
> -                                i++;
> +                                ds++;
>                         }
>                 }
>         }

        Here we should jump to kfree(table):

        goto out;

>
> out:

        out: becomes reset:. This loop will remain for the
's->gcd < 1' case only:

>         l = &s->lookup[0];
> -        for (i = 0; i < IP_VS_MH_LOOKUP_SIZE; i++) {
> -                dest = rcu_dereference_protected(entry[i].dest, 1);
> +        for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) {
> +                dest = rcu_dereference_protected(l->dest, 1);
>                 if (dest)
>                         ip_vs_dest_put(dest);
>                 if (empty) {
>                         RCU_INIT_POINTER(l->dest, NULL);
>                 } else {

        This block will be removed:

> +                        dest = rcu_dereference_protected(entry[i].dest, 1);
>                         ip_vs_dest_hold(dest);
>                         RCU_INIT_POINTER(l->dest, dest);
>
> @@ -193,9 +215,7 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
>         }

        The out: label comes here, to kfree the bitmap 'table'.

>         kfree(entry);
> -err_alloc:
> -        kfree(next);
> -        return ret;
> +        return 0;
>  }
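Putting the above together, an untested sketch of the whole
ip_vs_mh_populate() with the bitmap and the direct lookup update
(it assumes ds->turns is already rshift-ed in ip_vs_mh_permutate,
as suggested above):

        static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
                                     struct ip_vs_service *svc)
        {
                int i, n, c, dt_count;
                unsigned long *table = NULL;
                struct list_head *p;
                struct ip_vs_mh_dest_setup *ds;
                struct ip_vs_mh_lookup *l;
                struct ip_vs_dest *dest, *dest2;

                /* All last_weights are zero: just clear the lookup table */
                if (s->gcd < 1)
                        goto reset;

                table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE),
                                sizeof(unsigned long), GFP_KERNEL);
                if (!table)
                        return -ENOMEM;

                p = &svc->destinations;
                n = 0;
                dt_count = 0;
                while (n < IP_VS_MH_TAB_SIZE) {
                        if (p == &svc->destinations)
                                p = p->next;

                        ds = &s->dest_setup[0];
                        while (p != &svc->destinations) {
                                /* ignore added server with zero weight */
                                if (ds->turns < 1) {
                                        p = p->next;
                                        ds++;
                                        continue;
                                }

                                c = ds->perm;
                                while (test_bit(c, table)) {
                                        /* Add skip, mod IP_VS_MH_TAB_SIZE */
                                        ds->perm += ds->skip;
                                        if (ds->perm >= IP_VS_MH_TAB_SIZE)
                                                ds->perm -= IP_VS_MH_TAB_SIZE;
                                        c = ds->perm;
                                }
                                __set_bit(c, table);

                                /* Populate the lookup entry directly */
                                dest = rcu_dereference_protected(s->lookup[c].dest, 1);
                                dest2 = list_entry(p, struct ip_vs_dest, n_list);
                                if (dest != dest2) {
                                        if (dest)
                                                ip_vs_dest_put(dest);
                                        ip_vs_dest_hold(dest2);
                                        RCU_INIT_POINTER(s->lookup[c].dest, dest2);
                                }

                                if (++n == IP_VS_MH_TAB_SIZE)
                                        goto out;

                                if (++dt_count >= ds->turns) {
                                        dt_count = 0;
                                        p = p->next;
                                        ds++;
                                }
                        }
                }
                goto out;       /* don't fall through to reset */

        reset:
                /* Reached only when s->gcd < 1: put and clear all entries */
                l = &s->lookup[0];
                for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) {
                        dest = rcu_dereference_protected(l->dest, 1);
                        if (dest) {
                                ip_vs_dest_put(dest);
                                RCU_INIT_POINTER(l->dest, NULL);
                        }
                        l++;
                }

        out:
                kfree(table);   /* kfree(NULL) is a no-op in the reset case */
                return 0;
        }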
>
>  /* Get ip_vs_dest associated with supplied parameters. */
> @@ -204,14 +224,13 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
>                          const union nf_inet_addr *addr, __be16 port)
>  {
>         unsigned int hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0)
> -                                            % IP_VS_MH_LOOKUP_SIZE;
> +                                            % IP_VS_MH_TAB_SIZE;
>         struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest);
>
>         return (!dest || is_unavailable(dest)) ?
>                 NULL : dest;
>  }
>
> -/* As ip_vs_mh_get, but with fallback if selected server is unavailable
> - */
> +/* As ip_vs_mh_get, but with fallback if selected server is unavailable */
>  static inline struct ip_vs_dest *
>  ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
>                       const union nf_inet_addr *addr, __be16 port)
> @@ -222,7 +241,7 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
>
>         /* first try the dest it's supposed to go to */
>         ihash = ip_vs_mh_hashkey(svc->af, addr, port,
> -                                &s->hash1, 0) % IP_VS_MH_LOOKUP_SIZE;
> +                                &s->hash1, 0) % IP_VS_MH_TAB_SIZE;
>         dest = rcu_dereference(s->lookup[ihash].dest);
>         if (!dest)
>                 return NULL;
> @@ -235,10 +254,10 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
>         /* if the original dest is unavailable, loop around the table
>          * starting from ihash to find a new dest
>          */
> -        for (offset = 0; offset < IP_VS_MH_LOOKUP_SIZE; offset++) {
> -                roffset = (offset + ihash) % IP_VS_MH_LOOKUP_SIZE;
> +        for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) {
> +                roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE;
>                 hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1,
> -                                        roffset) % IP_VS_MH_LOOKUP_SIZE;
> +                                        roffset) % IP_VS_MH_TAB_SIZE;
>                 dest = rcu_dereference(s->lookup[hash].dest);
>                 if (!dest)
>                         break;
> @@ -261,7 +280,7 @@ static void ip_vs_mh_flush(struct ip_vs_mh_state *s)
>         struct ip_vs_dest *dest;
>
>         l = &s->lookup[0];
> -        for (i = 0; i < IP_VS_MH_LOOKUP_SIZE; i++) {
> +        for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) {
>                 dest = rcu_dereference_protected(l->dest, 1);
>                 if (dest) {
>                         ip_vs_dest_put(dest);
> @@ -271,46 +290,85 @@ static void ip_vs_mh_flush(struct ip_vs_mh_state *s)
>         }
>  }
>
> -/* Assign all the hash buckets of the specified table with the service.
> - */
> -static int
> -ip_vs_mh_reassign(struct ip_vs_mh_state *s, struct ip_vs_service *svc)
> +/* Assign all the hash buckets of the specified table with the service. */
> +static int ip_vs_mh_reassign(struct ip_vs_mh_state *s,
> +                             struct ip_vs_service *svc)
>  {
>         int ret;
>
> -        /* ip_vs_mh_reassign is responsible for assigning
> -         * and releasing s->permutation table
> -         */
> -        s->permutation = vzalloc(IP_VS_MH_TAB_SIZE * sizeof(unsigned int *));
> -        if (!s->permutation)
> -                return -ENOMEM;
> +        if (svc->num_dests > IP_VS_MH_TAB_SIZE)
> +                return -EINVAL;
>
> -        s->permutation[0] = vzalloc(IP_VS_MH_TAB_SIZE * IP_VS_MH_LOOKUP_SIZE *
> -                                    sizeof(unsigned int));
> -        if (!s->permutation[0]) {
> -                ret = -ENOMEM;
> -                goto err_alloc;
> -        }
> +        s->dest_setup = kcalloc(IP_VS_MH_TAB_SIZE,
> +                                sizeof(struct ip_vs_mh_dest_setup),
> +                                GFP_KERNEL);

        For s->dest_setup we need to allocate svc->num_dests entries;
IP_VS_MH_TAB_SIZE is too large. But if num_dests == 0 we should avoid
the allocation, and after kfree(s->dest_setup) we should set
s->dest_setup = NULL to avoid a double free.
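I.e., something along these lines (untested sketch):

        if (svc->num_dests > IP_VS_MH_TAB_SIZE)
                return -EINVAL;

        if (svc->num_dests) {
                s->dest_setup = kcalloc(svc->num_dests,
                                        sizeof(struct ip_vs_mh_dest_setup),
                                        GFP_KERNEL);
                if (!s->dest_setup)
                        return -ENOMEM;
        }

        ip_vs_mh_permutate(s, svc);

        ret = ip_vs_mh_populate(s, svc);
        if (ret < 0)
                goto out;

        IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%d\n",
                      IP_VS_DBG_ADDR(svc->af, &svc->addr),
                      ntohs(svc->port));

out:
        kfree(s->dest_setup);   /* kfree(NULL) is safe when num_dests == 0 */
        s->dest_setup = NULL;
        return ret;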
> +        if (!s->dest_setup)
> +                return -ENOMEM;
>
> -        ret = ip_vs_mh_permutate(s, svc);
> -        if (ret < 0)
> -                goto err_out;
> +        ip_vs_mh_permutate(s, svc);
>
>         ret = ip_vs_mh_populate(s, svc);
>         if (ret < 0)
> -                goto err_out;
> +                goto out;
>
>         IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%d\n",
>                       IP_VS_DBG_ADDR(svc->af, &svc->addr),
>                       ntohs(svc->port));
>
> -err_out:
> -        vfree(s->permutation[0]);
> -err_alloc:
> -        vfree(s->permutation);
> +out:
> +        kfree(s->dest_setup);
>         return ret;
>  }
>
> +static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc)
> +{
> +        struct ip_vs_dest *dest;
> +        int weight;
> +        int g = 0;
> +
> +        list_for_each_entry(dest, &svc->destinations, n_list) {
> +                weight = atomic_read(&dest->last_weight);
> +                if (weight > 0) {
> +                        if (g > 0)
> +                                g = gcd(weight, g);
> +                        else
> +                                g = weight;
> +                }
> +        }
> +        return g;
> +}
> +
> +/* To avoid assigning huge weight for the MH table,
> + * calculate shift value with gcd.
> + */
> +static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd)
> +{
> +        struct ip_vs_dest *dest;
> +        int new_weight, weight = 0;
> +        int mw, shift;
> +
> +        /* If gcd is smaller then 1, the weight is zero for all dests.
> +         * skip calculation rshift.
> +         */
> +        if (gcd < 1)
> +                return 0;
> +
> +        list_for_each_entry(dest, &svc->destinations, n_list) {
> +                new_weight = atomic_read(&dest->last_weight);
> +                if (new_weight > weight)
> +                        weight = new_weight;
> +        }
> +
> +        /* Because gcd is greater than zero,
> +         * the maximum weight and gcd are always greater than zero
> +         */
> +        mw = weight / gcd;
> +
> +        /* shift = occupied bits of weight/gcd - MH highest bits */
> +        shift = fls(mw) - IP_VS_MH_TAB_BITS;
> +        return (shift >= 0) ? shift : 0;
> +}
> +
>  static int ip_vs_mh_init_svc(struct ip_vs_service *svc)
>  {
>         struct ip_vs_mh_state *s;
> @@ -320,18 +378,24 @@ static int ip_vs_mh_init_svc(struct ip_vs_service *svc)
>         if (!s)
>                 return -ENOMEM;
>
> -        svc->sched_data = s;
> +        s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup),
> +                            GFP_KERNEL);
> +        if (!s->lookup) {
> +                kfree_rcu(s, rcu_head);

        kfree(s) is ok here.

> +                return -ENOMEM;
> +        }
> +
> +        generate_hash_secret(&s->hash1, &s->hash2);
> +        s->gcd = ip_vs_mh_gcd_weight(svc);
> +        s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
>
> +        svc->sched_data = s;
>         IP_VS_DBG(6,
>                   "MH lookup table (memory=%zdbytes) allocated for current service\n",
> -                  sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_LOOKUP_SIZE);
> -
> -        ip_vs_mh_generate_hash_secret(&s->hash1, &s->hash2);
> +                  sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
>
>         /* permutate & populate with current dests */
> -        ip_vs_mh_reassign(s, svc);
> -
> -        return 0;
> +        return ip_vs_mh_reassign(s, svc);
>  }
>
>  static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
> @@ -344,7 +408,7 @@ static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
>         /* release the table itself */
>         kfree_rcu(s, rcu_head);

        s->lookup is not freed; we should free both s and s->lookup in
a new RCU callback. It will need a change in ip_vs_mh_cleanup:
synchronize_rcu() -> rcu_barrier(), so that module unload waits for
the callback to complete. See ip_vs_lblcr.c for how call_rcu() is
used to schedule the callback. We will need:

        static void ip_vs_mh_state_free(struct rcu_head *head)
        {
                struct ip_vs_mh_state *s;

                s = container_of(head, struct ip_vs_mh_state, rcu_head);
                kfree(s->lookup);
                kfree(s);
        }
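The callers would then look roughly like this (untested sketch; the
scheduler object name ip_vs_mh_scheduler is assumed, and the existing
ip_vs_mh_flush() call in done_svc stays as it is):

        static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
        {
                struct ip_vs_mh_state *s = svc->sched_data;

                /* got to clean up lookup entries here */
                ip_vs_mh_flush(s);

                /* release the table itself after a grace period */
                call_rcu(&s->rcu_head, ip_vs_mh_state_free);
        }

        static void __exit ip_vs_mh_cleanup(void)
        {
                unregister_ip_vs_scheduler(&ip_vs_mh_scheduler);
                rcu_barrier();  /* wait for ip_vs_mh_state_free() calls */
        }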
>         IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n",
> -                  sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_LOOKUP_SIZE);
> +                  sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
>  }
>
>  static int ip_vs_mh_dest_changed(struct ip_vs_service *svc,
> @@ -352,10 +416,11 @@ static int ip_vs_mh_dest_changed(struct ip_vs_service *svc,
>  {
>         struct ip_vs_mh_state *s = svc->sched_data;
>
> -        /* assign the hash buckets with the updated service */
> -        ip_vs_mh_reassign(s, svc);
> +        s->gcd = ip_vs_mh_gcd_weight(svc);
> +        s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
>
> -        return 0;
> +        /* assign the hash buckets with the updated service */
> +        return ip_vs_mh_reassign(s, svc);
>  }
>
>  /* Helper function to get port number */
> --
> 1.8.3.1

Regards

--
Julian Anastasov <ja@xxxxxx>