Hello, Thanks for your reviewing the patches. On Wed, Feb 28, 2018 at 01:34:26AM +0200, Julian Anastasov wrote: > > Hello, > > On Sun, 25 Feb 2018, Inju Song wrote: > > > 1) Change table size to primes array > > > > 2) Add to check dest number > IP_VS_MH_TAB_SIZE > > > > 3) Replace IP_VS_MH_LOOKUP_SIZE with IP_VS_MH_TAB_SIZE > > > > 4) Replace large permutation array with dest_setup > > > > 5) Add gcd, rshift and calculate ds->turns > > Makefile and Kconfig should be a last patch, otherwise > on bisecting we will fail to compile. > Ok. I will move Makefile and Kconfig patch to last of the patchsets in the next v3 patchsets. Then do I include a patch about last_weight again? > > Signed-off-by: Inju Song <inju.song@xxxxxxxxxxxxx> > > --- > > net/netfilter/ipvs/ip_vs_mh.c | 291 ++++++++++++++++++++++++++---------------- > > 1 file changed, 178 insertions(+), 113 deletions(-) > > > > diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c > > index d1bf50a..c868778 100644 > > --- a/net/netfilter/ipvs/ip_vs_mh.c > > +++ b/net/netfilter/ipvs/ip_vs_mh.c > > There must be license info at the beginning. > May be I will use GPLv1 license. Should I write the license info like other source(ie. the beginning of ip_vs_sh.c)? > > @@ -5,12 +5,13 @@ > > #include <linux/slab.h> > > #include <linux/module.h> > > #include <linux/kernel.h> > > -#include <linux/vmalloc.h> > > #include <linux/skbuff.h> > > > > #include <net/ip_vs.h> > > > > #include <linux/siphash.h> > > +#include <linux/bitops.h> > > +#include <linux/gcd.h> > > > > #define IP_VS_SVC_F_SCHED_MH_FALLBACK IP_VS_SVC_F_SCHED1 /* MH fallback */ > > #define IP_VS_SVC_F_SCHED_MH_PORT IP_VS_SVC_F_SCHED2 /* MH use port */ > > @@ -19,23 +20,36 @@ struct ip_vs_mh_lookup { > > struct ip_vs_dest __rcu *dest; /* real server (cache) */ > > }; > > > > -/* for IPVS MH entry hash table */ > > -#ifndef CONFIG_IP_VS_MH_TAB_BITS > > -#define CONFIG_IP_VS_MH_TAB_BITS 8 > > +struct ip_vs_mh_dest_setup { > > + int offset; /* starting offset */ > > + int skip; /* skip */ > > + int perm; /* next_offset */ > > + int turns; /* weight / gcd() */ > > +}; > > + > > +/* Available prime numbers for MH table */ > > +static int primes[] = {251, 509, 1021, 2039, 4093, > > + 8191, 16381, 32749, 65521, 131071}; > > + > > +/* For IPVS MH entry hash table */ > > +#ifndef CONFIG_IP_VS_MH_TAB_INDEX > > +#define CONFIG_IP_VS_MH_TAB_INDEX 12 > > #endif > > -#define IP_VS_MH_TAB_BITS CONFIG_IP_VS_MH_TAB_BITS > > -#define IP_VS_MH_TAB_SIZE BIT(IP_VS_MH_TAB_BITS) > > -#define IP_VS_MH_LOOKUP_SIZE 65537 /* Must be prime number */ > > +#define IP_VS_MH_TAB_BITS (CONFIG_IP_VS_MH_TAB_INDEX / 2) > > +#define IP_VS_MH_TAB_INDEX (CONFIG_IP_VS_MH_TAB_INDEX - 8) > > +#define IP_VS_MH_TAB_SIZE primes[IP_VS_MH_TAB_INDEX] > > > > struct ip_vs_mh_state { > > - struct rcu_head rcu_head; > > - struct ip_vs_mh_lookup lookup[IP_VS_MH_LOOKUP_SIZE]; > > - unsigned int **permutation; > > - hsiphash_key_t hash1, hash2; > > + struct rcu_head rcu_head; > > + struct ip_vs_mh_lookup *lookup; > > + struct ip_vs_mh_dest_setup *dest_setup; > > + hsiphash_key_t hash1, hash2; > > + int gcd; > > + int rshift; > > }; > > > > -static inline void > > -ip_vs_mh_generate_hash_secret(hsiphash_key_t *hash1, hsiphash_key_t *hash2) > > +static inline void generate_hash_secret(hsiphash_key_t *hash1, > > + hsiphash_key_t *hash2) > > { > > hash1->key[0] = 2654435761UL; > > hash1->key[1] = 2654435761UL; > > @@ -68,13 +82,13 @@ static inline bool is_unavailable(struct ip_vs_dest *dest) > > return hsiphash(&v, sizeof(v), key); > > } > > > > -static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s, > > - struct ip_vs_service *svc) > > +static int ip_vs_mh_permutate(struct ip_vs_mh_state *s, > > + struct ip_vs_service *svc) > > { > > - int i, j; > > struct list_head *p; > > + struct ip_vs_mh_dest_setup *ds; > > struct ip_vs_dest *dest; > > - unsigned int offset, skip; > > + int lw; > > bool empty; > > > > p = &svc->destinations; > > @@ -82,56 +96,50 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s, > > if (empty) > > return 0; > > Above empty and list_empty can be removed, we can > rely on the gcd < 1 check. > Ok. I think gcd < 1 check is enough for the destinations empty case too. > > > > - /* extending permutation table to 2d arrays */ > > - for (i = 1; i < svc->num_dests; i++) > > - s->permutation[i] = s->permutation[i - 1] + > > - IP_VS_MH_LOOKUP_SIZE; > > + /* If gcd is smaller then 1, all last_weights of dest are zero. > > + * so skip permutation for the dests. > > + */ > > + if (s->gcd < 1) > > + return 0; > > > > - i = 0; > > + /* Set dest_setup for the dests permutation */ > > + ds = &s->dest_setup[0]; > > while ((p = p->next) != &svc->destinations) { > > dest = list_entry(p, struct ip_vs_dest, n_list); > > - offset = ip_vs_mh_hashkey(svc->af, &dest->addr, dest->port, > > - &s->hash1, 0) % IP_VS_MH_LOOKUP_SIZE; > > - skip = ip_vs_mh_hashkey(svc->af, &dest->addr, > > - dest->port, &s->hash2, 0) % > > - (IP_VS_MH_LOOKUP_SIZE - 1) + 1; > > - > > - for (j = 0; j < IP_VS_MH_LOOKUP_SIZE; j++) { > > - s->permutation[i][j] = (offset + (j * skip)) % > > - IP_VS_MH_LOOKUP_SIZE; > > - } > > - i++; > > + > > + ds->offset = ip_vs_mh_hashkey(svc->af, &dest->addr, > > + dest->port, &s->hash1, 0) % > > + IP_VS_MH_TAB_SIZE; > > + ds->skip = ip_vs_mh_hashkey(svc->af, &dest->addr, > > + dest->port, &s->hash2, 0) % > > + (IP_VS_MH_TAB_SIZE - 1) + 1; > > + ds->perm = ds->offset % IP_VS_MH_TAB_SIZE; > > offset is already < IP_VS_MH_TAB_SIZE, no need to mod. > It will be removed. > > + > > + lw = atomic_read(&dest->last_weight); > > + ds->turns = (lw > 1) ? lw / s->gcd : lw; > > May be we can move the rshift to the calcs here? > > ds->turns = ((lw / s->gcd) >> s->rshift) ? : (lw != 0); > > or optimized in different way but we must be sure > that rshift will not clear turns when lw > 0. > I think it is very good optimized way. I will move rshift to turns calcs using this way. > > + ds++; > > } > > > > return 0; > > } > > > > -static inline int > > -ip_vs_mh_populate(struct ip_vs_mh_state *s, struct ip_vs_service *svc) > > +static int ip_vs_mh_populate(struct ip_vs_mh_state *s, > > + struct ip_vs_service *svc) > > { > > - int i, ret, dw_count; > > - unsigned int *next; > > - unsigned int **pmt; > > + int i, n; > > struct ip_vs_mh_lookup *entry, *l; > > struct list_head *p; > > struct ip_vs_dest *dest; > > - unsigned int n, c; > > + struct ip_vs_mh_dest_setup *ds; > > + int c, rs, dt_count; > > bool empty; > > > > - ret = 0; > > - next = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(unsigned int), > > - GFP_KERNEL); > > - if (!next) > > - return -ENOMEM; > > - > > - entry = kcalloc(IP_VS_MH_LOOKUP_SIZE, sizeof(*entry), > > + entry = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(*entry), > > GFP_KERNEL); > > We can avoid this allocation if s->gcd < 1 because > entry is unused in this case. > > > - if (!entry) { > > - ret = -ENOMEM; > > - goto err_alloc; > > - } > > + if (!entry) > > + return -ENOMEM; > > > > - for (i = 0; i < IP_VS_MH_LOOKUP_SIZE; i++) > > + for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) > > RCU_INIT_POINTER(entry[i].dest, NULL); > > > > p = &svc->destinations; > > @@ -139,47 +147,61 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s, > > if (empty) > > goto out; > > We can avoid above list_empty and to use > empty = (s->gcd < 1) instead. Or may be 'empty' var is not > need at all if we use bitmap and populate dests one by one, > see below... > > > > > + /* If gcd is smaller then 1, the weight is zero for all dests. > > + * so skip population for MH table and fills the table with NULLs. > > + */ > > + if (s->gcd < 1) > > + goto out; > > if (s->gcd < 1) > goto reset; > Yes, and reset label will be added for this case. > > + > > n = 0; > > - dw_count = 0; > > - pmt = s->permutation; > > - while (n < IP_VS_MH_LOOKUP_SIZE) { > > + rs = s->rshift; > > + dt_count = 0; > > + while (n < IP_VS_MH_TAB_SIZE) { > > if (p == &svc->destinations) > > p = p->next; > > > > - i = 0; > > + ds = &s->dest_setup[0]; > > while (p != &svc->destinations) { > > - c = pmt[i][next[i]]; > > + /* ignore added server with zero weight */ > > + if (ds->turns < 1) { > > + p = p->next; > > + ds++; > > + continue; > > + } > > > > + c = ds->perm; > > while (entry[c].dest) { > > - next[i] = next[i] + 1; > > - c = pmt[i][next[i]]; > > + /* Add skip, mod IP_VS_MH_TAB_SIZE */ > > + ds->perm += ds->skip; > > + if (ds->perm >= IP_VS_MH_TAB_SIZE) > > + ds->perm -= IP_VS_MH_TAB_SIZE; > > + c = ds->perm; > > } > > > > dest = list_entry(p, struct ip_vs_dest, n_list); > > RCU_INIT_POINTER(entry[c].dest, dest); > > entry is not visible to RCU readers, so normal access > is enough. It can be even a bitmap to reduce the memory usage > 32 or 64 times, eg. we can use 'table' instead of 'entry': > > unsigned long *table; > table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE), > sizeof(unsigned long)); > > then 'while (entry[c].dest)' becomes 'while (test_bit(c, table))', > 'entry[c].dest = dest;' becomes '__set_bit(c, table);' > > And we can directly populate the entry: > > dest = rcu_dereference_protected(s->lookup[c].dest, 1); > dest2 = list_entry(p, struct ip_vs_dest, n_list); > if (dest != dest2) { > if (dest) > ip_vs_dest_put(dest); > ip_vs_dest_hold(dest2); > RCU_INIT_POINTER(s->lookup[c].dest, dest2); > } > Ok. Replacing arrays with bitmap will be useful to reduce memory usuage so It will be replaced in the next patches. > > > > - next[i] = next[i] + 1; > > - n++; > > - if (n == IP_VS_MH_LOOKUP_SIZE) > > - break; > > + if (++n == IP_VS_MH_TAB_SIZE) > > + goto out; > > > > - if (++dw_count >= atomic_read(&dest->weight)) { > > + if (++dt_count >= (ds->turns >> rs)) { > > turns can be already rshift-ed in ip_vs_mh_permutate. > > > + dt_count = 0; > > p = p->next; > > - dw_count = 0; > > - i++; > > + ds++; > > } > > } > > } > > Here we should jump to kfree(table): > > goto out; > > > > > out: > > out: becomes reset: > > This loop will remain for the 's->gcd < 1' case only: > > > l = &s->lookup[0]; > > - for (i = 0; i < IP_VS_MH_LOOKUP_SIZE; i++) { > > - dest = rcu_dereference_protected(entry[i].dest, 1); > > + for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) { > > + dest = rcu_dereference_protected(l->dest, 1); > > if (dest) > > ip_vs_dest_put(dest); > > if (empty) { > > RCU_INIT_POINTER(l->dest, NULL); > > } else { > > This block will be removed: > > > + dest = rcu_dereference_protected(entry[i].dest, 1); > > ip_vs_dest_hold(dest); > > RCU_INIT_POINTER(l->dest, dest); > > > > @@ -193,9 +215,7 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s, > > } > > > > out: label comes here to kfree bitmap 'table'. > > > kfree(entry); > > -err_alloc: > > - kfree(next); > > - return ret; > > + return 0; > > } > > > > /* Get ip_vs_dest associated with supplied parameters. */ > > @@ -204,14 +224,13 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s, > > const union nf_inet_addr *addr, __be16 port) > > { > > unsigned int hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0) > > - % IP_VS_MH_LOOKUP_SIZE; > > + % IP_VS_MH_TAB_SIZE; > > struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest); > > > > return (!dest || is_unavailable(dest)) ? NULL : dest; > > } > > > > -/* As ip_vs_mh_get, but with fallback if selected server is unavailable > > - */ > > +/* As ip_vs_mh_get, but with fallback if selected server is unavailable */ > > static inline struct ip_vs_dest * > > ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s, > > const union nf_inet_addr *addr, __be16 port) > > @@ -222,7 +241,7 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s, > > > > /* first try the dest it's supposed to go to */ > > ihash = ip_vs_mh_hashkey(svc->af, addr, port, > > - &s->hash1, 0) % IP_VS_MH_LOOKUP_SIZE; > > + &s->hash1, 0) % IP_VS_MH_TAB_SIZE; > > dest = rcu_dereference(s->lookup[ihash].dest); > > if (!dest) > > return NULL; > > @@ -235,10 +254,10 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s, > > /* if the original dest is unavailable, loop around the table > > * starting from ihash to find a new dest > > */ > > - for (offset = 0; offset < IP_VS_MH_LOOKUP_SIZE; offset++) { > > - roffset = (offset + ihash) % IP_VS_MH_LOOKUP_SIZE; > > + for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) { > > + roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE; > > hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, > > - roffset) % IP_VS_MH_LOOKUP_SIZE; > > + roffset) % IP_VS_MH_TAB_SIZE; > > dest = rcu_dereference(s->lookup[hash].dest); > > if (!dest) > > break; > > @@ -261,7 +280,7 @@ static void ip_vs_mh_flush(struct ip_vs_mh_state *s) > > struct ip_vs_dest *dest; > > > > l = &s->lookup[0]; > > - for (i = 0; i < IP_VS_MH_LOOKUP_SIZE; i++) { > > + for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) { > > dest = rcu_dereference_protected(l->dest, 1); > > if (dest) { > > ip_vs_dest_put(dest); > > @@ -271,46 +290,85 @@ static void ip_vs_mh_flush(struct ip_vs_mh_state *s) > > } > > } > > > > -/* Assign all the hash buckets of the specified table with the service. > > - */ > > -static int > > -ip_vs_mh_reassign(struct ip_vs_mh_state *s, struct ip_vs_service *svc) > > +/* Assign all the hash buckets of the specified table with the service. */ > > +static int ip_vs_mh_reassign(struct ip_vs_mh_state *s, > > + struct ip_vs_service *svc) > > { > > int ret; > > > > - /* ip_vs_mh_reassign is responsible for assigning > > - * and releasing s->permutation table > > - */ > > - s->permutation = vzalloc(IP_VS_MH_TAB_SIZE * sizeof(unsigned int *)); > > - if (!s->permutation) > > - return -ENOMEM; > > + if (svc->num_dests > IP_VS_MH_TAB_SIZE) > > + return -EINVAL; > > > > - s->permutation[0] = vzalloc(IP_VS_MH_TAB_SIZE * IP_VS_MH_LOOKUP_SIZE * > > - sizeof(unsigned int)); > > - if (!s->permutation[0]) { > > - ret = -ENOMEM; > > - goto err_alloc; > > - } > > + s->dest_setup = kcalloc(IP_VS_MH_TAB_SIZE, > > + sizeof(struct ip_vs_mh_dest_setup), > > + GFP_KERNEL); > > For s->dest_setup we need to allocate svc->num_dests entries, > IP_VS_MH_TAB_SIZE is too large. But if num_dests=0 we should avoid > the allocation, then after kfree(s->dest_setup) we should use > s->dest_setup = NULL to avoid double free. > Yep, It will be fixed. > > + if (!s->dest_setup) > > + return -ENOMEM; > > > > - ret = ip_vs_mh_permutate(s, svc); > > - if (ret < 0) > > - goto err_out; > > + ip_vs_mh_permutate(s, svc); > > > > ret = ip_vs_mh_populate(s, svc); > > if (ret < 0) > > - goto err_out; > > + goto out; > > > > IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%d\n", > > IP_VS_DBG_ADDR(svc->af, &svc->addr), > > ntohs(svc->port)); > > > > -err_out: > > - vfree(s->permutation[0]); > > -err_alloc: > > - vfree(s->permutation); > > +out: > > + kfree(s->dest_setup); > > return ret; > > } > > > > +static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc) > > +{ > > + struct ip_vs_dest *dest; > > + int weight; > > + int g = 0; > > + > > + list_for_each_entry(dest, &svc->destinations, n_list) { > > + weight = atomic_read(&dest->last_weight); > > + if (weight > 0) { > > + if (g > 0) > > + g = gcd(weight, g); > > + else > > + g = weight; > > + } > > + } > > + return g; > > +} > > + > > +/* To avoid assigning huge weight for the MH table, > > + * calculate shift value with gcd. > > + */ > > +static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd) > > +{ > > + struct ip_vs_dest *dest; > > + int new_weight, weight = 0; > > + int mw, shift; > > + > > + /* If gcd is smaller then 1, the weight is zero for all dests. > > + * skip calculation rshift. > > + */ > > + if (gcd < 1) > > + return 0; > > + > > + list_for_each_entry(dest, &svc->destinations, n_list) { > > + new_weight = atomic_read(&dest->last_weight); > > + if (new_weight > weight) > > + weight = new_weight; > > + } > > + > > + /* Because gcd is greater than zero, > > + * the maximum weight and gcd are always greater than zero > > + */ > > + mw = weight / gcd; > > + > > + /* shift = occupied bits of weight/gcd - MH highest bits */ > > + shift = fls(mw) - IP_VS_MH_TAB_BITS; > > + return (shift >= 0) ? shift : 0; > > +} > > + > > static int ip_vs_mh_init_svc(struct ip_vs_service *svc) > > { > > struct ip_vs_mh_state *s; > > @@ -320,18 +378,24 @@ static int ip_vs_mh_init_svc(struct ip_vs_service *svc) > > if (!s) > > return -ENOMEM; > > > > - svc->sched_data = s; > > + s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup), > > + GFP_KERNEL); > > + if (!s->lookup) { > > + kfree_rcu(s, rcu_head); > > kfree(s) is ok here. > > > + return -ENOMEM; > > + } > > + > > + generate_hash_secret(&s->hash1, &s->hash2); > > + s->gcd = ip_vs_mh_gcd_weight(svc); > > + s->rshift = ip_vs_mh_shift_weight(svc, s->gcd); > > > > + svc->sched_data = s; > > IP_VS_DBG(6, > > "MH lookup table (memory=%zdbytes) allocated for current service\n", > > - sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_LOOKUP_SIZE); > > - > > - ip_vs_mh_generate_hash_secret(&s->hash1, &s->hash2); > > + sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE); > > > > /* permutate & populate with current dests */ > > - ip_vs_mh_reassign(s, svc); > > - > > - return 0; > > + return ip_vs_mh_reassign(s, svc); > > } > > > > static void ip_vs_mh_done_svc(struct ip_vs_service *svc) > > @@ -344,7 +408,7 @@ static void ip_vs_mh_done_svc(struct ip_vs_service *svc) > > /* release the table itself */ > > kfree_rcu(s, rcu_head); > > s->lookup is not freed, we should free both > s and s->lookup in new RCU callback. It will need a change in > ip_vs_mh_cleanup: synchronize_rcu() -> rcu_barrier(), so that > module unload to wait for the callback to complete. > See ip_vs_lblcr.c how call_rcu() is called to schedule > callback. We will need: > > static void ip_vs_mh_state_free(struct rcu_head *head) > { > struct ip_vs_mh_state *s; > > s = container_of(head, struct ip_vs_mh_state, rcu_head); > kfree(s->lookup); > kfree(s); > } > I missed freeing s->lookup. Thanks for letting me know. I will apply these reviews. > > IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n", > > - sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_LOOKUP_SIZE); > > + sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE); > > } > > > > static int ip_vs_mh_dest_changed(struct ip_vs_service *svc, > > @@ -352,10 +416,11 @@ static int ip_vs_mh_dest_changed(struct ip_vs_service *svc, > > { > > struct ip_vs_mh_state *s = svc->sched_data; > > > > - /* assign the hash buckets with the updated service */ > > - ip_vs_mh_reassign(s, svc); > > + s->gcd = ip_vs_mh_gcd_weight(svc); > > + s->rshift = ip_vs_mh_shift_weight(svc, s->gcd); > > > > - return 0; > > + /* assign the hash buckets with the updated service */ > > + return ip_vs_mh_reassign(s, svc); > > } > > > > /* Helper function to get port number */ > > -- > > 1.8.3.1 > > Regards > > -- > Julian Anastasov <ja@xxxxxx> > > Regards -- Inju Song NAVER Corporation -- To unsubscribe from this list: send the line "unsubscribe lvs-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html