[PATCHv2 ipvs-next 3/3] netfilter: ipvs: Refactor Maglev hashing scheduler

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



  1) Change table size to primes array

  2) Add to check dest number > IP_VS_MH_TAB_SIZE

  3) Replace IP_VS_MH_LOOKUP_SIZE with IP_VS_MH_TAB_SIZE

  4) Replace large permutation array with dest_setup

  5) Add gcd, rshift and calculate ds->turns

Signed-off-by: Inju Song <inju.song@xxxxxxxxxxxxx>
---
 net/netfilter/ipvs/ip_vs_mh.c | 291 ++++++++++++++++++++++++++----------------
 1 file changed, 178 insertions(+), 113 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c
index d1bf50a..c868778 100644
--- a/net/netfilter/ipvs/ip_vs_mh.c
+++ b/net/netfilter/ipvs/ip_vs_mh.c
@@ -5,12 +5,13 @@
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
-#include <linux/vmalloc.h>
 #include <linux/skbuff.h>
 
 #include <net/ip_vs.h>
 
 #include <linux/siphash.h>
+#include <linux/bitops.h>
+#include <linux/gcd.h>
 
 #define IP_VS_SVC_F_SCHED_MH_FALLBACK	IP_VS_SVC_F_SCHED1 /* MH fallback */
 #define IP_VS_SVC_F_SCHED_MH_PORT	IP_VS_SVC_F_SCHED2 /* MH use port */
@@ -19,23 +20,36 @@ struct ip_vs_mh_lookup {
 	struct ip_vs_dest __rcu	*dest;	/* real server (cache) */
 };
 
-/* for IPVS MH entry hash table */
-#ifndef CONFIG_IP_VS_MH_TAB_BITS
-#define CONFIG_IP_VS_MH_TAB_BITS        8
+struct ip_vs_mh_dest_setup {
+	int	offset; /* starting offset */
+	int	skip;	/* skip */
+	int	perm;	/* next_offset */
+	int	turns;	/* weight / gcd() */
+};
+
+/* Available prime numbers for MH table */
+static int primes[] = {251, 509, 1021, 2039, 4093,
+		       8191, 16381, 32749, 65521, 131071};
+
+/* For IPVS MH entry hash table */
+#ifndef CONFIG_IP_VS_MH_TAB_INDEX
+#define CONFIG_IP_VS_MH_TAB_INDEX	12
 #endif
-#define IP_VS_MH_TAB_BITS               CONFIG_IP_VS_MH_TAB_BITS
-#define IP_VS_MH_TAB_SIZE               BIT(IP_VS_MH_TAB_BITS)
-#define IP_VS_MH_LOOKUP_SIZE		65537 /* Must be prime number */
+#define IP_VS_MH_TAB_BITS		(CONFIG_IP_VS_MH_TAB_INDEX / 2)
+#define IP_VS_MH_TAB_INDEX		(CONFIG_IP_VS_MH_TAB_INDEX - 8)
+#define IP_VS_MH_TAB_SIZE               primes[IP_VS_MH_TAB_INDEX]
 
 struct ip_vs_mh_state {
-	struct rcu_head		rcu_head;
-	struct ip_vs_mh_lookup  lookup[IP_VS_MH_LOOKUP_SIZE];
-	unsigned int		**permutation;
-	hsiphash_key_t		hash1, hash2;
+	struct rcu_head			rcu_head;
+	struct ip_vs_mh_lookup		*lookup;
+	struct ip_vs_mh_dest_setup	*dest_setup;
+	hsiphash_key_t			hash1, hash2;
+	int				gcd;
+	int				rshift;
 };
 
-static inline void
-ip_vs_mh_generate_hash_secret(hsiphash_key_t *hash1, hsiphash_key_t *hash2)
+static inline void generate_hash_secret(hsiphash_key_t *hash1,
+					hsiphash_key_t *hash2)
 {
 	hash1->key[0] = 2654435761UL;
 	hash1->key[1] = 2654435761UL;
@@ -68,13 +82,13 @@ static inline bool is_unavailable(struct ip_vs_dest *dest)
 	return hsiphash(&v, sizeof(v), key);
 }
 
-static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
-				     struct ip_vs_service *svc)
+static int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
+			      struct ip_vs_service *svc)
 {
-	int i, j;
 	struct list_head *p;
+	struct ip_vs_mh_dest_setup *ds;
 	struct ip_vs_dest *dest;
-	unsigned int offset, skip;
+	int lw;
 	bool empty;
 
 	p = &svc->destinations;
@@ -82,56 +96,50 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
 	if (empty)
 		return 0;
 
-	/* extending permutation table to 2d arrays */
-	for (i = 1; i < svc->num_dests; i++)
-		s->permutation[i] = s->permutation[i - 1] +
-				    IP_VS_MH_LOOKUP_SIZE;
+	/* If gcd is smaller then 1, all last_weights of dest are zero.
+	 * so skip permutation for the dests.
+	 */
+	if (s->gcd < 1)
+		return 0;
 
-	i = 0;
+	/* Set dest_setup for the dests permutation */
+	ds = &s->dest_setup[0];
 	while ((p = p->next) != &svc->destinations) {
 		dest = list_entry(p, struct ip_vs_dest, n_list);
-		offset = ip_vs_mh_hashkey(svc->af, &dest->addr,	dest->port,
-					  &s->hash1, 0) % IP_VS_MH_LOOKUP_SIZE;
-		skip = ip_vs_mh_hashkey(svc->af, &dest->addr,
-					dest->port, &s->hash2, 0) %
-					(IP_VS_MH_LOOKUP_SIZE - 1) + 1;
-
-		for (j = 0; j < IP_VS_MH_LOOKUP_SIZE; j++) {
-			s->permutation[i][j] = (offset + (j * skip)) %
-						IP_VS_MH_LOOKUP_SIZE;
-		}
-		i++;
+
+		ds->offset = ip_vs_mh_hashkey(svc->af, &dest->addr,
+					      dest->port, &s->hash1, 0) %
+					      IP_VS_MH_TAB_SIZE;
+		ds->skip = ip_vs_mh_hashkey(svc->af, &dest->addr,
+					    dest->port, &s->hash2, 0) %
+					    (IP_VS_MH_TAB_SIZE - 1) + 1;
+		ds->perm = ds->offset % IP_VS_MH_TAB_SIZE;
+
+		lw = atomic_read(&dest->last_weight);
+		ds->turns = (lw > 1) ? lw / s->gcd : lw;
+		ds++;
 	}
 
 	return 0;
 }
 
-static inline int
-ip_vs_mh_populate(struct ip_vs_mh_state *s, struct ip_vs_service *svc)
+static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
+			     struct ip_vs_service *svc)
 {
-	int i, ret, dw_count;
-	unsigned int *next;
-	unsigned int **pmt;
+	int i, n;
 	struct ip_vs_mh_lookup *entry, *l;
 	struct list_head *p;
 	struct ip_vs_dest *dest;
-	unsigned int n, c;
+	struct ip_vs_mh_dest_setup *ds;
+	int c, rs, dt_count;
 	bool empty;
 
-	ret = 0;
-	next = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(unsigned int),
-		       GFP_KERNEL);
-	if (!next)
-		return -ENOMEM;
-
-	entry = kcalloc(IP_VS_MH_LOOKUP_SIZE, sizeof(*entry),
+	entry = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(*entry),
 			GFP_KERNEL);
-	if (!entry) {
-		ret = -ENOMEM;
-		goto err_alloc;
-	}
+	if (!entry)
+		return -ENOMEM;
 
-	for (i = 0; i < IP_VS_MH_LOOKUP_SIZE; i++)
+	for (i = 0; i < IP_VS_MH_TAB_SIZE; i++)
 		RCU_INIT_POINTER(entry[i].dest, NULL);
 
 	p = &svc->destinations;
@@ -139,47 +147,61 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
 	if (empty)
 		goto out;
 
+	/* If gcd is smaller then 1, the weight is zero for all dests.
+	 * so skip population for MH table and fills the table with NULLs.
+	 */
+	if (s->gcd < 1)
+		goto out;
+
 	n = 0;
-	dw_count = 0;
-	pmt = s->permutation;
-	while (n < IP_VS_MH_LOOKUP_SIZE) {
+	rs = s->rshift;
+	dt_count = 0;
+	while (n < IP_VS_MH_TAB_SIZE) {
 		if (p == &svc->destinations)
 			p = p->next;
 
-		i = 0;
+		ds = &s->dest_setup[0];
 		while (p != &svc->destinations) {
-			c = pmt[i][next[i]];
+			/* ignore added server with zero weight */
+			if (ds->turns < 1) {
+				p = p->next;
+				ds++;
+				continue;
+			}
 
+			c = ds->perm;
 			while (entry[c].dest) {
-				next[i] = next[i] + 1;
-				c = pmt[i][next[i]];
+				/* Add skip, mod IP_VS_MH_TAB_SIZE */
+				ds->perm += ds->skip;
+				if (ds->perm >= IP_VS_MH_TAB_SIZE)
+					ds->perm -= IP_VS_MH_TAB_SIZE;
+				c = ds->perm;
 			}
 
 			dest = list_entry(p, struct ip_vs_dest, n_list);
 			RCU_INIT_POINTER(entry[c].dest, dest);
 
-			next[i] = next[i] + 1;
-			n++;
-			if (n == IP_VS_MH_LOOKUP_SIZE)
-				break;
+			if (++n == IP_VS_MH_TAB_SIZE)
+				goto out;
 
-			if (++dw_count >= atomic_read(&dest->weight)) {
+			if (++dt_count >= (ds->turns >> rs)) {
+				dt_count = 0;
 				p = p->next;
-				dw_count = 0;
-				i++;
+				ds++;
 			}
 		}
 	}
 
 out:
 	l = &s->lookup[0];
-	for (i = 0; i < IP_VS_MH_LOOKUP_SIZE; i++) {
-		dest = rcu_dereference_protected(entry[i].dest, 1);
+	for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) {
+		dest = rcu_dereference_protected(l->dest, 1);
 		if (dest)
 			ip_vs_dest_put(dest);
 		if (empty) {
 			RCU_INIT_POINTER(l->dest, NULL);
 		} else {
+			dest = rcu_dereference_protected(entry[i].dest, 1);
 			ip_vs_dest_hold(dest);
 			RCU_INIT_POINTER(l->dest, dest);
 
@@ -193,9 +215,7 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
 	}
 
 	kfree(entry);
-err_alloc:
-	kfree(next);
-	return ret;
+	return 0;
 }
 
 /* Get ip_vs_dest associated with supplied parameters. */
@@ -204,14 +224,13 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
 	     const union nf_inet_addr *addr, __be16 port)
 {
 	unsigned int hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0)
-					     % IP_VS_MH_LOOKUP_SIZE;
+					     % IP_VS_MH_TAB_SIZE;
 	struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest);
 
 	return (!dest || is_unavailable(dest)) ? NULL : dest;
 }
 
-/* As ip_vs_mh_get, but with fallback if selected server is unavailable
- */
+/* As ip_vs_mh_get, but with fallback if selected server is unavailable */
 static inline struct ip_vs_dest *
 ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
 		      const union nf_inet_addr *addr, __be16 port)
@@ -222,7 +241,7 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
 
 	/* first try the dest it's supposed to go to */
 	ihash = ip_vs_mh_hashkey(svc->af, addr, port,
-				 &s->hash1, 0) % IP_VS_MH_LOOKUP_SIZE;
+				 &s->hash1, 0) % IP_VS_MH_TAB_SIZE;
 	dest = rcu_dereference(s->lookup[ihash].dest);
 	if (!dest)
 		return NULL;
@@ -235,10 +254,10 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
 	/* if the original dest is unavailable, loop around the table
 	 * starting from ihash to find a new dest
 	 */
-	for (offset = 0; offset < IP_VS_MH_LOOKUP_SIZE; offset++) {
-		roffset = (offset + ihash) % IP_VS_MH_LOOKUP_SIZE;
+	for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) {
+		roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE;
 		hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1,
-					roffset) % IP_VS_MH_LOOKUP_SIZE;
+					roffset) % IP_VS_MH_TAB_SIZE;
 		dest = rcu_dereference(s->lookup[hash].dest);
 		if (!dest)
 			break;
@@ -261,7 +280,7 @@ static void ip_vs_mh_flush(struct ip_vs_mh_state *s)
 	struct ip_vs_dest *dest;
 
 	l = &s->lookup[0];
-	for (i = 0; i < IP_VS_MH_LOOKUP_SIZE; i++) {
+	for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) {
 		dest = rcu_dereference_protected(l->dest, 1);
 		if (dest) {
 			ip_vs_dest_put(dest);
@@ -271,46 +290,85 @@ static void ip_vs_mh_flush(struct ip_vs_mh_state *s)
 	}
 }
 
-/* Assign all the hash buckets of the specified table with the service.
- */
-static int
-ip_vs_mh_reassign(struct ip_vs_mh_state *s, struct ip_vs_service *svc)
+/* Assign all the hash buckets of the specified table with the service. */
+static int ip_vs_mh_reassign(struct ip_vs_mh_state *s,
+			     struct ip_vs_service *svc)
 {
 	int ret;
 
-	/* ip_vs_mh_reassign is responsible for assigning
-	 * and releasing s->permutation table
-	 */
-	s->permutation = vzalloc(IP_VS_MH_TAB_SIZE * sizeof(unsigned int *));
-	if (!s->permutation)
-		return -ENOMEM;
+	if (svc->num_dests > IP_VS_MH_TAB_SIZE)
+		return -EINVAL;
 
-	s->permutation[0] = vzalloc(IP_VS_MH_TAB_SIZE * IP_VS_MH_LOOKUP_SIZE *
-				    sizeof(unsigned int));
-	if (!s->permutation[0]) {
-		ret = -ENOMEM;
-		goto err_alloc;
-	}
+	s->dest_setup = kcalloc(IP_VS_MH_TAB_SIZE,
+				sizeof(struct ip_vs_mh_dest_setup),
+				GFP_KERNEL);
+	if (!s->dest_setup)
+		return -ENOMEM;
 
-	ret = ip_vs_mh_permutate(s, svc);
-	if (ret < 0)
-		goto err_out;
+	ip_vs_mh_permutate(s, svc);
 
 	ret = ip_vs_mh_populate(s, svc);
 	if (ret < 0)
-		goto err_out;
+		goto out;
 
 	IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%d\n",
 		      IP_VS_DBG_ADDR(svc->af, &svc->addr),
 		      ntohs(svc->port));
 
-err_out:
-	vfree(s->permutation[0]);
-err_alloc:
-	vfree(s->permutation);
+out:
+	kfree(s->dest_setup);
 	return ret;
 }
 
+static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc)
+{
+	struct ip_vs_dest *dest;
+	int weight;
+	int g = 0;
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		weight = atomic_read(&dest->last_weight);
+		if (weight > 0) {
+			if (g > 0)
+				g = gcd(weight, g);
+			else
+				g = weight;
+		}
+	}
+	return g;
+}
+
+/* To avoid assigning huge weight for the MH table,
+ * calculate shift value with gcd.
+ */
+static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd)
+{
+	struct ip_vs_dest *dest;
+	int new_weight, weight = 0;
+	int mw, shift;
+
+	/* If gcd is smaller then 1, the weight is zero for all dests.
+	 * skip calculation rshift.
+	 */
+	if (gcd < 1)
+		return 0;
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		new_weight = atomic_read(&dest->last_weight);
+		if (new_weight > weight)
+			weight = new_weight;
+	}
+
+	/* Because gcd is greater than zero,
+	 * the maximum weight and gcd are always greater than zero
+	 */
+	mw = weight / gcd;
+
+	/* shift = occupied bits of weight/gcd - MH highest bits */
+	shift = fls(mw) - IP_VS_MH_TAB_BITS;
+	return (shift >= 0) ? shift : 0;
+}
+
 static int ip_vs_mh_init_svc(struct ip_vs_service *svc)
 {
 	struct ip_vs_mh_state *s;
@@ -320,18 +378,24 @@ static int ip_vs_mh_init_svc(struct ip_vs_service *svc)
 	if (!s)
 		return -ENOMEM;
 
-	svc->sched_data = s;
+	s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup),
+			    GFP_KERNEL);
+	if (!s->lookup) {
+		kfree_rcu(s, rcu_head);
+		return -ENOMEM;
+	}
+
+	generate_hash_secret(&s->hash1, &s->hash2);
+	s->gcd = ip_vs_mh_gcd_weight(svc);
+	s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
 
+	svc->sched_data = s;
 	IP_VS_DBG(6,
 		  "MH lookup table (memory=%zdbytes) allocated for current service\n",
-		  sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_LOOKUP_SIZE);
-
-	ip_vs_mh_generate_hash_secret(&s->hash1, &s->hash2);
+		  sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
 
 	/* permutate & populate with current dests */
-	ip_vs_mh_reassign(s, svc);
-
-	return 0;
+	return ip_vs_mh_reassign(s, svc);
 }
 
 static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
@@ -344,7 +408,7 @@ static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
 	/* release the table itself */
 	kfree_rcu(s, rcu_head);
 	IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n",
-		  sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_LOOKUP_SIZE);
+		  sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
 }
 
 static int ip_vs_mh_dest_changed(struct ip_vs_service *svc,
@@ -352,10 +416,11 @@ static int ip_vs_mh_dest_changed(struct ip_vs_service *svc,
 {
 	struct ip_vs_mh_state *s = svc->sched_data;
 
-	/* assign the hash buckets with the updated service */
-	ip_vs_mh_reassign(s, svc);
+	s->gcd = ip_vs_mh_gcd_weight(svc);
+	s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
 
-	return 0;
+	/* assign the hash buckets with the updated service */
+	return ip_vs_mh_reassign(s, svc);
 }
 
 /* Helper function to get port number */
-- 
1.8.3.1


-- 
Inju Song
NAVER Corporation

--
To unsubscribe from this list: send the line "unsubscribe lvs-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Linux Filesystem Devel]     [Linux NFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux SCSI]     [X.Org]

  Powered by Linux