[PATCH 2/3] IPVS: add wlib & wlip schedulers

Chris Caputo <ccaputo@xxxxxxx> · Tue, 20 Jan 2015 23:21:26 +0000 (UTC)

On Tue, 20 Jan 2015, Julian Anastasov wrote:
> > +                      (u64)dr * (u64)lwgt < (u64)lr * (u64)dwgt ||
[...]
> > +  	                   (dr == lr && dwgt > lwgt)) {
> 
> 	Above check is redundant.

I have applied your feedback to the patch below, except for this item.
When dr and lr are both zero (no traffic), both products are zero and the
"<" comparison is false, yet we still want to choose the higher weight,
so a separate comparison is needed.

Thanks,
Chris

From: Chris Caputo <ccaputo@xxxxxxx> 

Add the IPVS wlib (Weighted Least Incoming Byterate) and wlip (Weighted Least
Incoming Packetrate) schedulers, updated for 3.19-rc5.

Signed-off-by: Chris Caputo <ccaputo@xxxxxxx>
---
diff -uprN linux-3.19-rc5-stock/net/netfilter/ipvs/Kconfig linux-3.19-rc5/net/netfilter/ipvs/Kconfig

--- linux-3.19-rc5-stock/net/netfilter/ipvs/Kconfig	2015-01-18 06:02:20.000000000 +0000
+++ linux-3.19-rc5/net/netfilter/ipvs/Kconfig	2015-01-20 08:08:28.883080285 +0000
@@ -240,6 +240,26 @@ config	IP_VS_NQ
 	  If you want to compile it in kernel, say Y. To compile it as a
 	  module, choose M here. If unsure, say N.
 
+config	IP_VS_WLIB
+	tristate "weighted least incoming byterate scheduling"
+	---help---
+	  The weighted least incoming byterate scheduling algorithm directs
+	  network connections to the server with the least incoming byterate
+	  normalized by the server weight.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+config	IP_VS_WLIP
+	tristate "weighted least incoming packetrate scheduling"
+	---help---
+	  The weighted least incoming packetrate scheduling algorithm directs
+	  network connections to the server with the least incoming packetrate
+	  normalized by the server weight.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
 comment 'IPVS SH scheduler'
 
 config IP_VS_SH_TAB_BITS
diff -uprN linux-3.19-rc5-stock/net/netfilter/ipvs/Makefile linux-3.19-rc5/net/netfilter/ipvs/Makefile
--- linux-3.19-rc5-stock/net/netfilter/ipvs/Makefile	2015-01-18 06:02:20.000000000 +0000
+++ linux-3.19-rc5/net/netfilter/ipvs/Makefile	2015-01-20 08:08:28.883080285 +0000
@@ -33,6 +33,8 @@ obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
 obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
 obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
 obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
+obj-$(CONFIG_IP_VS_WLIB) += ip_vs_wlib.o
+obj-$(CONFIG_IP_VS_WLIP) += ip_vs_wlip.o
 
 # IPVS application helpers
 obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
diff -uprN linux-3.19-rc5-stock/net/netfilter/ipvs/ip_vs_wlib.c linux-3.19-rc5/net/netfilter/ipvs/ip_vs_wlib.c
--- linux-3.19-rc5-stock/net/netfilter/ipvs/ip_vs_wlib.c	1970-01-01 00:00:00.000000000 +0000
+++ linux-3.19-rc5/net/netfilter/ipvs/ip_vs_wlib.c	2015-01-20 08:09:00.177816054 +0000
@@ -0,0 +1,166 @@
+/* IPVS:        Weighted Least Incoming Byterate Scheduling module
+ *
+ * Authors:     Chris Caputo <ccaputo@xxxxxxx> based on code by:
+ *
+ *                  Wensong Zhang <wensong@xxxxxxxxxxxxxxxxxxxxxx>
+ *                  Peter Kese <peter.kese@xxxxxx>
+ *                  Julian Anastasov <ja@xxxxxx>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *     Chris Caputo: Based code on ip_vs_wlc.c and ip_vs_rr.c.
+ *
+ */
+
+/* The WLIB algorithm uses the results of the estimator's inbps
+ * calculations to determine which real server has the lowest incoming
+ * byterate.
+ *
+ * Real server weight is factored into the calculation.  An example way to
+ * use this is if you have one server that can handle 100 Mbps of input and
+ * another that can handle 1 Gbps you could set the weights to be 100 and 1000
+ * respectively.
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+static int
+ip_vs_wlib_init_svc(struct ip_vs_service *svc)
+{
+	svc->sched_data = &svc->destinations;	/* cursor at list head */
+	return 0;
+}
+
+static int
+ip_vs_wlib_del_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest)
+{
+	struct list_head *p;
+
+	spin_lock_bh(&svc->sched_lock);
+	p = (struct list_head *)svc->sched_data;
+	/* If the cursor is on dest, move it back one entry.  dest is already
+	 * unlinked, so p->prev is not valid; p->next is, so go through it.
+	 */
+	if (p == &dest->n_list)
+		svc->sched_data = p->next->prev;
+	spin_unlock_bh(&svc->sched_lock);
+	return 0;
+}
+
+/* Weighted Least Incoming Byterate scheduling: returns the dest, or NULL */
+static struct ip_vs_dest *
+ip_vs_wlib_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+		    struct ip_vs_iphdr *iph)
+{
+	struct list_head *p;
+	struct ip_vs_dest *dest, *last, *least = NULL;
+	int pass = 0;
+	u64 dr, lr = -1;	/* rate of current dest, of best so far */
+	u32 dwgt, lwgt = 0;	/* weight of current dest, of best so far */
+
+	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+	/* We calculate the load of each dest server as follows:
+	 *        (dest inbps rate) / dest->weight
+	 *
+	 * Comparing dr * lwgt < lr * dwgt is equivalent to comparing
+	 * dr / dwgt < lr / lwgt when every weight is larger than zero.
+	 *
+	 * A server with weight=0 is quiesced and will not receive any
+	 * new connections; overloaded servers are skipped as well.
+	 *
+	 * When both rates are zero the products tie at zero, so the higher
+	 * weight wins.  Any remaining tie goes to the first dest found after
+	 * svc->sched_data, which is set to the last pick: round robin.
+	 */
+
+	spin_lock_bh(&svc->sched_lock);
+	p = (struct list_head *)svc->sched_data;
+	last = dest = list_entry(p, struct ip_vs_dest, n_list);
+
+	do {
+		list_for_each_entry_continue_rcu(dest,
+						 &svc->destinations,
+						 n_list) {
+			dwgt = (u32)atomic_read(&dest->weight);
+			if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+			    dwgt > 0) {
+				spin_lock(&dest->stats.lock);
+				/* estimator's scaling doesn't matter */
+				dr = dest->stats.est.inbps;
+				spin_unlock(&dest->stats.lock);
+
+				if (!least ||
+				    dr * lwgt < lr * dwgt ||
+				    (!dr && !lr && dwgt > lwgt)) {
+					least = dest;
+					lr = dr;
+					lwgt = dwgt;
+				}
+			}
+
+			if (dest == last)
+				goto stop;
+		}
+		pass++;
+		/* Previous dest could be unlinked, do not loop forever.
+		 * If we stay at head there is no need for 2nd pass.
+		 */
+	} while (pass < 2 && p != &svc->destinations);
+
+stop:
+	if (least)
+		svc->sched_data = &least->n_list;
+
+	spin_unlock_bh(&svc->sched_lock);
+
+	if (least) {
+		IP_VS_DBG_BUF(6,
+			      "WLIB: server %s:%u activeconns %d refcnt %d weight %d\n",
+			      IP_VS_DBG_ADDR(least->af, &least->addr),
+			      ntohs(least->port),
+			      atomic_read(&least->activeconns),
+			      atomic_read(&least->refcnt),
+			      atomic_read(&least->weight));
+	} else {
+		ip_vs_scheduler_err(svc, "no destination available");
+	}
+
+	return least;
+}
+
+static struct ip_vs_scheduler ip_vs_wlib_scheduler = {
+	.name =			"wlib",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list =		LIST_HEAD_INIT(ip_vs_wlib_scheduler.n_list),
+	.init_service =		ip_vs_wlib_init_svc,	/* set sched_data */
+	.add_dest =		NULL,
+	.del_dest =		ip_vs_wlib_del_dest,	/* fix sched_data */
+	.schedule =		ip_vs_wlib_schedule,
+};
+
+static int __init ip_vs_wlib_init(void)
+{
+	return register_ip_vs_scheduler(&ip_vs_wlib_scheduler);
+}
+
+static void __exit ip_vs_wlib_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_wlib_scheduler);
+	synchronize_rcu();	/* wait for RCU readers before unload */
+}
+
+module_init(ip_vs_wlib_init);
+module_exit(ip_vs_wlib_cleanup);
+MODULE_LICENSE("GPL");
diff -uprN linux-3.19-rc5-stock/net/netfilter/ipvs/ip_vs_wlip.c linux-3.19-rc5/net/netfilter/ipvs/ip_vs_wlip.c
--- linux-3.19-rc5-stock/net/netfilter/ipvs/ip_vs_wlip.c	1970-01-01 00:00:00.000000000 +0000
+++ linux-3.19-rc5/net/netfilter/ipvs/ip_vs_wlip.c	2015-01-20 08:09:07.456126624 +0000
@@ -0,0 +1,166 @@
+/* IPVS:        Weighted Least Incoming Packetrate Scheduling module
+ *
+ * Authors:     Chris Caputo <ccaputo@xxxxxxx> based on code by:
+ *
+ *                  Wensong Zhang <wensong@xxxxxxxxxxxxxxxxxxxxxx>
+ *                  Peter Kese <peter.kese@xxxxxx>
+ *                  Julian Anastasov <ja@xxxxxx>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *     Chris Caputo: Based code on ip_vs_wlc.c and ip_vs_rr.c.
+ *
+ */
+
+/* The WLIP algorithm uses the results of the estimator's inpps
+ * calculations to determine which real server has the lowest incoming
+ * packetrate.
+ *
+ * Real server weight is factored into the calculation.  An example way to
+ * use this is if you have one server that can handle 10 Kpps of input and
+ * another that can handle 100 Kpps you could set the weights to be 10 and 100
+ * respectively.
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+static int
+ip_vs_wlip_init_svc(struct ip_vs_service *svc)
+{
+	svc->sched_data = &svc->destinations;	/* cursor at list head */
+	return 0;
+}
+
+static int
+ip_vs_wlip_del_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest)
+{
+	struct list_head *p;
+
+	spin_lock_bh(&svc->sched_lock);
+	p = (struct list_head *)svc->sched_data;
+	/* If the cursor is on dest, move it back one entry.  dest is already
+	 * unlinked, so p->prev is not valid; p->next is, so go through it.
+	 */
+	if (p == &dest->n_list)
+		svc->sched_data = p->next->prev;
+	spin_unlock_bh(&svc->sched_lock);
+	return 0;
+}
+
+/* Weighted Least Incoming Packetrate scheduling: returns the dest, or NULL */
+static struct ip_vs_dest *
+ip_vs_wlip_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+		    struct ip_vs_iphdr *iph)
+{
+	struct list_head *p;
+	struct ip_vs_dest *dest, *last, *least = NULL;
+	int pass = 0;
+	u32 dr, lr = -1;	/* rate of current dest, of best so far */
+	u32 dwgt, lwgt = 0;	/* weight of current dest, of best so far */
+
+	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+	/* We calculate the load of each dest server as follows:
+	 *        (dest inpps rate) / dest->weight
+	 *
+	 * Comparing dr * lwgt < lr * dwgt is equivalent to comparing
+	 * dr / dwgt < lr / lwgt when every weight is larger than zero.
+	 *
+	 * A server with weight=0 is quiesced and will not receive any
+	 * new connections; overloaded servers are skipped as well.
+	 *
+	 * When both rates are zero the products tie at zero, so the higher
+	 * weight wins.  Any remaining tie goes to the first dest found after
+	 * svc->sched_data, which is set to the last pick: round robin.
+	 */
+
+	spin_lock_bh(&svc->sched_lock);
+	p = (struct list_head *)svc->sched_data;
+	last = dest = list_entry(p, struct ip_vs_dest, n_list);
+
+	do {
+		list_for_each_entry_continue_rcu(dest,
+						 &svc->destinations,
+						 n_list) {
+			dwgt = (u32)atomic_read(&dest->weight);
+			if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+			    dwgt > 0) {
+				spin_lock(&dest->stats.lock);
+				/* estimator's scaling doesn't matter */
+				dr = dest->stats.est.inpps;
+				spin_unlock(&dest->stats.lock);
+
+				if (!least ||
+				    (u64)dr * lwgt < (u64)lr * dwgt ||
+				    (!dr && !lr && dwgt > lwgt)) {
+					least = dest;
+					lr = dr;
+					lwgt = dwgt;
+				}
+			}
+
+			if (dest == last)
+				goto stop;
+		}
+		pass++;
+		/* Previous dest could be unlinked, do not loop forever.
+		 * If we stay at head there is no need for 2nd pass.
+		 */
+	} while (pass < 2 && p != &svc->destinations);
+
+stop:
+	if (least)
+		svc->sched_data = &least->n_list;
+
+	spin_unlock_bh(&svc->sched_lock);
+
+	if (least) {
+		IP_VS_DBG_BUF(6,
+			      "WLIP: server %s:%u activeconns %d refcnt %d weight %d\n",
+			      IP_VS_DBG_ADDR(least->af, &least->addr),
+			      ntohs(least->port),
+			      atomic_read(&least->activeconns),
+			      atomic_read(&least->refcnt),
+			      atomic_read(&least->weight));
+	} else {
+		ip_vs_scheduler_err(svc, "no destination available");
+	}
+
+	return least;
+}
+
+static struct ip_vs_scheduler ip_vs_wlip_scheduler = {
+	.name =			"wlip",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list =		LIST_HEAD_INIT(ip_vs_wlip_scheduler.n_list),
+	.init_service =		ip_vs_wlip_init_svc,	/* set sched_data */
+	.add_dest =		NULL,
+	.del_dest =		ip_vs_wlip_del_dest,	/* fix sched_data */
+	.schedule =		ip_vs_wlip_schedule,
+};
+
+static int __init ip_vs_wlip_init(void)
+{
+	return register_ip_vs_scheduler(&ip_vs_wlip_scheduler);
+}
+
+static void __exit ip_vs_wlip_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_wlip_scheduler);
+	synchronize_rcu();	/* wait for RCU readers before unload */
+}
+
+module_init(ip_vs_wlip_init);
+module_exit(ip_vs_wlip_cleanup);
+MODULE_LICENSE("GPL");
--
To unsubscribe from this list: send the line "unsubscribe lvs-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html