On Tue, 20 Jan 2015, Julian Anastasov wrote:

> > +		    (u64)dr * (u64)lwgt < (u64)lr * (u64)dwgt ||
[...]
> > +		    (dr == lr && dwgt > lwgt)) {
>
> 	Above check is redundant.

I accepted your feedback and applied it below, except for this item.  I
believe that if dr and lr are both zero (no traffic), we still want to
choose the server with the higher weight, so a separate comparison is
needed.  With dr == lr == 0, "dr * lwgt < lr * dwgt" is false for any
weights, so without the extra check a higher-weight server would never be
preferred over a lower-weight one.  (A small userspace sketch of the
comparison logic follows the patch.)

Thanks,
Chris

From: Chris Caputo <ccaputo@xxxxxxx>

IPVS wlib (Weighted Least Incoming Byterate) and wlip (Weighted Least
Incoming Packetrate) schedulers, updated for 3.19-rc5.

Signed-off-by: Chris Caputo <ccaputo@xxxxxxx>
---
diff -uprN linux-3.19-rc5-stock/net/netfilter/ipvs/Kconfig linux-3.19-rc5/net/netfilter/ipvs/Kconfig
--- linux-3.19-rc5-stock/net/netfilter/ipvs/Kconfig	2015-01-18 06:02:20.000000000 +0000
+++ linux-3.19-rc5/net/netfilter/ipvs/Kconfig	2015-01-20 08:08:28.883080285 +0000
@@ -240,6 +240,26 @@ config IP_VS_NQ
 	  If you want to compile it in kernel, say Y. To compile it as a
 	  module, choose M here. If unsure, say N.
 
+config	IP_VS_WLIB
+	tristate "weighted least incoming byterate scheduling"
+	---help---
+	  The weighted least incoming byterate scheduling algorithm directs
+	  network connections to the server with the least incoming byterate
+	  normalized by the server weight.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+config	IP_VS_WLIP
+	tristate "weighted least incoming packetrate scheduling"
+	---help---
+	  The weighted least incoming packetrate scheduling algorithm directs
+	  network connections to the server with the least incoming packetrate
+	  normalized by the server weight.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
 comment 'IPVS SH scheduler'
 
 config IP_VS_SH_TAB_BITS
diff -uprN linux-3.19-rc5-stock/net/netfilter/ipvs/Makefile linux-3.19-rc5/net/netfilter/ipvs/Makefile
--- linux-3.19-rc5-stock/net/netfilter/ipvs/Makefile	2015-01-18 06:02:20.000000000 +0000
+++ linux-3.19-rc5/net/netfilter/ipvs/Makefile	2015-01-20 08:08:28.883080285 +0000
@@ -33,6 +33,8 @@ obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
 obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
 obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
 obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
+obj-$(CONFIG_IP_VS_WLIB) += ip_vs_wlib.o
+obj-$(CONFIG_IP_VS_WLIP) += ip_vs_wlip.o
 
 # IPVS application helpers
 obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
diff -uprN linux-3.19-rc5-stock/net/netfilter/ipvs/ip_vs_wlib.c linux-3.19-rc5/net/netfilter/ipvs/ip_vs_wlib.c
--- linux-3.19-rc5-stock/net/netfilter/ipvs/ip_vs_wlib.c	1970-01-01 00:00:00.000000000 +0000
+++ linux-3.19-rc5/net/netfilter/ipvs/ip_vs_wlib.c	2015-01-20 08:09:00.177816054 +0000
@@ -0,0 +1,166 @@
+/* IPVS: Weighted Least Incoming Byterate Scheduling module
+ *
+ * Authors: Chris Caputo <ccaputo@xxxxxxx> based on code by:
+ *
+ *              Wensong Zhang <wensong@xxxxxxxxxxxxxxxxxxxxxx>
+ *              Peter Kese <peter.kese@xxxxxx>
+ *              Julian Anastasov <ja@xxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *     Chris Caputo: Based code on ip_vs_wlc.c and ip_vs_rr.c.
+ *
+ */
+
+/* The WLIB algorithm uses the results of the estimator's inbps
+ * calculations to determine which real server has the lowest incoming
+ * byterate.
+ *
+ * Real server weight is factored into the calculation.
+ * For example, if one server can handle 100 Mbps of input and another can
+ * handle 1 Gbps, you could set their weights to 100 and 1000 respectively.
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+static int
+ip_vs_wlib_init_svc(struct ip_vs_service *svc)
+{
+	svc->sched_data = &svc->destinations;
+	return 0;
+}
+
+static int
+ip_vs_wlib_del_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest)
+{
+	struct list_head *p;
+
+	spin_lock_bh(&svc->sched_lock);
+	p = (struct list_head *)svc->sched_data;
+	/* dest is already unlinked, so p->prev is not valid but
+	 * p->next is valid, use it to reach previous entry.
+	 */
+	if (p == &dest->n_list)
+		svc->sched_data = p->next->prev;
+	spin_unlock_bh(&svc->sched_lock);
+	return 0;
+}
+
+/* Weighted Least Incoming Byterate scheduling */
+static struct ip_vs_dest *
+ip_vs_wlib_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+		    struct ip_vs_iphdr *iph)
+{
+	struct list_head *p;
+	struct ip_vs_dest *dest, *last, *least = NULL;
+	int pass = 0;
+	u64 dr, lr = -1;
+	u32 dwgt, lwgt = 0;
+
+	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+	/* We calculate the load of each dest server as follows:
+	 *	(dest inbps rate) / dest->weight
+	 *
+	 * The comparison dr * lwgt < lr * dwgt is equivalent to
+	 * dr / dwgt < lr / lwgt when every weight is larger than zero.
+	 *
+	 * A server with weight=0 is quiesced and will not receive any
+	 * new connections.
+	 *
+	 * In case of inactivity (no traffic), the highest weight wins.
+	 * If that still ties, round robin is used (which is why we
+	 * remember our last starting location in the linked list).
+	 */
+
+	spin_lock_bh(&svc->sched_lock);
+	p = (struct list_head *)svc->sched_data;
+	last = dest = list_entry(p, struct ip_vs_dest, n_list);
+
+	do {
+		list_for_each_entry_continue_rcu(dest,
+						 &svc->destinations,
+						 n_list) {
+			dwgt = (u32)atomic_read(&dest->weight);
+			if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+			    dwgt > 0) {
+				spin_lock(&dest->stats.lock);
+				/* estimator's scaling doesn't matter */
+				dr = dest->stats.est.inbps;
+				spin_unlock(&dest->stats.lock);
+
+				if (!least ||
+				    dr * lwgt < lr * dwgt ||
+				    (!dr && !lr && dwgt > lwgt)) {
+					least = dest;
+					lr = dr;
+					lwgt = dwgt;
+				}
+			}
+
+			if (dest == last)
+				goto stop;
+		}
+		pass++;
+		/* Previous dest could be unlinked, do not loop forever.
+		 * If we stay at head there is no need for 2nd pass.
+		 */
+	} while (pass < 2 && p != &svc->destinations);
+
+stop:
+	if (least)
+		svc->sched_data = &least->n_list;
+
+	spin_unlock_bh(&svc->sched_lock);
+
+	if (least) {
+		IP_VS_DBG_BUF(6,
+			      "WLIB: server %s:%u activeconns %d refcnt %d weight %d\n",
+			      IP_VS_DBG_ADDR(least->af, &least->addr),
+			      ntohs(least->port),
+			      atomic_read(&least->activeconns),
+			      atomic_read(&least->refcnt),
+			      atomic_read(&least->weight));
+	} else {
+		ip_vs_scheduler_err(svc, "no destination available");
+	}
+
+	return least;
+}
+
+static struct ip_vs_scheduler ip_vs_wlib_scheduler = {
+	.name =			"wlib",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list =		LIST_HEAD_INIT(ip_vs_wlib_scheduler.n_list),
+	.init_service =		ip_vs_wlib_init_svc,
+	.add_dest =		NULL,
+	.del_dest =		ip_vs_wlib_del_dest,
+	.schedule =		ip_vs_wlib_schedule,
+};
+
+static int __init ip_vs_wlib_init(void)
+{
+	return register_ip_vs_scheduler(&ip_vs_wlib_scheduler);
+}
+
+static void __exit ip_vs_wlib_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_wlib_scheduler);
+	synchronize_rcu();
+}
+
+module_init(ip_vs_wlib_init);
+module_exit(ip_vs_wlib_cleanup);
+MODULE_LICENSE("GPL");
diff -uprN linux-3.19-rc5-stock/net/netfilter/ipvs/ip_vs_wlip.c linux-3.19-rc5/net/netfilter/ipvs/ip_vs_wlip.c
--- linux-3.19-rc5-stock/net/netfilter/ipvs/ip_vs_wlip.c	1970-01-01 00:00:00.000000000 +0000
+++ linux-3.19-rc5/net/netfilter/ipvs/ip_vs_wlip.c	2015-01-20 08:09:07.456126624 +0000
@@ -0,0 +1,166 @@
+/* IPVS: Weighted Least Incoming Packetrate Scheduling module
+ *
+ * Authors: Chris Caputo <ccaputo@xxxxxxx> based on code by:
+ *
+ *              Wensong Zhang <wensong@xxxxxxxxxxxxxxxxxxxxxx>
+ *              Peter Kese <peter.kese@xxxxxx>
+ *              Julian Anastasov <ja@xxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *     Chris Caputo: Based code on ip_vs_wlc.c and ip_vs_rr.c.
+ *
+ */
+
+/* The WLIP algorithm uses the results of the estimator's inpps
+ * calculations to determine which real server has the lowest incoming
+ * packetrate.
+ *
+ * Real server weight is factored into the calculation.  For example, if
+ * one server can handle 10 Kpps of input and another can handle 100 Kpps,
+ * you could set their weights to 10 and 100 respectively.
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+static int
+ip_vs_wlip_init_svc(struct ip_vs_service *svc)
+{
+	svc->sched_data = &svc->destinations;
+	return 0;
+}
+
+static int
+ip_vs_wlip_del_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest)
+{
+	struct list_head *p;
+
+	spin_lock_bh(&svc->sched_lock);
+	p = (struct list_head *)svc->sched_data;
+	/* dest is already unlinked, so p->prev is not valid but
+	 * p->next is valid, use it to reach previous entry.
+	 */
+	if (p == &dest->n_list)
+		svc->sched_data = p->next->prev;
+	spin_unlock_bh(&svc->sched_lock);
+	return 0;
+}
+
+/* Weighted Least Incoming Packetrate scheduling */
+static struct ip_vs_dest *
+ip_vs_wlip_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+		    struct ip_vs_iphdr *iph)
+{
+	struct list_head *p;
+	struct ip_vs_dest *dest, *last, *least = NULL;
+	int pass = 0;
+	u32 dr, lr = -1;
+	u32 dwgt, lwgt = 0;
+
+	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+	/* We calculate the load of each dest server as follows:
+	 *	(dest inpps rate) / dest->weight
+	 *
+	 * The comparison dr * lwgt < lr * dwgt is equivalent to
+	 * dr / dwgt < lr / lwgt when every weight is larger than zero.
+	 *
+	 * A server with weight=0 is quiesced and will not receive any
+	 * new connections.
+	 *
+	 * In case of inactivity (no traffic), the highest weight wins.
+	 * If that still ties, round robin is used (which is why we
+	 * remember our last starting location in the linked list).
+	 */
+
+	spin_lock_bh(&svc->sched_lock);
+	p = (struct list_head *)svc->sched_data;
+	last = dest = list_entry(p, struct ip_vs_dest, n_list);
+
+	do {
+		list_for_each_entry_continue_rcu(dest,
+						 &svc->destinations,
+						 n_list) {
+			dwgt = (u32)atomic_read(&dest->weight);
+			if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+			    dwgt > 0) {
+				spin_lock(&dest->stats.lock);
+				/* estimator's scaling doesn't matter */
+				dr = dest->stats.est.inpps;
+				spin_unlock(&dest->stats.lock);
+
+				if (!least ||
+				    (u64)dr * lwgt < (u64)lr * dwgt ||
+				    (!dr && !lr && dwgt > lwgt)) {
+					least = dest;
+					lr = dr;
+					lwgt = dwgt;
+				}
+			}
+
+			if (dest == last)
+				goto stop;
+		}
+		pass++;
+		/* Previous dest could be unlinked, do not loop forever.
+		 * If we stay at head there is no need for 2nd pass.
+		 */
+	} while (pass < 2 && p != &svc->destinations);
+
+stop:
+	if (least)
+		svc->sched_data = &least->n_list;
+
+	spin_unlock_bh(&svc->sched_lock);
+
+	if (least) {
+		IP_VS_DBG_BUF(6,
+			      "WLIP: server %s:%u activeconns %d refcnt %d weight %d\n",
+			      IP_VS_DBG_ADDR(least->af, &least->addr),
+			      ntohs(least->port),
+			      atomic_read(&least->activeconns),
+			      atomic_read(&least->refcnt),
+			      atomic_read(&least->weight));
+	} else {
+		ip_vs_scheduler_err(svc, "no destination available");
+	}
+
+	return least;
+}
+
+static struct ip_vs_scheduler ip_vs_wlip_scheduler = {
+	.name =			"wlip",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list =		LIST_HEAD_INIT(ip_vs_wlip_scheduler.n_list),
+	.init_service =		ip_vs_wlip_init_svc,
+	.add_dest =		NULL,
+	.del_dest =		ip_vs_wlip_del_dest,
+	.schedule =		ip_vs_wlip_schedule,
+};
+
+static int __init ip_vs_wlip_init(void)
+{
+	return register_ip_vs_scheduler(&ip_vs_wlip_scheduler);
+}
+
+static void __exit ip_vs_wlip_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_wlip_scheduler);
+	synchronize_rcu();
+}
+
+module_init(ip_vs_wlip_init);
+module_exit(ip_vs_wlip_cleanup);
+MODULE_LICENSE("GPL");
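
For reviewers who want to exercise the selection logic outside the kernel,
here is a minimal userspace sketch of the comparison both schedulers use.
It is illustrative only -- struct rs and pick() are invented names for this
demo, not part of the patch -- but it shows why the zero-rate check is kept:
when all rates are zero, the cross-multiplied comparison alone can never
prefer the higher weight.

	/* Minimal userspace sketch of the wlib/wlip selection predicate.
	 * Illustrative only: struct rs and pick() are invented for this
	 * demo; the patch above operates on struct ip_vs_dest instead.
	 */
	#include <stdint.h>
	#include <stdio.h>

	struct rs {
		const char *name;
		uint64_t rate;   /* estimator inbps/inpps reading */
		uint32_t weight; /* 0 means quiesced */
	};

	/* Mirror the patch's condition:
	 *   !least || dr * lwgt < lr * dwgt || (!dr && !lr && dwgt > lwgt)
	 * Cross-multiplying compares dr/dwgt against lr/lwgt without
	 * division; weight==0 servers are skipped entirely.
	 */
	static const struct rs *pick(const struct rs *srv, int n)
	{
		const struct rs *least = NULL;
		uint64_t lr = 0;
		uint32_t lwgt = 0;
		int i;

		for (i = 0; i < n; i++) {
			uint64_t dr = srv[i].rate;
			uint32_t dwgt = srv[i].weight;

			if (dwgt == 0)	/* quiesced */
				continue;
			if (!least ||
			    dr * lwgt < lr * dwgt ||
			    (!dr && !lr && dwgt > lwgt)) {
				least = &srv[i];
				lr = dr;
				lwgt = dwgt;
			}
		}
		return least;
	}

	int main(void)
	{
		/* Idle cluster: rates are zero, so dr * lwgt < lr * dwgt is
		 * 0 < 0 == false for every pair; only the zero-rate clause
		 * lets the weight-1000 server win.  Drop that clause and
		 * "a" is chosen instead.
		 */
		const struct rs idle[] = {
			{ "a", 0, 100 },
			{ "b", 0, 1000 },
		};
		/* Busy cluster: "c" carries 40e6/100 = 400k per weight unit,
		 * "d" carries 200e6/1000 = 200k, so "d" is less loaded.
		 */
		const struct rs busy[] = {
			{ "c", 40000000, 100 },
			{ "d", 200000000, 1000 },
		};

		printf("idle pick: %s\n", pick(idle, 2)->name);  /* b */
		printf("busy pick: %s\n", pick(busy, 2)->name);  /* d */
		return 0;
	}

Note that when rates tie at a nonzero value, the cross-multiplied test
already prefers the higher weight (r * lwgt < r * dwgt iff lwgt < dwgt), so
the extra clause matters only in the all-zero case, as discussed above.  The
(u64) widening in the wlip version keeps the 32-bit rate times 32-bit weight
product from overflowing; wlib's rate is already u64.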