Re: Re: Blue and SFB

Linux Advanced Routing and Traffic Control

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Nuutti Kotivuori wrote:
Patrick McHardy wrote:

I've completed the port and tested it yesterday, unfortunately it's
not useable in the real world as is. There is a strong bias against
non-ECN flows because their packets are simply dropped instead of
marked. At high load (50 ECN vs. 1 non-ECN flow) and a marking
probability of about 10% the non-ECN flow simply stalls.
I can send you the patch if you're interested ..


Thank you, I am really interested.

Patch is attached. Use the original tc patch.



I will try how it behaves for me in various circumstances.


What you say, though, is probably true - and the situation is even
more accentuated when considering that different TCP stacks react to
ECN and packet drops differently - a single drop percent will not be
enough.

OTOH, with only ECN flows it works great: not a single packet drop after a couple of minutes, whereas without BLUE there were multiple thousands.

Which of course brings us to SFB - with Stochastic Fair Blue, the drop
percentage for the non-ECN flow should be significantly lower and the
connections should transfer more or less fairly.

I'm going to check this out, don't know anything about it.


Regards
Patrick


-- Naked



# This is a BitKeeper generated diff -Nru style patch.
#
# ChangeSet
#   2004/03/06 16:57:42+01:00 kaber@xxxxxxxxx 
#   Add BLUE scheduler
# 
# net/sched/sch_blue.c
#   2004/03/06 16:57:35+01:00 kaber@xxxxxxxxx +453 -0
# 
# net/sched/sch_blue.c
#   2004/03/06 16:57:35+01:00 kaber@xxxxxxxxx +0 -0
#   BitKeeper file /home/kaber/src/blue/linux-2.6/net/sched/sch_blue.c
# 
# net/sched/Makefile
#   2004/03/06 16:57:35+01:00 kaber@xxxxxxxxx +1 -0
#   Add BLUE scheduler
# 
# net/sched/Kconfig
#   2004/03/06 16:57:35+01:00 kaber@xxxxxxxxx +19 -0
#   Add BLUE scheduler
# 
# include/linux/pkt_sched.h
#   2004/03/06 16:57:35+01:00 kaber@xxxxxxxxx +31 -0
#   Add BLUE scheduler
# 
diff -Nru a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
--- a/include/linux/pkt_sched.h	Sat Mar  6 16:58:06 2004
+++ b/include/linux/pkt_sched.h	Sat Mar  6 16:58:06 2004
@@ -321,6 +321,37 @@
 	TCA_HFSC_MAX = TCA_HFSC_USC
 };
 
+/* BLUE section */
+
+enum
+{
+	TCA_BLUE_UNSPEC,
+	TCA_BLUE_PARMS,
+};
+
+struct tc_blue_qopt
+{
+	__u32	limit;		/* Limit on queue length, bytes */
+	__s32	freeze_time;	/* Minimum time between Pmark updates, usec! */
+	__s32	pmark_init;	/* Initial Pmark */
+	__s32	pmark_inc;	/* Pmark increment step */
+	__s32	pmark_dec;	/* Pmark decrement step */
+	__s32	pmark_max;	/* Pmark never exceeds this maximum */
+	__u32	flags;
+};
+
+/* Flags: */
+#define TC_BLUE_ECN	1	/* Do ECN marking, if ECT is set in the packet */
+
+struct tc_blue_xstats
+{
+	__s32	pmark;		/* Current Pmark */
+	__u32	marked;		/* Marked packets */
+	__u32	early_drops;	/* Early drops */
+	__u32	limit_drops;	/* Drops due to queue limits */
+	__u32	other_drops;	/* Drops due to drop() calls */
+};
+
 /* CBQ section */
 
 #define TC_CBQ_MAXPRIO		8
diff -Nru a/net/sched/Kconfig b/net/sched/Kconfig
--- a/net/sched/Kconfig	Sat Mar  6 16:58:06 2004
+++ b/net/sched/Kconfig	Sat Mar  6 16:58:06 2004
@@ -101,6 +101,25 @@
 	  To compile this code as a module, choose M here: the
 	  module will be called sch_red.
 
+config NET_SCH_BLUE
+	tristate "BLUE queue"
+	depends on NET_SCHED
+	help
+	  Say Y here if you want to use the BLUE queue management algorithm
+	  for some of your network devices. BLUE is similar to RED as it
+	  randomly ECN-marks or drops packets, but uses a different approach
+	  to select the packets to be marked/dropped, which is intended to
+	  perform better under high load.
+	  
+	  For details and references about BLUE see:
+
+	  - http://thefengs.com/wuchang/blue/
+	  - http://www.sch.bme.hu/~bartoki/projects/thesis/
+	  - The top of net/sched/sch_blue.c
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_blue.
+
 config NET_SCH_SFQ
 	tristate "SFQ queue"
 	depends on NET_SCHED
diff -Nru a/net/sched/Makefile b/net/sched/Makefile
--- a/net/sched/Makefile	Sat Mar  6 16:58:06 2004
+++ b/net/sched/Makefile	Sat Mar  6 16:58:06 2004
@@ -15,6 +15,7 @@
 obj-$(CONFIG_NET_SCH_HFSC)	+= sch_hfsc.o
 obj-$(CONFIG_NET_SCH_RED)	+= sch_red.o
 obj-$(CONFIG_NET_SCH_GRED)	+= sch_gred.o
+obj-$(CONFIG_NET_SCH_BLUE)	+= sch_blue.o
 obj-$(CONFIG_NET_SCH_INGRESS)	+= sch_ingress.o 
 obj-$(CONFIG_NET_SCH_DSMARK)	+= sch_dsmark.o
 obj-$(CONFIG_NET_SCH_SFQ)	+= sch_sfq.o
diff -Nru a/net/sched/sch_blue.c b/net/sched/sch_blue.c
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/net/sched/sch_blue.c	Sat Mar  6 16:58:06 2004
@@ -0,0 +1,453 @@
+/*
+ * net/sched/sch_blue.c	BLUE queue.
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ * Author:	Bartok Istvan, <bartoki@xxxxxxxxxx> <stefan.a.bartok@xxxxxxxx>
+ * Credits:	Based on sch_red.c written by Alexey Kuznetsov and others
+ *
+ * Changelog:
+ *
+ * Bartoki, 2001-Apr-10 - Initial version.
+ * Bartoki, 2001-Apr-11	- Added displaying Pmark in the statistics.
+ * 			- Added some moduletesting code (not to be included
+ * 			  in the final distribution).
+ * Bartoki, 2001-Apr-24 - Added compile-time setting of algorithm variations.
+ * Bartoki, 2001-May-24 - Added initial Pmark to the parameters
+ * kaber,   2004-Mar-05 - Ported to 2.6, IPv6 ECN marking bug fixed,
+ * 			  style changes
+ */
+
+#include <linux/kernel.h>
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/pkt_sched.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <net/ip.h>
+
+/*	The BLUE algorithm:
+	===================
+
+	Wu-chang Feng, Dilip D. Kandlur, Debanjan Saha, Kang G. Shin
+	"Blue: A New Class of Active Queue Management Algorithms"
+	U. Michigan CSE-TR-387-99, April 1999.
+
+	http://thefengs.com/wuchang/work/CSE-TR-387-99.pdf
+	http://thefengs.com/wuchang/blue/
+
+	This file codes the basic version of the algorithm as written down
+	on Page.5, Fig.3 of the paper. (Without Qave calculation)
+
+	From the paper:
+
+	"BLUE maintains a single probability Pm, which it uses to mark
+	(or drop) packets when they are enqueued. If the queue is continually
+	dropping packets due to queue overflow, BLUE increments Pm, thus
+	increasing the rate at which it sends back congestion notification.
+	Conversely, if the queue becomes empty or if the link is idle, BLUE
+	decreases its marking probability. This effectively allows BLUE to
+	'learn' the correct rate it needs to send back congestion
+	notification."
+
+	Upon packet loss:	if ( (now-last_update) > freeze_time ) {
+					pm += increment;
+					last_update = now;
+				}
+
+	Upon link idle event:	if ( (now-last_update) > freeze_time ) {
+					pm -= decrement;
+					last_update = now;
+				}
+
+
+	Parameters, settable by user:
+	-----------------------------
+
+	limit		- BYTES - Limit on queue length: BLUE will not enqueue
+			new packets while its backlog is greater than limit.
+			Note that backlog can exceed limit when a packet
+			gets enqueued for a  backlog > (limit-MTU)  queue,
+			or when one gets requeued.
+
+	pmark_init	- see include/linux/pkt_sched.h
+	freeze_time
+	pmark_inc
+	pmark_dec
+	pmark_max
+	flags		- For now, only one flag:
+
+			'ecn' - if set, tries to ECN-mark packets instead of
+			early dropping (but drops if the packet is not
+			ECN-capable). As sch_red does, this module can ECN-mark
+			even if the kernel was compiled without ECN-support.
+
+
+	Implementation:
+	---------------
+
+	Fractional fixed point format is used to represent Pmark:
+
+	                   sign 1/2 1/8
+	                      | |   |
+	Float 0.625 will be:  0.1 0 1 0 0 0 0...[all 0s] = 0x50000000
+	                       |  |
+	            binary point  1/4
+
+	As only the [0.0, 1.0] range is used (1.0 is represented as
+	0x7FFFFFFF = ~0.9999999995 for i386) this is an easy way to detect
+	over/underflows.
+
+	The queue is declared empty/idle when the dequeue function is called
+	with an empty queue. Note that the network device is not necessarily
+	idle at that moment, but we could not get much further with
+	estimating when the transmission ends, as the hardware or the
+	device driver can have an internal buffer of a few packets (to
+	enable back-to-back transmits) anyway.
+
+	For further details on design and performance analysis see:
+	http://www.sch.bme.hu/~bartoki/projects/thesis/
+
+*/
+
+
+/* 1 - Single last_update
+ * 0 - Separated timestamps for increase and decrease
+ */
+#define	BLUE_SINGLE_UPDATE_TIME	0
+
+/* 1 - Try to increase Pmark not just on a tail-drop event, but also when
+ *     an enqueue occurs to a more than half-filled queue
+ * 0 - Try to increase Pmark only on tail-drop
+ */
+#define BLUE_INCREASE_ON_HALFQ	1
+
+
+struct blue_sched_data
+{
+	struct tc_blue_qopt	qopt;	/* Parameters */
+	struct tc_blue_xstats	xstats;	/* BLUE-specific statistics */
+
+	int		pmark;		/* Packet marking probability */
+
+#if BLUE_SINGLE_UPDATE_TIME
+	psched_time_t	last_update;	/* Timestamp of the last pmark update */
+#else
+	psched_time_t	last_inc;	/* Timestamp of the last pmark increase */
+	psched_time_t	last_dec;	/* Timestamp of the last pmark decrease */
+#endif
+
+};
+
+
+/*
+ * Increments pmark
+ * Assumes 0 <= pmark and 0 <= pmark_inc
+ */
+static inline void blue_inc(struct blue_sched_data *q)
+{
+	q->pmark += q->qopt.pmark_inc;
+
+	/* On overflow or exceeding max use pmark_max: */
+	if (q->pmark < 0 || q->pmark > q->qopt.pmark_max)
+		q->pmark = q->qopt.pmark_max;
+}
+
+/*
+ * Increments pmark, but only if ((now - last_update) >= freeze_time)
+ */
+static inline void blue_try_inc(struct blue_sched_data *q)
+{
+	psched_time_t now;
+	psched_tdiff_t diff;
+
+	PSCHED_GET_TIME(now);
+
+#if BLUE_SINGLE_UPDATE_TIME
+	diff = PSCHED_TDIFF_SAFE(now, q->last_update, q->qopt.freeze_time, 0);
+#else
+	diff = PSCHED_TDIFF_SAFE(now, q->last_inc, q->qopt.freeze_time, 0);
+#endif
+
+	/*
+	 * The time difference is plain long, it will wrap often on 32-bit
+	 * architectures, so even if (last_update < now) is true, the diff
+	 * can be negative.
+	 *
+	 * NOTE: this code still won't increase when:
+	 *	now = last_update + k*MAX_LONG + d
+	 *	where 0 <= d < freeze_time
+	 */
+	if (diff >= q->qopt.freeze_time || diff < 0) {
+		blue_inc(q);
+#if BLUE_SINGLE_UPDATE_TIME
+		q->last_update = now;
+#else
+		q->last_inc = now;
+#endif
+	}
+}
+
+/*
+ * Decrements pmark
+ * Assumes 0 <= pmark and 0 <= pmark_dec
+ */
+static inline void blue_dec(struct blue_sched_data *q)
+{
+	q->pmark -= q->qopt.pmark_dec;
+
+	/* On underflow use 0 */
+	if (q->pmark < 0)
+		q->pmark = 0;
+}
+
+/*
+ * Decrements pmark, but only if ((now - last_update) >= freeze_time)
+ */
+static inline void blue_try_dec(struct blue_sched_data *q)
+{
+	psched_time_t now;
+	psched_tdiff_t diff;
+
+	PSCHED_GET_TIME(now);
+
+#if BLUE_SINGLE_UPDATE_TIME
+	diff = PSCHED_TDIFF_SAFE(now, q->last_update, q->qopt.freeze_time, 0);
+#else
+	diff = PSCHED_TDIFF_SAFE(now, q->last_dec, q->qopt.freeze_time, 0);
+#endif
+
+	/*
+	 * For diff < 0 see the comment at the same place in blue_try_inc()
+	 */
+	if (diff >= q->qopt.freeze_time || diff < 0) {
+		blue_dec(q);
+#if BLUE_SINGLE_UPDATE_TIME
+		q->last_update = now;
+#else
+		q->last_dec = now;
+#endif
+	}
+}
+
+/*
+ * Tries to ECN-mark the packet.
+ * Returns:	1 - success: marked now, or was marked already
+ *		0 - not marked: ECT bit is bot set in the packet,
+ *		    or not an IP packet
+ */
+static int blue_ecn_mark(struct sk_buff *skb)
+{
+	if (skb->nh.raw + 20 > skb->tail)
+		return 0;
+
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_IP):
+		if (!INET_ECN_is_capable(skb->nh.iph->tos))
+			return 0;
+		if (INET_ECN_is_not_ce(skb->nh.iph->tos))
+			IP_ECN_set_ce(skb->nh.iph);
+		return 1;
+	case __constant_htons(ETH_P_IPV6):
+		if (!INET_ECN_is_capable(ip6_get_dsfield(skb->nh.ipv6h)))
+			return 0;
+		IP6_ECN_set_ce(skb->nh.ipv6h);
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+static int
+blue_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct blue_sched_data *q = (struct blue_sched_data *)sch->data;
+	
+	/* Try to mark or early drop with probability pmark: */
+	/* Operator < in the comparision allows true 0.0 and 1.0 probability */
+
+	if (((u32)net_random() >> 1) < q->pmark) {
+		goto mark;
+	} else {
+		goto enqueue;
+	}
+
+enqueue:
+	if (sch->stats.backlog <= q->qopt.limit) {
+
+#if BLUE_INCREASE_ON_HALFQ
+		if (sch->stats.backlog >= q->qopt.limit/2)
+			blue_try_inc(q);
+#endif
+
+		__skb_queue_tail(&sch->q, skb);
+		sch->stats.backlog += skb->len;
+		sch->stats.bytes += skb->len;
+		sch->stats.packets++;
+
+		return NET_XMIT_SUCCESS;
+	} else {
+		/* No space left, try to increment pmark and drop the packet */
+		blue_try_inc(q);
+		q->xstats.limit_drops++;
+		goto drop;
+	}
+
+mark:
+	sch->stats.overlimits++;
+	if ((q->qopt.flags & TC_BLUE_ECN) && blue_ecn_mark(skb)) {
+		q->xstats.marked++;
+		goto enqueue;
+	} else {
+		q->xstats.early_drops++;
+		goto drop;
+	}
+
+drop:
+	kfree_skb(skb);
+	sch->stats.drops++;
+	return NET_XMIT_CN;
+}
+
+static int
+blue_requeue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	__skb_queue_head(&sch->q, skb);
+	sch->stats.backlog += skb->len;
+	return NET_XMIT_SUCCESS;
+}
+
+static struct sk_buff *
+blue_dequeue(struct Qdisc *sch)
+{
+	struct blue_sched_data *q = (struct blue_sched_data *)sch->data;
+	struct sk_buff *skb;
+
+	skb = __skb_dequeue(&sch->q);
+	if (skb) {
+		sch->stats.backlog -= skb->len;
+		return skb;
+	} else {
+		/* Queue empty, try to decrement pmark */
+		blue_try_dec(q);
+		return NULL;
+	}
+}
+
+static unsigned int
+blue_drop(struct Qdisc *sch)
+{
+	struct blue_sched_data *q = (struct blue_sched_data *)sch->data;
+	struct sk_buff *skb;
+	unsigned int len;
+
+	skb = __skb_dequeue_tail(&sch->q);
+	if (skb) {
+		sch->stats.backlog -= skb->len;
+		sch->stats.drops++;
+		q->xstats.other_drops++;
+		len = skb->len;
+		kfree_skb(skb);
+		return len;
+	} else {
+		return 0;
+	}
+}
+
+static void
+blue_reset(struct Qdisc *sch)
+{
+	struct blue_sched_data *q = (struct blue_sched_data *)sch->data;
+
+	__skb_queue_purge(&sch->q);
+	sch->stats.backlog = 0;
+	q->pmark = q->qopt.pmark_init;
+}
+
+static int
+blue_init(struct Qdisc *sch, struct rtattr *opt)
+{
+	struct blue_sched_data *q = (struct blue_sched_data *)sch->data;
+	struct rtattr *tb[TCA_BLUE_PARMS];
+	struct tc_blue_qopt *ctl;
+
+	if (opt == NULL
+	    || rtattr_parse(tb, TCA_BLUE_PARMS, RTA_DATA(opt), RTA_PAYLOAD(opt))
+	    || tb[TCA_BLUE_PARMS-1] == 0
+	    || RTA_PAYLOAD(tb[TCA_BLUE_PARMS-1]) < sizeof(*ctl))
+		return -EINVAL;
+	ctl = RTA_DATA(tb[TCA_BLUE_PARMS-1]);
+
+	sch_tree_lock(sch);
+	q->qopt = *ctl;
+	q->pmark = q->qopt.pmark_init;
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+static int blue_dump_xstats(struct blue_sched_data *q, struct sk_buff *skb)
+{
+	q->xstats.pmark = q->pmark;
+        RTA_PUT(skb, TCA_XSTATS, sizeof(q->xstats), &q->xstats);
+        return 0;
+
+rtattr_failure:
+        return 1;
+}
+
+static int
+blue_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct blue_sched_data *q = (struct blue_sched_data *)sch->data;
+	unsigned char *b = skb->tail;
+	struct rtattr *rta = (struct rtattr *)b;
+
+	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
+	RTA_PUT(skb, TCA_BLUE_PARMS, sizeof(q->qopt), &q->qopt);
+	rta->rta_len = skb->tail - b;
+
+	if (blue_dump_xstats(q, skb))
+		goto rtattr_failure;
+	return skb->len;
+
+rtattr_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+
+struct Qdisc_ops blue_qdisc_ops =
+{
+	.id		= "blue",
+	.init		= blue_init,
+	.change		= blue_init,
+	.reset		= blue_reset,
+	.dump		= blue_dump,
+	.enqueue	= blue_enqueue,
+	.dequeue	= blue_dequeue,
+	.requeue	= blue_requeue,
+	.drop		= blue_drop,
+	.priv_size	= sizeof(struct blue_sched_data),
+	.owner		= THIS_MODULE,
+};
+
+static int __init init_blue(void)
+{
+	return register_qdisc(&blue_qdisc_ops);
+}
+
+static void __exit exit_blue(void)
+{
+	unregister_qdisc(&blue_qdisc_ops);
+}
+
+MODULE_LICENSE("GPL");
+module_init(init_blue);
+module_exit(exit_blue);

[Index of Archives]     [LARTC Home Page]     [Netfilter]     [Netfilter Development]     [Network Development]     [Bugtraq]     [GCC Help]     [Yosemite News]     [Linux Kernel]     [Fedora Users]
  Powered by Linux