> Patrick McHardy wrote:
>> I've completed the port and tested it yesterday; unfortunately it's
>> not usable in the real world as is. There is a strong bias against
>> non-ECN flows because their packets are simply dropped instead of
>> marked. At high load (50 ECN flows vs. 1 non-ECN flow) and a marking
>> probability of about 10%, the non-ECN flow simply stalls. I can send
>> you the patch if you're interested ..
>
> Thank you, I am really interested.

Patch is attached. Use the original tc patch.

> I will try out how it behaves for me in various circumstances.
>
> What you say, though, is probably true - and the situation is
> accentuated even more when you consider that different TCP stacks
> react differently to ECN and to packet drops - a single drop
> percentage will not be enough.

OTOH, with only ECN flows it works great: not a single packet drop after
a couple of minutes, whereas without BLUE there were many thousands.

> Which of course brings us to SFB - with Stochastic Fair Blue, the drop
> percentage for the non-ECN flow should be significantly lower and the
> connections should share the link more or less fairly.

I'm going to check this out; I don't know anything about it yet.

Regards
Patrick

> --
> Naked
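As a rough back-of-the-envelope for the stall described above: the standard Mathis et al. approximation bounds the throughput of a loss-driven (non-ECN) TCP flow by MSS * 1.22 / (RTT * sqrt(p)). The snippet below is only an illustration; the MSS and RTT values are assumptions, and at a 10% loss rate the real flow fares even worse because retransmission timeouts start to dominate, while ECN-capable flows receive marks instead of losses.

```c
/*
 * Rough illustration (not part of the patch): upper bound on the
 * throughput of a loss-driven TCP flow, using the Mathis et al.
 * approximation  rate <= MSS * C / (RTT * sqrt(p)),  C ~= 1.22.
 * MSS = 1460 bytes and RTT = 100 ms are assumed values.
 * Build with:  cc mathis.c -lm
 */
#include <math.h>
#include <stdio.h>

int main(void)
{
        const double mss = 1460.0;      /* bytes, assumed */
        const double rtt = 0.1;         /* seconds, assumed */
        const double c   = 1.22;        /* ~sqrt(3/2) from the approximation */
        double p;

        for (p = 0.001; p <= 0.1001; p *= 10.0) {
                double rate = mss * c / (rtt * sqrt(p));  /* bytes/sec bound */
                printf("p = %5.3f  ->  ~%.0f KB/s\n", p, rate / 1024.0);
        }
        return 0;
}
```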
# This is a BitKeeper generated diff -Nru style patch.
#
# ChangeSet
#   2004/03/06 16:57:42+01:00 kaber@xxxxxxxxx
#   Add BLUE scheduler
#
# net/sched/sch_blue.c
#   2004/03/06 16:57:35+01:00 kaber@xxxxxxxxx +453 -0
#
# net/sched/sch_blue.c
#   2004/03/06 16:57:35+01:00 kaber@xxxxxxxxx +0 -0
#   BitKeeper file /home/kaber/src/blue/linux-2.6/net/sched/sch_blue.c
#
# net/sched/Makefile
#   2004/03/06 16:57:35+01:00 kaber@xxxxxxxxx +1 -0
#   Add BLUE scheduler
#
# net/sched/Kconfig
#   2004/03/06 16:57:35+01:00 kaber@xxxxxxxxx +19 -0
#   Add BLUE scheduler
#
# include/linux/pkt_sched.h
#   2004/03/06 16:57:35+01:00 kaber@xxxxxxxxx +31 -0
#   Add BLUE scheduler
#
diff -Nru a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
--- a/include/linux/pkt_sched.h   Sat Mar  6 16:58:06 2004
+++ b/include/linux/pkt_sched.h   Sat Mar  6 16:58:06 2004
@@ -321,6 +321,37 @@
         TCA_HFSC_MAX = TCA_HFSC_USC
 };
 
+/* BLUE section */
+
+enum
+{
+        TCA_BLUE_UNSPEC,
+        TCA_BLUE_PARMS,
+};
+
+struct tc_blue_qopt
+{
+        __u32   limit;          /* Limit on queue length, bytes */
+        __s32   freeze_time;    /* Minimum time between Pmark updates, usec! */
+        __s32   pmark_init;     /* Initial Pmark */
+        __s32   pmark_inc;      /* Pmark increment step */
+        __s32   pmark_dec;      /* Pmark decrement step */
+        __s32   pmark_max;      /* Pmark never exceeds this maximum */
+        __u32   flags;
+};
+
+/* Flags: */
+#define TC_BLUE_ECN     1       /* Do ECN marking, if ECT is set in the packet */
+
+struct tc_blue_xstats
+{
+        __s32   pmark;          /* Current Pmark */
+        __u32   marked;         /* Marked packets */
+        __u32   early_drops;    /* Early drops */
+        __u32   limit_drops;    /* Drops due to queue limits */
+        __u32   other_drops;    /* Drops due to drop() calls */
+};
+
 /* CBQ section */
 
 #define TC_CBQ_MAXPRIO          8
diff -Nru a/net/sched/Kconfig b/net/sched/Kconfig
--- a/net/sched/Kconfig   Sat Mar  6 16:58:06 2004
+++ b/net/sched/Kconfig   Sat Mar  6 16:58:06 2004
@@ -101,6 +101,25 @@
           To compile this code as a module, choose M here: the
           module will be called sch_red.
 
+config NET_SCH_BLUE
+        tristate "BLUE queue"
+        depends on NET_SCHED
+        help
+          Say Y here if you want to use the BLUE queue management algorithm
+          for some of your network devices. BLUE is similar to RED as it
+          randomly ECN-marks or drops packets, but uses a different approach
+          to select the packets to be marked/dropped, which is intended to
+          perform better under high load.
+
+          For details and references about BLUE see:
+
+          - http://thefengs.com/wuchang/blue/
+          - http://www.sch.bme.hu/~bartoki/projects/thesis/
+          - The top of net/sched/sch_blue.c
+
+          To compile this code as a module, choose M here: the
+          module will be called sch_blue.
+
 config NET_SCH_SFQ
         tristate "SFQ queue"
         depends on NET_SCHED
diff -Nru a/net/sched/Makefile b/net/sched/Makefile
--- a/net/sched/Makefile   Sat Mar  6 16:58:06 2004
+++ b/net/sched/Makefile   Sat Mar  6 16:58:06 2004
@@ -15,6 +15,7 @@
 obj-$(CONFIG_NET_SCH_HFSC)      += sch_hfsc.o
 obj-$(CONFIG_NET_SCH_RED)       += sch_red.o
 obj-$(CONFIG_NET_SCH_GRED)      += sch_gred.o
+obj-$(CONFIG_NET_SCH_BLUE)      += sch_blue.o
 obj-$(CONFIG_NET_SCH_INGRESS)   += sch_ingress.o
 obj-$(CONFIG_NET_SCH_DSMARK)    += sch_dsmark.o
 obj-$(CONFIG_NET_SCH_SFQ)       += sch_sfq.o
diff -Nru a/net/sched/sch_blue.c b/net/sched/sch_blue.c
--- /dev/null   Wed Dec 31 16:00:00 1969
+++ b/net/sched/sch_blue.c   Sat Mar  6 16:58:06 2004
@@ -0,0 +1,453 @@
+/*
+ * net/sched/sch_blue.c  BLUE queue.
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Author:      Bartok Istvan, <bartoki@xxxxxxxxxx> <stefan.a.bartok@xxxxxxxx>
+ * Credits:     Based on sch_red.c written by Alexey Kuznetsov and others
+ *
+ * Changelog:
+ *
+ * Bartoki, 2001-Apr-10 - Initial version.
+ * Bartoki, 2001-Apr-11 - Added displaying Pmark in the statistics.
+ *                      - Added some module-testing code (not to be included
+ *                        in the final distribution).
+ * Bartoki, 2001-Apr-24 - Added compile-time setting of algorithm variations.
+ * Bartoki, 2001-May-24 - Added initial Pmark to the parameters.
+ * kaber,   2004-Mar-05 - Ported to 2.6, IPv6 ECN marking bug fixed,
+ *                        style changes.
+ */
+
+#include <linux/kernel.h>
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/pkt_sched.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <net/ip.h>
+
+/* The BLUE algorithm:
+   ===================
+
+   Wu-chang Feng, Dilip D. Kandlur, Debanjan Saha, Kang G. Shin
+   "Blue: A New Class of Active Queue Management Algorithms"
+   U. Michigan CSE-TR-387-99, April 1999.
+
+   http://thefengs.com/wuchang/work/CSE-TR-387-99.pdf
+   http://thefengs.com/wuchang/blue/
+
+   This file implements the basic version of the algorithm, as given on
+   page 5, Fig. 3 of the paper (without the Qave calculation).
+
+   From the paper:
+
+   "BLUE maintains a single probability Pm, which it uses to mark
+   (or drop) packets when they are enqueued. If the queue is continually
+   dropping packets due to queue overflow, BLUE increments Pm, thus
+   increasing the rate at which it sends back congestion notification.
+   Conversely, if the queue becomes empty or if the link is idle, BLUE
+   decreases its marking probability. This effectively allows BLUE to
+   'learn' the correct rate it needs to send back congestion
+   notification."
+
+   Upon packet loss:      if ( (now - last_update) > freeze_time ) {
+                                  pm += increment;
+                                  last_update = now;
+                          }
+
+   Upon link idle event:  if ( (now - last_update) > freeze_time ) {
+                                  pm -= decrement;
+                                  last_update = now;
+                          }
+
+
+   Parameters, settable by user:
+   -----------------------------
+
+   limit       - BYTES - Limit on queue length: BLUE will not enqueue
+                 new packets while its backlog is greater than limit.
+                 Note that backlog can exceed limit when a packet
+                 gets enqueued for a backlog > (limit-MTU) queue,
+                 or when one gets requeued.
+
+   pmark_init  - see include/linux/pkt_sched.h
+   freeze_time
+   pmark_inc
+   pmark_dec
+   pmark_max
+   flags       - For now, only one flag:
+
+                 'ecn' - if set, tries to ECN-mark packets instead of
+                 early dropping (but drops if the packet is not
+                 ECN-capable). As sch_red does, this module can ECN-mark
+                 even if the kernel was compiled without ECN support.
+
+
+   Implementation:
+   ---------------
+
+   A fractional fixed-point format is used to represent Pmark
+   (sign bit, binary point, then the fraction bits 1/2, 1/4, 1/8, ...):
+
+      Float 0.625 will be:  0.101 0000...[all 0s]  =  0x50000000
+
+   As only the [0.0, 1.0] range is used (1.0 is represented as
+   0x7FFFFFFF = ~0.9999999995 for i386) this is an easy way to detect
+   over/underflows.
+
+   The queue is declared empty/idle when the dequeue function is called
+   with an empty queue.
+   Note that the network device is not necessarily idle at that moment,
+   but we could not get much further with estimating when the
+   transmission really ends, as the hardware or the device driver can
+   have an internal buffer of a few packets (to enable end-to-end
+   transmits) anyway.
+
+   For further details on design and performance analysis see:
+   http://www.sch.bme.hu/~bartoki/projects/thesis/
+
+*/
+
+
+/* 1 - Single last_update
+ * 0 - Separated timestamps for increase and decrease
+ */
+#define BLUE_SINGLE_UPDATE_TIME 0
+
+/* 1 - Try to increase Pmark not just on a tail-drop event, but also when
+ *     an enqueue occurs to a more than half-filled queue
+ * 0 - Try to increase Pmark only on tail-drop
+ */
+#define BLUE_INCREASE_ON_HALFQ  1
+
+
+struct blue_sched_data
+{
+        struct tc_blue_qopt     qopt;   /* Parameters */
+        struct tc_blue_xstats   xstats; /* BLUE-specific statistics */
+
+        int                     pmark;  /* Packet marking probability */
+
+#if BLUE_SINGLE_UPDATE_TIME
+        psched_time_t   last_update;    /* Timestamp of the last pmark update */
+#else
+        psched_time_t   last_inc;       /* Timestamp of the last pmark increase */
+        psched_time_t   last_dec;       /* Timestamp of the last pmark decrease */
+#endif
+
+};
+
+
+/*
+ * Increments pmark
+ * Assumes 0 <= pmark and 0 <= pmark_inc
+ */
+static inline void blue_inc(struct blue_sched_data *q)
+{
+        q->pmark += q->qopt.pmark_inc;
+
+        /* On overflow or exceeding max use pmark_max: */
+        if (q->pmark < 0 || q->pmark > q->qopt.pmark_max)
+                q->pmark = q->qopt.pmark_max;
+}
+
+/*
+ * Increments pmark, but only if ((now - last_update) >= freeze_time)
+ */
+static inline void blue_try_inc(struct blue_sched_data *q)
+{
+        psched_time_t now;
+        psched_tdiff_t diff;
+
+        PSCHED_GET_TIME(now);
+
+#if BLUE_SINGLE_UPDATE_TIME
+        diff = PSCHED_TDIFF_SAFE(now, q->last_update, q->qopt.freeze_time, 0);
+#else
+        diff = PSCHED_TDIFF_SAFE(now, q->last_inc, q->qopt.freeze_time, 0);
+#endif
+
+        /*
+         * The time difference is a plain long, it will wrap often on 32-bit
+         * architectures, so even if (last_update < now) is true, the diff
+         * can be negative.
+         *
+         * NOTE: this code still won't increase when:
+         *      now = last_update + k*MAX_LONG + d
+         * where 0 <= d < freeze_time
+         */
+        if (diff >= q->qopt.freeze_time || diff < 0) {
+                blue_inc(q);
+#if BLUE_SINGLE_UPDATE_TIME
+                q->last_update = now;
+#else
+                q->last_inc = now;
+#endif
+        }
+}
+
+/*
+ * Decrements pmark
+ * Assumes 0 <= pmark and 0 <= pmark_dec
+ */
+static inline void blue_dec(struct blue_sched_data *q)
+{
+        q->pmark -= q->qopt.pmark_dec;
+
+        /* On underflow use 0 */
+        if (q->pmark < 0)
+                q->pmark = 0;
+}
+
+/*
+ * Decrements pmark, but only if ((now - last_update) >= freeze_time)
+ */
+static inline void blue_try_dec(struct blue_sched_data *q)
+{
+        psched_time_t now;
+        psched_tdiff_t diff;
+
+        PSCHED_GET_TIME(now);
+
+#if BLUE_SINGLE_UPDATE_TIME
+        diff = PSCHED_TDIFF_SAFE(now, q->last_update, q->qopt.freeze_time, 0);
+#else
+        diff = PSCHED_TDIFF_SAFE(now, q->last_dec, q->qopt.freeze_time, 0);
+#endif
+
+        /*
+         * For diff < 0 see the comment at the same place in blue_try_inc()
+         */
+        if (diff >= q->qopt.freeze_time || diff < 0) {
+                blue_dec(q);
+#if BLUE_SINGLE_UPDATE_TIME
+                q->last_update = now;
+#else
+                q->last_dec = now;
+#endif
+        }
+}
+
+/*
+ * Tries to ECN-mark the packet.
+ * Returns: 1 - success: marked now, or was marked already
+ *          0 - not marked: ECT bit is not set in the packet,
+ *              or not an IP packet
+ */
+static int blue_ecn_mark(struct sk_buff *skb)
+{
+        if (skb->nh.raw + 20 > skb->tail)
+                return 0;
+
+        switch (skb->protocol) {
+        case __constant_htons(ETH_P_IP):
+                if (!INET_ECN_is_capable(skb->nh.iph->tos))
+                        return 0;
+                if (INET_ECN_is_not_ce(skb->nh.iph->tos))
+                        IP_ECN_set_ce(skb->nh.iph);
+                return 1;
+        case __constant_htons(ETH_P_IPV6):
+                if (!INET_ECN_is_capable(ip6_get_dsfield(skb->nh.ipv6h)))
+                        return 0;
+                IP6_ECN_set_ce(skb->nh.ipv6h);
+                return 1;
+        default:
+                return 0;
+        }
+}
+
+static int
+blue_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+        struct blue_sched_data *q = (struct blue_sched_data *)sch->data;
+
+        /* Try to mark or early drop with probability pmark: */
+        /* Operator < in the comparison allows true 0.0 and 1.0 probability */
+
+        if (((u32)net_random() >> 1) < q->pmark) {
+                goto mark;
+        } else {
+                goto enqueue;
+        }
+
+enqueue:
+        if (sch->stats.backlog <= q->qopt.limit) {
+
+#if BLUE_INCREASE_ON_HALFQ
+                if (sch->stats.backlog >= q->qopt.limit/2)
+                        blue_try_inc(q);
+#endif
+
+                __skb_queue_tail(&sch->q, skb);
+                sch->stats.backlog += skb->len;
+                sch->stats.bytes += skb->len;
+                sch->stats.packets++;
+
+                return NET_XMIT_SUCCESS;
+        } else {
+                /* No space left, try to increment pmark and drop the packet */
+                blue_try_inc(q);
+                q->xstats.limit_drops++;
+                goto drop;
+        }
+
+mark:
+        sch->stats.overlimits++;
+        if ((q->qopt.flags & TC_BLUE_ECN) && blue_ecn_mark(skb)) {
+                q->xstats.marked++;
+                goto enqueue;
+        } else {
+                q->xstats.early_drops++;
+                goto drop;
+        }
+
+drop:
+        kfree_skb(skb);
+        sch->stats.drops++;
+        return NET_XMIT_CN;
+}
+
+static int
+blue_requeue(struct sk_buff *skb, struct Qdisc *sch)
+{
+        __skb_queue_head(&sch->q, skb);
+        sch->stats.backlog += skb->len;
+        return NET_XMIT_SUCCESS;
+}
+
+static struct sk_buff *
+blue_dequeue(struct Qdisc *sch)
+{
+        struct blue_sched_data *q = (struct blue_sched_data *)sch->data;
+        struct sk_buff *skb;
+
+        skb = __skb_dequeue(&sch->q);
+        if (skb) {
+                sch->stats.backlog -= skb->len;
+                return skb;
+        } else {
+                /* Queue empty, try to decrement pmark */
+                blue_try_dec(q);
+                return NULL;
+        }
+}
+
+static unsigned int
+blue_drop(struct Qdisc *sch)
+{
+        struct blue_sched_data *q = (struct blue_sched_data *)sch->data;
+        struct sk_buff *skb;
+        unsigned int len;
+
+        skb = __skb_dequeue_tail(&sch->q);
+        if (skb) {
+                sch->stats.backlog -= skb->len;
+                sch->stats.drops++;
+                q->xstats.other_drops++;
+                len = skb->len;
+                kfree_skb(skb);
+                return len;
+        } else {
+                return 0;
+        }
+}
+
+static void
+blue_reset(struct Qdisc *sch)
+{
+        struct blue_sched_data *q = (struct blue_sched_data *)sch->data;
+
+        __skb_queue_purge(&sch->q);
+        sch->stats.backlog = 0;
+        q->pmark = q->qopt.pmark_init;
+}
+
+static int
+blue_init(struct Qdisc *sch, struct rtattr *opt)
+{
+        struct blue_sched_data *q = (struct blue_sched_data *)sch->data;
+        struct rtattr *tb[TCA_BLUE_PARMS];
+        struct tc_blue_qopt *ctl;
+
+        if (opt == NULL
+            || rtattr_parse(tb, TCA_BLUE_PARMS, RTA_DATA(opt), RTA_PAYLOAD(opt))
+            || tb[TCA_BLUE_PARMS-1] == 0
+            || RTA_PAYLOAD(tb[TCA_BLUE_PARMS-1]) < sizeof(*ctl))
+                return -EINVAL;
+        ctl = RTA_DATA(tb[TCA_BLUE_PARMS-1]);
+
+        sch_tree_lock(sch);
+        q->qopt = *ctl;
+        q->pmark = q->qopt.pmark_init;
+        sch_tree_unlock(sch);
+        return 0;
+}
+
+static int blue_dump_xstats(struct blue_sched_data *q, struct sk_buff *skb)
+{
+        q->xstats.pmark = q->pmark;
+        RTA_PUT(skb, TCA_XSTATS, sizeof(q->xstats), &q->xstats);
+        return 0;
+
+rtattr_failure:
+        return 1;
+}
+
+static int
+blue_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+        struct blue_sched_data *q = (struct blue_sched_data *)sch->data;
+        unsigned char *b = skb->tail;
+        struct rtattr *rta = (struct rtattr *)b;
+
+        RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
+        RTA_PUT(skb, TCA_BLUE_PARMS, sizeof(q->qopt), &q->qopt);
+        rta->rta_len = skb->tail - b;
+
+        if (blue_dump_xstats(q, skb))
+                goto rtattr_failure;
+        return skb->len;
+
+rtattr_failure:
+        skb_trim(skb, b - skb->data);
+        return -1;
+}
+
+struct Qdisc_ops blue_qdisc_ops =
+{
+        .id             = "blue",
+        .init           = blue_init,
+        .change         = blue_init,
+        .reset          = blue_reset,
+        .dump           = blue_dump,
+        .enqueue        = blue_enqueue,
+        .dequeue        = blue_dequeue,
+        .requeue        = blue_requeue,
+        .drop           = blue_drop,
+        .priv_size      = sizeof(struct blue_sched_data),
+        .owner          = THIS_MODULE,
+};
+
+static int __init init_blue(void)
+{
+        return register_qdisc(&blue_qdisc_ops);
+}
+
+static void __exit exit_blue(void)
+{
+        unregister_qdisc(&blue_qdisc_ops);
+}
+
+MODULE_LICENSE("GPL");
+module_init(init_blue);
+module_exit(exit_blue);
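For reference, the fractional fixed-point Pmark format described in the file header maps a probability in [0.0, 1.0] onto [0, 0x7FFFFFFF], and blue_enqueue()'s test (((u32)net_random() >> 1) < q->pmark) is then a Bernoulli trial with probability pmark / 2^31. Below is a minimal userspace sketch of that conversion; prob_to_pmark() and pmark_to_prob() are hypothetical helpers for illustration only and are not part of the patch or of the tc patch mentioned above.

```c
/*
 * Userspace sketch of the Pmark fixed-point encoding used by sch_blue:
 * probabilities in [0.0, 1.0] map onto [0, 0x7FFFFFFF], so that a
 * 31-bit random value compared with '< pmark' succeeds with
 * probability pmark / 2^31.  The helper names are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

#define PMARK_ONE 0x7FFFFFFF    /* ~1.0 in the patch's fixed-point format */

static int32_t prob_to_pmark(double p)
{
        if (p <= 0.0)
                return 0;
        if (p >= 1.0)
                return PMARK_ONE;
        return (int32_t)(p * 2147483648.0);     /* p * 2^31 */
}

static double pmark_to_prob(int32_t pmark)
{
        return (double)pmark / 2147483648.0;
}

int main(void)
{
        /* e.g. the ~10% marking probability mentioned in the mail thread */
        int32_t pmark = prob_to_pmark(0.10);

        printf("p = 0.10    ->  pmark = 0x%08x\n", pmark);
        printf("0x50000000  ->  p = %.3f\n", pmark_to_prob(0x50000000));
        return 0;
}
```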