Hi, I wanted to shape and prioritize my outgoing connection and I wanted to use TBF + PRIO as it was all I needed. I understand that I could set up one class HTB and attach PRIO to it but HTB is a rather complex shaper and using it for such a simple task seems like an overkill for me. I stumbled on original Martin Devera's patch to add inner qdiscs to TBF and used the idea to make my own. The changes are: 1. Implement ufifo (unlimited fifo) qdisc to use as a default inner qdisc for TBF. Other qdiscs can be attached to TBF at a later time. 2. Have qdisc drop function return length of skb dropped so TBF could keep accurate statistics. The patch is against 2.5.72, I can rediff it if needed against a later version. Maybe it could be useful to others. Dmitry diff -urN --exclude-from=/usr/src/exclude 2.5.72-vanilla/include/net/pkt_sched.h linux-2.5.72/include/net/pkt_sched.h --- 2.5.72-vanilla/include/net/pkt_sched.h 2003-06-14 14:17:59.000000000 -0500 +++ linux-2.5.72/include/net/pkt_sched.h 2003-06-24 00:28:10.000000000 -0500 @@ -61,7 +61,7 @@ int (*enqueue)(struct sk_buff *, struct Qdisc *); struct sk_buff * (*dequeue)(struct Qdisc *); int (*requeue)(struct sk_buff *, struct Qdisc *); - int (*drop)(struct Qdisc *); + unsigned int (*drop)(struct Qdisc *); int (*init)(struct Qdisc *, struct rtattr *arg); void (*reset)(struct Qdisc *); @@ -416,6 +416,7 @@ extern struct Qdisc_ops noop_qdisc_ops; extern struct Qdisc_ops pfifo_qdisc_ops; extern struct Qdisc_ops bfifo_qdisc_ops; +extern struct Qdisc_ops ufifo_qdisc_ops; int register_qdisc(struct Qdisc_ops *qops); int unregister_qdisc(struct Qdisc_ops *qops); diff -urN --exclude-from=/usr/src/exclude 2.5.72-vanilla/net/netsyms.c linux-2.5.72/net/netsyms.c --- 2.5.72-vanilla/net/netsyms.c 2003-06-14 14:18:08.000000000 -0500 +++ linux-2.5.72/net/netsyms.c 2003-06-21 23:30:11.000000000 -0500 @@ -638,6 +638,7 @@ #ifdef CONFIG_NET_SCHED PSCHED_EXPORTLIST; EXPORT_SYMBOL(pfifo_qdisc_ops); +EXPORT_SYMBOL(ufifo_qdisc_ops); 
EXPORT_SYMBOL(register_qdisc); EXPORT_SYMBOL(unregister_qdisc); EXPORT_SYMBOL(qdisc_get_rtab); diff -urN --exclude-from=/usr/src/exclude 2.5.72-vanilla/net/sched/sch_atm.c linux-2.5.72/net/sched/sch_atm.c --- 2.5.72-vanilla/net/sched/sch_atm.c 2003-06-14 14:18:09.000000000 -0500 +++ linux-2.5.72/net/sched/sch_atm.c 2003-06-24 00:37:34.000000000 -0500 @@ -545,15 +545,16 @@ } -static int atm_tc_drop(struct Qdisc *sch) +static unsigned int atm_tc_drop(struct Qdisc *sch) { struct atm_qdisc_data *p = PRIV(sch); struct atm_flow_data *flow; + unsigned int len; DPRINTK("atm_tc_drop(sch %p,[qdisc %p])\n",sch,p); for (flow = p->flows; flow; flow = flow->next) - if (flow->q->ops->drop && flow->q->ops->drop(flow->q)) - return 1; + if (flow->q->ops->drop && (len = flow->q->ops->drop(flow->q))) + return len; return 0; } diff -urN --exclude-from=/usr/src/exclude 2.5.72-vanilla/net/sched/sch_cbq.c linux-2.5.72/net/sched/sch_cbq.c --- 2.5.72-vanilla/net/sched/sch_cbq.c 2003-06-14 14:17:58.000000000 -0500 +++ linux-2.5.72/net/sched/sch_cbq.c 2003-06-24 00:39:16.000000000 -0500 @@ -1231,11 +1231,12 @@ } } -static int cbq_drop(struct Qdisc* sch) +static unsigned int cbq_drop(struct Qdisc* sch) { struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; struct cbq_class *cl, *cl_head; int prio; + unsigned int len; for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) { if ((cl_head = q->active[prio]) == NULL) @@ -1243,9 +1244,9 @@ cl = cl_head; do { - if (cl->q->ops->drop && cl->q->ops->drop(cl->q)) { + if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) { sch->q.qlen--; - return 1; + return len; } } while ((cl = cl->next_alive) != cl_head); } diff -urN --exclude-from=/usr/src/exclude 2.5.72-vanilla/net/sched/sch_dsmark.c linux-2.5.72/net/sched/sch_dsmark.c --- 2.5.72-vanilla/net/sched/sch_dsmark.c 2003-06-14 14:17:59.000000000 -0500 +++ linux-2.5.72/net/sched/sch_dsmark.c 2003-06-24 00:40:39.000000000 -0500 @@ -302,17 +302,18 @@ } -static int dsmark_drop(struct Qdisc *sch) +static 
unsigned int dsmark_drop(struct Qdisc *sch) { struct dsmark_qdisc_data *p = PRIV(sch); - + unsigned int len; + DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n",sch,p); if (!p->q->ops->drop) return 0; - if (!p->q->ops->drop(p->q)) + if (!(len = p->q->ops->drop(p->q))) return 0; sch->q.qlen--; - return 1; + return len; } diff -urN --exclude-from=/usr/src/exclude 2.5.72-vanilla/net/sched/sch_fifo.c linux-2.5.72/net/sched/sch_fifo.c --- 2.5.72-vanilla/net/sched/sch_fifo.c 2003-06-14 14:18:01.000000000 -0500 +++ linux-2.5.72/net/sched/sch_fifo.c 2003-06-24 00:45:28.000000000 -0500 @@ -81,16 +81,16 @@ return skb; } -static int -fifo_drop(struct Qdisc* sch) +static unsigned int fifo_drop(struct Qdisc* sch) { struct sk_buff *skb; skb = __skb_dequeue_tail(&sch->q); if (skb) { - sch->stats.backlog -= skb->len; + unsigned int len = skb->len; + sch->stats.backlog -= len; kfree_skb(skb); - return 1; + return len; } return 0; } @@ -135,6 +135,15 @@ return __skb_dequeue(&sch->q); } +static int +ufifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + __skb_queue_tail(&sch->q, skb); + sch->stats.bytes += skb->len; + sch->stats.packets++; + return 0; +} + static int fifo_init(struct Qdisc *sch, struct rtattr *opt) { struct fifo_sched_data *q = (void*)sch->data; @@ -202,3 +211,21 @@ .dump = fifo_dump, .owner = THIS_MODULE, }; + +struct Qdisc_ops ufifo_qdisc_ops = +{ + .next = NULL, + .cl_ops = NULL, + .id = "ufifo", + .priv_size = 0, + .enqueue = ufifo_enqueue, + .dequeue = pfifo_dequeue, + .requeue = pfifo_requeue, + .drop = fifo_drop, + .init = NULL, + .reset = fifo_reset, + .destroy = NULL, + .change = NULL, + .dump = NULL, + .owner = THIS_MODULE, +}; diff -urN --exclude-from=/usr/src/exclude 2.5.72-vanilla/net/sched/sch_gred.c linux-2.5.72/net/sched/sch_gred.c --- 2.5.72-vanilla/net/sched/sch_gred.c 2003-06-14 14:18:32.000000000 -0500 +++ linux-2.5.72/net/sched/sch_gred.c 2003-06-24 00:38:08.000000000 -0500 @@ -259,8 +259,7 @@ return NULL; } -static int -gred_drop(struct Qdisc* 
sch) +static unsigned int gred_drop(struct Qdisc* sch) { struct sk_buff *skb; @@ -269,20 +268,21 @@ skb = __skb_dequeue_tail(&sch->q); if (skb) { - sch->stats.backlog -= skb->len; + unsigned int len = skb->len; + sch->stats.backlog -= len; sch->stats.drops++; q= t->tab[(skb->tc_index&0xf)]; if (q) { - q->backlog -= skb->len; + q->backlog -= len; q->other++; if (!q->backlog && !t->eqp) PSCHED_GET_TIME(q->qidlestart); - } else { - D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf); - } + } else { + D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf); + } kfree_skb(skb); - return 1; + return len; } q=t->tab[t->def]; diff -urN --exclude-from=/usr/src/exclude 2.5.72-vanilla/net/sched/sch_htb.c linux-2.5.72/net/sched/sch_htb.c --- 2.5.72-vanilla/net/sched/sch_htb.c 2003-06-14 14:18:24.000000000 -0500 +++ linux-2.5.72/net/sched/sch_htb.c 2003-06-24 00:42:19.000000000 -0500 @@ -1051,7 +1051,7 @@ } /* try to drop from each class (by prio) until one succeed */ -static int htb_drop(struct Qdisc* sch) +static unsigned int htb_drop(struct Qdisc* sch) { struct htb_sched *q = (struct htb_sched *)sch->data; int prio; @@ -1059,14 +1059,15 @@ for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) { struct list_head *p; list_for_each (p,q->drops+prio) { - struct htb_class *cl = list_entry(p,struct htb_class, - un.leaf.drop_list); + struct htb_class *cl = list_entry(p, struct htb_class, + un.leaf.drop_list); + unsigned int len; if (cl->un.leaf.q->ops->drop && - cl->un.leaf.q->ops->drop(cl->un.leaf.q)) { + (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) { sch->q.qlen--; if (!cl->un.leaf.q->q.qlen) htb_deactivate (q,cl); - return 1; + return len; } } } diff -urN --exclude-from=/usr/src/exclude 2.5.72-vanilla/net/sched/sch_ingress.c linux-2.5.72/net/sched/sch_ingress.c --- 2.5.72-vanilla/net/sched/sch_ingress.c 2003-06-14 14:18:29.000000000 -0500 +++ linux-2.5.72/net/sched/sch_ingress.c 2003-06-24 00:42:41.000000000 -0500 @@ -190,7 +190,7 @@ return 0; } 
-static int ingress_drop(struct Qdisc *sch) +static unsigned int ingress_drop(struct Qdisc *sch) { #ifdef DEBUG_INGRESS struct ingress_qdisc_data *p = PRIV(sch); diff -urN --exclude-from=/usr/src/exclude 2.5.72-vanilla/net/sched/sch_prio.c linux-2.5.72/net/sched/sch_prio.c --- 2.5.72-vanilla/net/sched/sch_prio.c 2003-06-14 14:18:08.000000000 -0500 +++ linux-2.5.72/net/sched/sch_prio.c 2003-06-24 00:29:37.000000000 -0500 @@ -124,18 +124,18 @@ } -static int -prio_drop(struct Qdisc* sch) +static unsigned int prio_drop(struct Qdisc* sch) { struct prio_sched_data *q = (struct prio_sched_data *)sch->data; int prio; + unsigned int len; struct Qdisc *qdisc; for (prio = q->bands-1; prio >= 0; prio--) { qdisc = q->queues[prio]; - if (qdisc->ops->drop(qdisc)) { + if ((len = qdisc->ops->drop(qdisc)) != 0) { sch->q.qlen--; - return 1; + return len; } } return 0; diff -urN --exclude-from=/usr/src/exclude 2.5.72-vanilla/net/sched/sch_red.c linux-2.5.72/net/sched/sch_red.c --- 2.5.72-vanilla/net/sched/sch_red.c 2003-06-14 14:18:50.000000000 -0500 +++ linux-2.5.72/net/sched/sch_red.c 2003-06-24 00:38:27.000000000 -0500 @@ -342,19 +342,19 @@ return NULL; } -static int -red_drop(struct Qdisc* sch) +static unsigned int red_drop(struct Qdisc* sch) { struct sk_buff *skb; struct red_sched_data *q = (struct red_sched_data *)sch->data; skb = __skb_dequeue_tail(&sch->q); if (skb) { - sch->stats.backlog -= skb->len; + unsigned int len = skb->len; + sch->stats.backlog -= len; sch->stats.drops++; q->st.other++; kfree_skb(skb); - return 1; + return len; } PSCHED_GET_TIME(q->qidlestart); return 0; diff -urN --exclude-from=/usr/src/exclude 2.5.72-vanilla/net/sched/sch_sfq.c linux-2.5.72/net/sched/sch_sfq.c --- 2.5.72-vanilla/net/sched/sch_sfq.c 2003-06-14 14:18:34.000000000 -0500 +++ linux-2.5.72/net/sched/sch_sfq.c 2003-06-24 00:44:14.000000000 -0500 @@ -209,11 +209,12 @@ sfq_link(q, x); } -static int sfq_drop(struct Qdisc *sch) +static unsigned int sfq_drop(struct Qdisc *sch) { struct 
sfq_sched_data *q = (struct sfq_sched_data *)sch->data; sfq_index d = q->max_depth; struct sk_buff *skb; + unsigned int len; /* Queue is full! Find the longest slot and drop a packet from it */ @@ -221,12 +222,13 @@ if (d > 1) { sfq_index x = q->dep[d+SFQ_DEPTH].next; skb = q->qs[x].prev; + len = skb->len; __skb_unlink(skb, &q->qs[x]); kfree_skb(skb); sfq_dec(q, x); sch->q.qlen--; sch->stats.drops++; - return 1; + return len; } if (d == 1) { @@ -235,13 +237,14 @@ q->next[q->tail] = q->next[d]; q->allot[q->next[d]] += q->quantum; skb = q->qs[d].prev; + len = skb->len; __skb_unlink(skb, &q->qs[d]); kfree_skb(skb); sfq_dec(q, d); sch->q.qlen--; q->ht[q->hash[d]] = SFQ_DEPTH; sch->stats.drops++; - return 1; + return len; } return 0; diff -urN --exclude-from=/usr/src/exclude 2.5.72-vanilla/net/sched/sch_tbf.c linux-2.5.72/net/sched/sch_tbf.c --- 2.5.72-vanilla/net/sched/sch_tbf.c 2003-06-14 14:17:57.000000000 -0500 +++ linux-2.5.72/net/sched/sch_tbf.c 2003-06-24 00:28:45.000000000 -0500 @@ -7,6 +7,8 @@ * 2 of the License, or (at your option) any later version. 
* * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs - + * original idea by Martin Devera * */ @@ -123,62 +125,64 @@ long ptokens; /* Current number of P tokens */ psched_time_t t_c; /* Time check-point */ struct timer_list wd_timer; /* Watchdog timer */ + struct Qdisc *qdisc; /* Inner qdisc, default - unlimited queue */ + struct sk_buff *skb; /* skb that we could not dequeue */ }; #define L2T(q,L) ((q)->R_tab->data[(L)>>(q)->R_tab->rate.cell_log]) #define L2T_P(q,L) ((q)->P_tab->data[(L)>>(q)->P_tab->rate.cell_log]) -static int -tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch) +static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch) { struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; + int ret; - if (skb->len > q->max_size) - goto drop; - __skb_queue_tail(&sch->q, skb); - if ((sch->stats.backlog += skb->len) <= q->limit) { - sch->stats.bytes += skb->len; - sch->stats.packets++; - return 0; - } - - /* Drop action: undo the things that we just did, - * i.e. 
make tail drop */ - - __skb_unlink(skb, &sch->q); - sch->stats.backlog -= skb->len; - -drop: - sch->stats.drops++; + if (skb->len > q->max_size || sch->stats.backlog + skb->len > q->limit) { + sch->stats.drops++; #ifdef CONFIG_NET_CLS_POLICE - if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch)) + if (sch->reshape_fail == NULL || sch->reshape_fail(skb, sch)) #endif - kfree_skb(skb); - return NET_XMIT_DROP; -} - -static int -tbf_requeue(struct sk_buff *skb, struct Qdisc* sch) -{ - __skb_queue_head(&sch->q, skb); + kfree_skb(skb); + + return NET_XMIT_DROP; + } + + if ((ret = q->qdisc->enqueue(skb, q->qdisc)) != 0) { + sch->stats.drops++; + return ret; + } + + sch->q.qlen++; sch->stats.backlog += skb->len; + sch->stats.bytes += skb->len; + sch->stats.packets++; return 0; } -static int -tbf_drop(struct Qdisc* sch) +static int tbf_requeue(struct sk_buff *skb, struct Qdisc* sch) { - struct sk_buff *skb; + struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; + int ret; + + if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) { + sch->q.qlen++; + sch->stats.backlog += skb->len; + } + + return ret; + } - skb = __skb_dequeue_tail(&sch->q); - if (skb) { - sch->stats.backlog -= skb->len; +static unsigned int tbf_drop(struct Qdisc* sch) +{ + struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; + unsigned int len; + + if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) { + sch->q.qlen--; + sch->stats.backlog -= len; sch->stats.drops++; - kfree_skb(skb); - return 1; } - return 0; + return len; } static void tbf_watchdog(unsigned long arg) @@ -189,13 +193,16 @@ netif_schedule(sch->dev); } -static struct sk_buff * -tbf_dequeue(struct Qdisc* sch) +static struct sk_buff *tbf_dequeue(struct Qdisc* sch) { struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; struct sk_buff *skb; - skb = __skb_dequeue(&sch->q); + /* See if we have a leftover packet from previous round, + if not then try to drain our queue. 
*/ + skb = q->skb; + if (!skb) + skb = q->qdisc->dequeue(q->qdisc); if (skb) { psched_time_t now; @@ -218,10 +225,12 @@ toks -= L2T(q, skb->len); if ((toks|ptoks) >= 0) { + q->skb = NULL; q->t_c = now; q->tokens = toks; q->ptokens = ptoks; sch->stats.backlog -= skb->len; + sch->q.qlen--; sch->flags &= ~TCQ_F_THROTTLED; return skb; } @@ -245,20 +254,21 @@ This is the main idea of all FQ algorithms (cf. CSZ, HPFQ, HFSC) */ - __skb_queue_head(&sch->q, skb); - + + /* Save skb - we will dequeue it as soon as we get a chance */ + q->skb = skb; + sch->flags |= TCQ_F_THROTTLED; sch->stats.overlimits++; } return NULL; } - -static void -tbf_reset(struct Qdisc* sch) +static void tbf_reset(struct Qdisc* sch) { struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; + qdisc_reset(q->qdisc); skb_queue_purge(&sch->q); sch->stats.backlog = 0; PSCHED_GET_TIME(q->t_c); @@ -276,6 +286,7 @@ struct tc_tbf_qopt *qopt; struct qdisc_rate_table *rtab = NULL; struct qdisc_rate_table *ptab = NULL; + struct Qdisc *child = NULL; int max_size,n; if (rtattr_parse(tb, TCA_TBF_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) || @@ -308,8 +319,14 @@ } if (max_size < 0) goto done; + + if (q->qdisc == &noop_qdisc) { + if ((child = qdisc_create_dflt(sch->dev, &ufifo_qdisc_ops)) == NULL) + goto done; + } sch_tree_lock(sch); + if (child) q->qdisc = child; q->limit = qopt->limit; q->mtu = qopt->mtu; q->max_size = max_size; @@ -339,6 +356,8 @@ init_timer(&q->wd_timer); q->wd_timer.function = tbf_watchdog; q->wd_timer.data = (unsigned long)sch; + + q->qdisc = &noop_qdisc; return tbf_change(sch, opt); } @@ -353,6 +372,9 @@ qdisc_put_rtab(q->P_tab); if (q->R_tab) qdisc_put_rtab(q->R_tab); + + qdisc_destroy(q->qdisc); + q->qdisc = &noop_qdisc; } static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) @@ -383,9 +405,92 @@ return -1; } +static int tbf_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct tbf_sched_data *q = (struct tbf_sched_data*)sch->data; + + 
if (cl != 1) /* only one class */ + return -ENOENT; + + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_handle = 1; + tcm->tcm_info = q->qdisc->handle; + + return 0; +} + +static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; + + if (new == NULL) + new = &noop_qdisc; + + sch_tree_lock(sch); + *old = xchg(&q->qdisc, new); + qdisc_reset(*old); + sch_tree_unlock(sch); + + return 0; +} + +static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; + return q->qdisc; +} + +static unsigned long tbf_get(struct Qdisc *sch, u32 classid) +{ + return 1; +} + +static void tbf_put(struct Qdisc *sch, unsigned long arg) +{ +} + +static int tbf_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + struct rtattr **tca, unsigned long *arg) +{ + return -ENOSYS; +} + +static int tbf_delete(struct Qdisc *sch, unsigned long arg) +{ + return -ENOSYS; +} + +static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker) +{ + struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; + + if (!walker->stop) { + if (walker->count >= walker->skip) + if (walker->fn(sch, (unsigned long)q, walker) < 0) { + walker->stop = 1; + return; + } + walker->count++; + } +} + +static struct Qdisc_class_ops tbf_class_ops = +{ + .graft = tbf_graft, + .leaf = tbf_leaf, + .get = tbf_get, + .put = tbf_put, + .change = tbf_change_class, + .delete = tbf_delete, + .walk = tbf_walk, + .dump = tbf_dump_class, +}; + struct Qdisc_ops tbf_qdisc_ops = { .next = NULL, - .cl_ops = NULL, + .cl_ops = &tbf_class_ops, .id = "tbf", .priv_size = sizeof(struct tbf_sched_data), .enqueue = tbf_enqueue, @@ -397,6 +502,7 @@ .destroy = tbf_destroy, .change = tbf_change, .dump = tbf_dump, + .owner = THIS_MODULE, }; - : send the line "unsubscribe linux-net" in the body of a message to majordomo@vger.kernel.org More majordomo info at 
http://vger.kernel.org/majordomo-info.html