On 12/04/2012 02:18 PM, Michal Privoznik wrote: > These classes can borrow unused bandwidth. Basically, > only egress qdsics can have classes, therefore we can > do this kind of traffic shaping only on host's outgoing, > that is domain's incoming traffic. > --- > src/lxc/lxc_process.c | 3 +- > src/network/bridge_driver.c | 3 +- > src/qemu/qemu_command.c | 3 +- > src/qemu/qemu_driver.c | 2 +- > src/util/virnetdevbandwidth.c | 93 +++++++++++++++++++++++++++++++++++++--- > src/util/virnetdevbandwidth.h | 4 +- > src/util/virnetdevmacvlan.c | 2 +- > 7 files changed, 97 insertions(+), 13 deletions(-) > > diff --git a/src/lxc/lxc_process.c b/src/lxc/lxc_process.c > index 8b9d02f..3937839 100644 > --- a/src/lxc/lxc_process.c > +++ b/src/lxc/lxc_process.c > @@ -337,7 +337,8 @@ static int virLXCProcessSetupInterfaceBridged(virConnectPtr conn, > goto cleanup; > > if (virNetDevBandwidthSet(net->ifname, > - virDomainNetGetActualBandwidth(net)) < 0) { > + virDomainNetGetActualBandwidth(net), > + false) < 0) { > virReportError(VIR_ERR_INTERNAL_ERROR, > _("cannot set bandwidth limits on %s"), > net->ifname); > diff --git a/src/network/bridge_driver.c b/src/network/bridge_driver.c > index 75f3c3a..63be5e2 100644 > --- a/src/network/bridge_driver.c > +++ b/src/network/bridge_driver.c > @@ -2290,7 +2290,8 @@ networkStartNetworkVirtual(struct network_driver *driver, > VIR_FORCE_CLOSE(tapfd); > } > > - if (virNetDevBandwidthSet(network->def->bridge, network->def->bandwidth) < 0) { > + if (virNetDevBandwidthSet(network->def->bridge, > + network->def->bandwidth, true) < 0) { > virReportError(VIR_ERR_INTERNAL_ERROR, > _("cannot set bandwidth limits on %s"), > network->def->bridge); > diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c > index 1805625..2b466dc 100644 > --- a/src/qemu/qemu_command.c > +++ b/src/qemu/qemu_command.c > @@ -292,7 +292,8 @@ qemuNetworkIfaceConnect(virDomainDefPtr def, > > if (tapfd >= 0 && > virNetDevBandwidthSet(net->ifname, > - virDomainNetGetActualBandwidth(net)) < 0) { > + virDomainNetGetActualBandwidth(net), > + false) < 0) { > virReportError(VIR_ERR_INTERNAL_ERROR, > _("cannot set bandwidth limits on %s"), > net->ifname); > diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c > index 8e838cd..259ac37 100644 > --- a/src/qemu/qemu_driver.c > +++ b/src/qemu/qemu_driver.c > @@ -8983,7 +8983,7 @@ qemuDomainSetInterfaceParameters(virDomainPtr dom, > sizeof(*newBandwidth->out)); > } > > - if (virNetDevBandwidthSet(net->ifname, newBandwidth) < 0) { > + if (virNetDevBandwidthSet(net->ifname, newBandwidth, false) < 0) { > virReportError(VIR_ERR_INTERNAL_ERROR, > _("cannot set bandwidth limits on %s"), > device); > diff --git a/src/util/virnetdevbandwidth.c b/src/util/virnetdevbandwidth.c > index 49fc425..71c272e 100644 > --- a/src/util/virnetdevbandwidth.c > +++ b/src/util/virnetdevbandwidth.c > @@ -45,17 +45,21 @@ virNetDevBandwidthFree(virNetDevBandwidthPtr def) > * virNetDevBandwidthSet: > * @ifname: on which interface > * @bandwidth: rates to set (may be NULL) > + * @hierarchical_class: whether to create hierarchical class > * > * This function enables QoS on specified interface > * and set given traffic limits for both, incoming > * and outgoing traffic. Any previous setting get > - * overwritten. > + * overwritten. If @hierarchical_class is TRUE, create > + * hierarchical class. It is used to guarantee minimal > + * throughput ('floor' attribute in NIC). > * > * Return 0 on success, -1 otherwise. > */ > int > virNetDevBandwidthSet(const char *ifname, > - virNetDevBandwidthPtr bandwidth) > + virNetDevBandwidthPtr bandwidth, > + bool hierarchical_class) > { > int ret = -1; > virCommandPtr cmd = NULL; > @@ -71,7 +75,7 @@ virNetDevBandwidthSet(const char *ifname, > > virNetDevBandwidthClear(ifname); > > - if (bandwidth->in) { > + if (bandwidth->in && bandwidth->in->average) { > if (virAsprintf(&average, "%llukbps", bandwidth->in->average) < 0) > goto cleanup; > if (bandwidth->in->peak && > @@ -83,15 +87,89 @@ virNetDevBandwidthSet(const char *ifname, > > cmd = virCommandNew(TC); > virCommandAddArgList(cmd, "qdisc", "add", "dev", ifname, "root", > - "handle", "1:", "htb", "default", "1", NULL); > + "handle", "1:", "htb", "default", > + hierarchical_class ? "2" : "1", NULL); > if (virCommandRun(cmd, NULL) < 0) > goto cleanup; > > + /* If we are creating a hierarchical class, all non guaranteed traffic > + * goes to the 1:2 class which will adjust 'rate' dynamically as NICs > + * with guaranteed throughput are plugged and unplugged. Class 1:1 > + * exists so we don't exceed the maximum limit for the network. For each > + * NIC with guaranteed throughput a separate classid will be created. > + * NB '1:' is just a shorter notation of '1:0'. > + * > + * To get a picture how this works: > + * > + * +-----+ +---------+ +-----------+ +-----------+ +-----+ > + * | | | qdisc | | class 1:1 | | class 1:2 | | | > + * | NIC | | def 1:2 | | rate | | rate | | sfq | > + * | | --> | | --> | peak | -+-> | peak | --> | | > + * +-----+ +---------+ +-----------+ | +-----------+ +-----+ > + * | > + * | +-----------+ +-----+ > + * | | class 1:3 | | | > + * | | rate | | sfq | > + * +-> | peak | --> | | > + * | +-----------+ +-----+ > + * ... > + * | +-----------+ +-----+ > + * | | class 1:n | | | > + * | | rate | | sfq | > + * +-> | peak | --> | | > + * +-----------+ +-----+ > + * > + * After the routing decision, when is it clear a packet is to be sent > + * via a particular NIC, it is sent to the root qdisc (queueing > + * discipline). In this case HTB (Hierarchical Token Bucket). It has > + * only one direct child class (with id 1:1) which shapes the overall > + * rate that is sent through the NIC. This class has at least one child > + * (1:2) which is meant for all non-privileged (non guaranteed) traffic > + * from all domains. Then, for each interface with guaranteed > + * throughput, a separate class (1:n) is created. Imagine a class is a > + * box. Whenever a packet ends up in a class it is stored in this box > + * until the kernel sends it, then it is removed from box. Packets are > + * placed into boxes based on rules (filters) - e.g. depending on > + * destination IP/MAC address. If there is no rule to be applied, the > + * root qdisc has a default where such packets go (1:2 in this case). > + * Packets come in over and over again and boxes get filled more and > + * more. Imagine that kernel sends packets just once a second. So it > + * starts to traverse through this tree. It starts with the root qdisc > + * and through 1:1 it gets to 1:2. It sends packets up to 1:2's 'rate'. > + * Then it moves to 1:3 and again sends packets up to 1:3's 'rate'. The > + * whole process is repeated until 1:n is processed. So now we have > + * ensured each class its guaranteed bandwidth. If the sum of sent data > + * doesn't exceed the 'rate' in 1:1 class, we can go further and send > + * more packets. The rest of available bandwidth is distributed to the > + * 1:2,1:3...1:n classes by ratio of their 'rate'. As soon as the root > + * 'rate' limit is reached or there are no more packets to send, we stop > + * sending and wait another second. Each class has an SFQ qdisc which > + * shuffles packets in boxes stochastically, so one sender cannot > + * starve others. > + * > + * Therefore, whenever we want to plug in a new guaranteed interface, we > + * need to create a new class and adjust the 'rate' of the 1:2 class. > + * When unplugging we do the exact opposite - remove the associated > + * class, and adjust the 'rate'. > + * > + * This description is rather long, but it is still a good idea to read > + * it before you dig into the code. > + */ > + if (hierarchical_class) { > + virCommandFree(cmd); > + cmd = virCommandNew(TC); > + virCommandAddArgList(cmd, "class", "add", "dev", ifname, "parent", > + "1:", "classid", "1:1", "htb", "rate", average, > + "ceil", peak ? peak : average, NULL); > + if (virCommandRun(cmd, NULL) < 0) > + goto cleanup; > + } One twist to this that I thought about - when you first upgrade to a version of libvirt that supports floor, you will most likely already have some networks and interfaces running that don't have the hierarchical class setup, so it won't work properly until you bring down all the domains and networks and restart them. I'm not sure what the best solution to this particular version of the live upgrade problem is, though, so I'm not going to let it stop me from giving an ACK. > virCommandFree(cmd); > cmd = virCommandNew(TC); > virCommandAddArgList(cmd,"class", "add", "dev", ifname, "parent", > - "1:", "classid", "1:1", "htb", NULL); > - virCommandAddArgList(cmd, "rate", average, NULL); > + hierarchical_class ? "1:1" : "1:", "classid", > + hierarchical_class ? "1:2" : "1:1", "htb", > + "rate", average, NULL); > > if (peak) > virCommandAddArgList(cmd, "ceil", peak, NULL); > @@ -104,7 +182,8 @@ virNetDevBandwidthSet(const char *ifname, > virCommandFree(cmd); > cmd = virCommandNew(TC); > virCommandAddArgList(cmd, "qdisc", "add", "dev", ifname, "parent", > - "1:1", "handle", "2:", "sfq", "perturb", > + hierarchical_class ? "1:2" : "1:1", > + "handle", "2:", "sfq", "perturb", > "10", NULL); > > if (virCommandRun(cmd, NULL) < 0) > diff --git a/src/util/virnetdevbandwidth.h b/src/util/virnetdevbandwidth.h > index 35f8b89..d308ab2 100644 > --- a/src/util/virnetdevbandwidth.h > +++ b/src/util/virnetdevbandwidth.h > @@ -42,7 +42,9 @@ struct _virNetDevBandwidth { > > void virNetDevBandwidthFree(virNetDevBandwidthPtr def); > > -int virNetDevBandwidthSet(const char *ifname, virNetDevBandwidthPtr bandwidth) > +int virNetDevBandwidthSet(const char *ifname, > + virNetDevBandwidthPtr bandwidth, > + bool hierarchical_class) > ATTRIBUTE_NONNULL(1) ATTRIBUTE_RETURN_CHECK; > int virNetDevBandwidthClear(const char *ifname) > ATTRIBUTE_NONNULL(1); > diff --git a/src/util/virnetdevmacvlan.c b/src/util/virnetdevmacvlan.c > index d8e646a..657c484 100644 > --- a/src/util/virnetdevmacvlan.c > +++ b/src/util/virnetdevmacvlan.c > @@ -925,7 +925,7 @@ create_name: > rc = 0; > } > > - if (virNetDevBandwidthSet(cr_ifname, bandwidth) < 0) { > + if (virNetDevBandwidthSet(cr_ifname, bandwidth, false) < 0) { > virReportError(VIR_ERR_INTERNAL_ERROR, > _("cannot set bandwidth limits on %s"), > cr_ifname); ACK (I'd already ACKed this one, pending answers to my questions). -- libvir-list mailing list libvir-list@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/libvir-list