On 12/11/2012 11:09 AM, Michal Privoznik wrote: > These classes can borrow unused bandwidth. Basically, > only egress qdsics can have classes, therefore we can s/qdsic/qdisc/ > do this kind of traffic shaping only on host's outgoing, > that is domain's incoming traffic. > --- > src/lxc/lxc_process.c | 3 +- > src/network/bridge_driver.c | 3 +- > src/qemu/qemu_command.c | 3 +- > src/qemu/qemu_driver.c | 2 +- > src/util/virnetdevbandwidth.c | 93 +++++++++++++++++++++++++++++++++++++--- > src/util/virnetdevbandwidth.h | 4 +- > src/util/virnetdevmacvlan.c | 2 +- > 7 files changed, 97 insertions(+), 13 deletions(-) > > diff --git a/src/lxc/lxc_process.c b/src/lxc/lxc_process.c > index 50c61c5..3e7fcb8 100644 > --- a/src/lxc/lxc_process.c > +++ b/src/lxc/lxc_process.c > @@ -341,7 +341,8 @@ static int virLXCProcessSetupInterfaceBridged(virConnectPtr conn, > goto cleanup; > > if (virNetDevBandwidthSet(net->ifname, > - virDomainNetGetActualBandwidth(net)) < 0) { > + virDomainNetGetActualBandwidth(net), > + false) < 0) { > virReportError(VIR_ERR_INTERNAL_ERROR, > _("cannot set bandwidth limits on %s"), > net->ifname); > diff --git a/src/network/bridge_driver.c b/src/network/bridge_driver.c > index 00cffee..58f1d2e 100644 > --- a/src/network/bridge_driver.c > +++ b/src/network/bridge_driver.c > @@ -2284,7 +2284,8 @@ networkStartNetworkVirtual(struct network_driver *driver, > VIR_FORCE_CLOSE(tapfd); > } > > - if (virNetDevBandwidthSet(network->def->bridge, network->def->bandwidth) < 0) { > + if (virNetDevBandwidthSet(network->def->bridge, > + network->def->bandwidth, true) < 0) { > virReportError(VIR_ERR_INTERNAL_ERROR, > _("cannot set bandwidth limits on %s"), > network->def->bridge); > diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c > index 9009bd2..e10eb09 100644 > --- a/src/qemu/qemu_command.c > +++ b/src/qemu/qemu_command.c > @@ -292,7 +292,8 @@ qemuNetworkIfaceConnect(virDomainDefPtr def, > > if (tapfd >= 0 && > virNetDevBandwidthSet(net->ifname, > - 
virDomainNetGetActualBandwidth(net)) < 0) { > + virDomainNetGetActualBandwidth(net), > + false) < 0) { > virReportError(VIR_ERR_INTERNAL_ERROR, > _("cannot set bandwidth limits on %s"), > net->ifname); > diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c > index d449579..e6ae3fd 100644 > --- a/src/qemu/qemu_driver.c > +++ b/src/qemu/qemu_driver.c > @@ -9034,7 +9034,7 @@ qemuDomainSetInterfaceParameters(virDomainPtr dom, > sizeof(*newBandwidth->out)); > } > > - if (virNetDevBandwidthSet(net->ifname, newBandwidth) < 0) { > + if (virNetDevBandwidthSet(net->ifname, newBandwidth, false) < 0) { > virReportError(VIR_ERR_INTERNAL_ERROR, > _("cannot set bandwidth limits on %s"), > device); > diff --git a/src/util/virnetdevbandwidth.c b/src/util/virnetdevbandwidth.c > index 49fc425..71c272e 100644 > --- a/src/util/virnetdevbandwidth.c > +++ b/src/util/virnetdevbandwidth.c > @@ -45,17 +45,21 @@ virNetDevBandwidthFree(virNetDevBandwidthPtr def) > * virNetDevBandwidthSet: > * @ifname: on which interface > * @bandwidth: rates to set (may be NULL) > + * @hierarchical_class: whether to create hierarchical class > * > * This function enables QoS on specified interface > * and set given traffic limits for both, incoming > * and outgoing traffic. Any previous setting get > - * overwritten. > + * overwritten. If @hierarchical_class is TRUE, create > + * hierarchical class. It is used to guarantee minimal > + * throughput ('floor' attribute in NIC). > * > * Return 0 on success, -1 otherwise. 
> */ > int > virNetDevBandwidthSet(const char *ifname, > - virNetDevBandwidthPtr bandwidth) > + virNetDevBandwidthPtr bandwidth, > + bool hierarchical_class) > { > int ret = -1; > virCommandPtr cmd = NULL; > @@ -71,7 +75,7 @@ virNetDevBandwidthSet(const char *ifname, > > virNetDevBandwidthClear(ifname); > > - if (bandwidth->in) { > + if (bandwidth->in && bandwidth->in->average) { > if (virAsprintf(&average, "%llukbps", bandwidth->in->average) < 0) > goto cleanup; > if (bandwidth->in->peak && > @@ -83,15 +87,89 @@ virNetDevBandwidthSet(const char *ifname, > > cmd = virCommandNew(TC); > virCommandAddArgList(cmd, "qdisc", "add", "dev", ifname, "root", > - "handle", "1:", "htb", "default", "1", NULL); > + "handle", "1:", "htb", "default", > + hierarchical_class ? "2" : "1", NULL); > if (virCommandRun(cmd, NULL) < 0) > goto cleanup; > > + /* If we are creating a hierarchical class, all non guaranteed traffic > + * goes to the 1:2 class which will adjust 'rate' dynamically as NICs > + * with guaranteed throughput are plugged and unplugged. Class 1:1 > + * exists so we don't exceed the maximum limit for the network. For each > + * NIC with guaranteed throughput a separate classid will be created. > + * NB '1:' is just a shorter notation of '1:0'. > + * > + * To get a picture how this works: > + * > + * +-----+ +---------+ +-----------+ +-----------+ +-----+ > + * | | | qdisc | | class 1:1 | | class 1:2 | | | > + * | NIC | | def 1:2 | | rate | | rate | | sfq | > + * | | --> | | --> | peak | -+-> | peak | --> | | > + * +-----+ +---------+ +-----------+ | +-----------+ +-----+ > + * | > + * | +-----------+ +-----+ > + * | | class 1:3 | | | > + * | | rate | | sfq | > + * +-> | peak | --> | | > + * | +-----------+ +-----+ > + * ... 
> + * | +-----------+ +-----+ > + * | | class 1:n | | | > + * | | rate | | sfq | > + * +-> | peak | --> | | > + * +-----------+ +-----+ > + * > + * After the routing decision, when it is clear a packet is to be sent > + * via a particular NIC, it is sent to the root qdisc (queueing > + * discipline). In this case HTB (Hierarchical Token Bucket). It has > + * only one direct child class (with id 1:1) which shapes the overall > + * rate that is sent through the NIC. This class has at least one child > + * (1:2) which is meant for all non-privileged (non guaranteed) traffic > + * from all domains. Then, for each interface with guaranteed > + * throughput, a separate class (1:n) is created. Imagine a class is a > + * box. Whenever a packet ends up in a class it is stored in this box > + * until the kernel sends it, then it is removed from the box. Packets are > + * placed into boxes based on rules (filters) - e.g. depending on > + * destination IP/MAC address. If there is no rule to be applied, the > + * root qdisc has a default where such packets go (1:2 in this case). > + * Packets come in over and over again and boxes get filled more and > + * more. Imagine that the kernel sends packets just once a second. So it > + * starts to traverse through this tree. It starts with the root qdisc > + * and through 1:1 it gets to 1:2. It sends packets up to 1:2's 'rate'. > + * Then it moves to 1:3 and again sends packets up to 1:3's 'rate'. The > + * whole process is repeated until 1:n is processed. So now we have > + * ensured each class its guaranteed bandwidth. If the sum of sent data > + * doesn't exceed the 'rate' in 1:1 class, we can go further and send > + * more packets. The rest of the available bandwidth is distributed to the > + * 1:2,1:3...1:n classes by ratio of their 'rate'. As soon as the root > + * 'rate' limit is reached or there are no more packets to send, we stop > + * sending and wait another second. 
Each class has an SFQ qdisc which > + * shuffles packets in boxes stochastically, so one sender cannot > + * starve others. > + * > + * Therefore, whenever we want to plug in a new guaranteed interface, we > + * need to create a new class and adjust the 'rate' of the 1:2 class. > + * When unplugging we do the exact opposite - remove the associated > + * class, and adjust the 'rate'. > + * > + * This description is rather long, but it is still a good idea to read > + * it before you dig into the code. > + */ > + if (hierarchical_class) { > + virCommandFree(cmd); > + cmd = virCommandNew(TC); > + virCommandAddArgList(cmd, "class", "add", "dev", ifname, "parent", > + "1:", "classid", "1:1", "htb", "rate", average, > + "ceil", peak ? peak : average, NULL); > + if (virCommandRun(cmd, NULL) < 0) > + goto cleanup; > + } > virCommandFree(cmd); > cmd = virCommandNew(TC); > virCommandAddArgList(cmd,"class", "add", "dev", ifname, "parent", > - "1:", "classid", "1:1", "htb", NULL); > - virCommandAddArgList(cmd, "rate", average, NULL); > + hierarchical_class ? "1:1" : "1:", "classid", > + hierarchical_class ? "1:2" : "1:1", "htb", > + "rate", average, NULL); > > if (peak) > virCommandAddArgList(cmd, "ceil", peak, NULL); > @@ -104,7 +182,8 @@ virNetDevBandwidthSet(const char *ifname, > virCommandFree(cmd); > cmd = virCommandNew(TC); > virCommandAddArgList(cmd, "qdisc", "add", "dev", ifname, "parent", > - "1:1", "handle", "2:", "sfq", "perturb", > + hierarchical_class ? 
"1:2" : "1:1", > + "handle", "2:", "sfq", "perturb", > "10", NULL); > > if (virCommandRun(cmd, NULL) < 0) > diff --git a/src/util/virnetdevbandwidth.h b/src/util/virnetdevbandwidth.h > index 35f8b89..d308ab2 100644 > --- a/src/util/virnetdevbandwidth.h > +++ b/src/util/virnetdevbandwidth.h > @@ -42,7 +42,9 @@ struct _virNetDevBandwidth { > > void virNetDevBandwidthFree(virNetDevBandwidthPtr def); > > -int virNetDevBandwidthSet(const char *ifname, virNetDevBandwidthPtr bandwidth) > +int virNetDevBandwidthSet(const char *ifname, > + virNetDevBandwidthPtr bandwidth, > + bool hierarchical_class) > ATTRIBUTE_NONNULL(1) ATTRIBUTE_RETURN_CHECK; > int virNetDevBandwidthClear(const char *ifname) > ATTRIBUTE_NONNULL(1); > diff --git a/src/util/virnetdevmacvlan.c b/src/util/virnetdevmacvlan.c > index d8e646a..657c484 100644 > --- a/src/util/virnetdevmacvlan.c > +++ b/src/util/virnetdevmacvlan.c > @@ -925,7 +925,7 @@ create_name: > rc = 0; > } > > - if (virNetDevBandwidthSet(cr_ifname, bandwidth) < 0) { > + if (virNetDevBandwidthSet(cr_ifname, bandwidth, false) < 0) { > virReportError(VIR_ERR_INTERNAL_ERROR, > _("cannot set bandwidth limits on %s"), > cr_ifname); ACK. -- libvir-list mailing list libvir-list@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/libvir-list