On Fri, Dec 28, 2012 at 06:31:53PM +0800, Jason Wang wrote: Mainly suggestions to make the code easier to understand, but see the comment about the 1:1 queue/NetClientState model for a general issue with this approach. > Recently, linux support multiqueue tap which could let userspace call TUNSETIFF > for a signle device many times to create multiple file descriptors as s/signle/single/ (Noting these if you respin.) > independent queues. User could also enable/disabe a specific queue through s/disabe/disable/ > TUNSETQUEUE. > > The patch adds the generic infrastructure to create multiqueue taps. To achieve > this a new parameter "queues" were introduced to specify how many queues were > expected to be created for tap. The "fd" parameter were also changed to support > a list of file descriptors which could be used by management (such as libvirt) > to pass pre-created file descriptors (queues) to qemu. > > Each TAPState were still associated to a tap fd, which mean multiple TAPStates > were created when user needs multiqueue taps. > > Only linux part were implemented now, since it's the only OS that support > multiqueue tap. > > Signed-off-by: Jason Wang <jasowang@xxxxxxxxxx> > --- > net/tap-aix.c | 18 ++++- > net/tap-bsd.c | 18 ++++- > net/tap-haiku.c | 18 ++++- > net/tap-linux.c | 70 +++++++++++++++- > net/tap-linux.h | 4 + > net/tap-solaris.c | 18 ++++- > net/tap-win32.c | 10 ++ > net/tap.c | 248 +++++++++++++++++++++++++++++++++++++---------------- > net/tap.h | 8 ++- > qapi-schema.json | 5 +- > 10 files changed, 335 insertions(+), 82 deletions(-) This patch should be split up: 1. linux-headers: import linux/if_tun.h multiqueue constants 2. tap: add Linux multiqueue support (tap_open(), tap_fd_attach(), tap_fd_detach()) 3. tap: queue attach/detach (tap_attach(), tap_detach()) 4. tap: split out net_init_one_tap() function (pure code motion, to make later diffs easy to review) 5. tap: add "queues" and multi-"fd" options (net_init_tap()/net_init_one_tap() changes) Each commit description can explain how this works in more detail. I think I've figured it out now but it would have helped to separate things out from the start. > diff --git a/net/tap-aix.c b/net/tap-aix.c > index f27c177..f931ef3 100644 > --- a/net/tap-aix.c > +++ b/net/tap-aix.c > @@ -25,7 +25,8 @@ > #include "net/tap.h" > #include <stdio.h> > > -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required) > +int tap_open(char *ifname, int ifname_size, int *vnet_hdr, > + int vnet_hdr_required, int mq_required) > { > fprintf(stderr, "no tap on AIX\n"); > return -1; > @@ -59,3 +60,18 @@ void tap_fd_set_offload(int fd, int csum, int tso4, > int tso6, int ecn, int ufo) > { > } > + > +int tap_fd_attach(int fd) > +{ > + return -1; > +} > + > +int tap_fd_detach(int fd) > +{ > + return -1; > +} > + > +int tap_fd_ifname(int fd, char *ifname) > +{ > + return -1; > +} > diff --git a/net/tap-bsd.c b/net/tap-bsd.c > index a3b717d..07c287d 100644 > --- a/net/tap-bsd.c > +++ b/net/tap-bsd.c > @@ -33,7 +33,8 @@ > #include <net/if_tap.h> > #endif > > -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required) > +int tap_open(char *ifname, int ifname_size, int *vnet_hdr, > + int vnet_hdr_required, int mq_required) > { > int fd; > #ifdef TAPGIFNAME > @@ -145,3 +146,18 @@ void tap_fd_set_offload(int fd, int csum, int tso4, > int tso6, int ecn, int ufo) > { > } > + > +int tap_fd_attach(int fd) > +{ > + return -1; > +} > + > +int tap_fd_detach(int fd) > +{ > + return -1; > +} > + > +int tap_fd_ifname(int fd, char *ifname) > +{ > + return -1; > +} > diff --git a/net/tap-haiku.c b/net/tap-haiku.c > index 34739d1..62ab423 100644 > --- a/net/tap-haiku.c > +++ b/net/tap-haiku.c > @@ -25,7 +25,8 @@ > #include "net/tap.h" > #include <stdio.h> > > -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required) > +int tap_open(char *ifname, int ifname_size, int *vnet_hdr, > + int vnet_hdr_required, int mq_required) > { > fprintf(stderr, "no tap on Haiku\n"); > return -1; > @@ -59,3 +60,18 @@ void tap_fd_set_offload(int fd, int csum, int tso4, > int tso6, int ecn, int ufo) > { > } > + > +int tap_fd_attach(int fd) > +{ > + return -1; > +} > + > +int tap_fd_detach(int fd) > +{ > + return -1; > +} > + > +int tap_fd_ifname(int fd, char *ifname) > +{ > + return -1; > +} > diff --git a/net/tap-linux.c b/net/tap-linux.c > index c6521be..0854ef5 100644 > --- a/net/tap-linux.c > +++ b/net/tap-linux.c > @@ -35,7 +35,8 @@ > > #define PATH_NET_TUN "/dev/net/tun" > > -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required) > +int tap_open(char *ifname, int ifname_size, int *vnet_hdr, > + int vnet_hdr_required, int mq_required) > { > struct ifreq ifr; > int fd, ret; > @@ -67,6 +68,20 @@ int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required > } > } > > + if (mq_required) { > + unsigned int features; > + > + if ((ioctl(fd, TUNGETFEATURES, &features) != 0) || > + !(features & IFF_MULTI_QUEUE)) { > + error_report("multiqueue required, but no kernel " > + "support for IFF_MULTI_QUEUE available"); > + close(fd); > + return -1; > + } else { > + ifr.ifr_flags |= IFF_MULTI_QUEUE; > + } > + } > + > if (ifname[0] != '\0') > pstrcpy(ifr.ifr_name, IFNAMSIZ, ifname); > else > @@ -200,3 +215,56 @@ void tap_fd_set_offload(int fd, int csum, int tso4, > } > } > } > + > +/* Attach a file descriptor to a TUN/TAP device. This descriptor should be > + * detached before. > + */ > +int tap_fd_attach(int fd) > +{ > + struct ifreq ifr; > + int ret; > + > + memset(&ifr, 0, sizeof(ifr)); > + > + ifr.ifr_flags = IFF_ATTACH_QUEUE; > + ret = ioctl(fd, TUNSETQUEUE, (void *) &ifr); > + > + if (ret != 0) { > + error_report("could not attach fd to tap"); > + } > + > + return ret; > +} > + > +/* Detach a file descriptor to a TUN/TAP device. This file descriptor must have > + * been attach to a device. > + */ > +int tap_fd_detach(int fd) > +{ > + struct ifreq ifr; > + int ret; > + > + memset(&ifr, 0, sizeof(ifr)); > + > + ifr.ifr_flags = IFF_DETACH_QUEUE; > + ret = ioctl(fd, TUNSETQUEUE, (void *) &ifr); > + > + if (ret != 0) { > + error_report("could not detach fd"); > + } > + > + return ret; > +} > + > +int tap_get_ifname(int fd, char *ifname) Please document that ifname must have IFNAMSIZ size. > +{ > + struct ifreq ifr; > + > + if (ioctl(fd, TUNGETIFF, &ifr) != 0) { > + error_report("TUNGETIFF ioctl() failed: %s", strerror(errno)); > + return -1; > + } > + > + pstrcpy(ifname, sizeof(ifr.ifr_name), ifr.ifr_name); > + return 0; > +} > diff --git a/net/tap-linux.h b/net/tap-linux.h > index 659e981..648d29f 100644 > --- a/net/tap-linux.h > +++ b/net/tap-linux.h > @@ -29,6 +29,7 @@ > #define TUNSETSNDBUF _IOW('T', 212, int) > #define TUNGETVNETHDRSZ _IOR('T', 215, int) > #define TUNSETVNETHDRSZ _IOW('T', 216, int) > +#define TUNSETQUEUE _IOW('T', 217, int) > > #endif > > @@ -36,6 +37,9 @@ > #define IFF_TAP 0x0002 > #define IFF_NO_PI 0x1000 > #define IFF_VNET_HDR 0x4000 > +#define IFF_MULTI_QUEUE 0x0100 > +#define IFF_ATTACH_QUEUE 0x0200 > +#define IFF_DETACH_QUEUE 0x0400 > > /* Features for GSO (TUNSETOFFLOAD). */ > #define TUN_F_CSUM 0x01 /* You can hand me unchecksummed packets. */ > diff --git a/net/tap-solaris.c b/net/tap-solaris.c > index 5d6ac42..2df3ec1 100644 > --- a/net/tap-solaris.c > +++ b/net/tap-solaris.c > @@ -173,7 +173,8 @@ static int tap_alloc(char *dev, size_t dev_size) > return tap_fd; > } > > -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required) > +int tap_open(char *ifname, int ifname_size, int *vnet_hdr, > + int vnet_hdr_required, int mq_required) > { > char dev[10]=""; > int fd; > @@ -225,3 +226,18 @@ void tap_fd_set_offload(int fd, int csum, int tso4, > int tso6, int ecn, int ufo) > { > } > + > +int tap_fd_attach(int fd) > +{ > + return -1; > +} > + > +int tap_fd_detach(int fd) > +{ > + return -1; > +} > + > +int tap_fd_ifname(int fd, char *ifname) > +{ > + return -1; > +} > diff --git a/net/tap-win32.c b/net/tap-win32.c > index f9bd741..d7b1f7a 100644 > --- a/net/tap-win32.c > +++ b/net/tap-win32.c > @@ -763,3 +763,13 @@ void tap_set_vnet_hdr_len(NetClientState *nc, int len) > { > assert(0); > } > + > +int tap_attach(NetClientState *nc) > +{ > + assert(0); > +} > + > +int tap_detach(NetClientState *nc) > +{ > + assert(0); > +} > diff --git a/net/tap.c b/net/tap.c > index 1abfd44..01f826a 100644 > --- a/net/tap.c > +++ b/net/tap.c > @@ -60,6 +60,7 @@ typedef struct TAPState { > unsigned int write_poll : 1; > unsigned int using_vnet_hdr : 1; > unsigned int has_ufo: 1; > + unsigned int enabled:1; For consistency, please use "enabled : 1". > VHostNetState *vhost_net; > unsigned host_vnet_hdr_len; > } TAPState; > @@ -73,9 +74,9 @@ static void tap_writable(void *opaque); > static void tap_update_fd_handler(TAPState *s) > { > qemu_set_fd_handler2(s->fd, > - s->read_poll ? tap_can_send : NULL, > - s->read_poll ? tap_send : NULL, > - s->write_poll ? tap_writable : NULL, > + s->read_poll && s->enabled ? tap_can_send : NULL, > + s->read_poll && s->enabled ? tap_send : NULL, > + s->write_poll && s->enabled ? tap_writable : NULL, > s); > } > > @@ -340,6 +341,7 @@ static TAPState *net_tap_fd_init(NetClientState *peer, > s->host_vnet_hdr_len = vnet_hdr ? sizeof(struct virtio_net_hdr) : 0; > s->using_vnet_hdr = 0; > s->has_ufo = tap_probe_has_ufo(s->fd); > + s->enabled = 1; > tap_set_offload(&s->nc, 0, 0, 0, 0, 0); > /* > * Make sure host header length is set correctly in tap: > @@ -559,17 +561,10 @@ int net_init_bridge(const NetClientOptions *opts, const char *name, > > static int net_tap_init(const NetdevTapOptions *tap, int *vnet_hdr, > const char *setup_script, char *ifname, > - size_t ifname_sz) > + size_t ifname_sz, int mq_required) > { > int fd, vnet_hdr_required; > > - if (tap->has_ifname) { > - pstrcpy(ifname, ifname_sz, tap->ifname); > - } else { > - assert(ifname_sz > 0); > - ifname[0] = '\0'; > - } > - > if (tap->has_vnet_hdr) { > *vnet_hdr = tap->vnet_hdr; > vnet_hdr_required = *vnet_hdr; > @@ -578,7 +573,8 @@ static int net_tap_init(const NetdevTapOptions *tap, int *vnet_hdr, > vnet_hdr_required = 0; > } > > - TFR(fd = tap_open(ifname, ifname_sz, vnet_hdr, vnet_hdr_required)); > + TFR(fd = tap_open(ifname, ifname_sz, vnet_hdr, vnet_hdr_required, > + mq_required)); > if (fd < 0) { > return -1; > } > @@ -594,69 +590,37 @@ static int net_tap_init(const NetdevTapOptions *tap, int *vnet_hdr, > return fd; > } > > -int net_init_tap(const NetClientOptions *opts, const char *name, > - NetClientState *peer) > -{ > - const NetdevTapOptions *tap; > - > - int fd, vnet_hdr = 0; > - const char *model; > - TAPState *s; > +#define MAX_TAP_QUEUES 1024 > > - /* for the no-fd, no-helper case */ > - const char *script = NULL; /* suppress wrong "uninit'd use" gcc warning */ > - char ifname[128]; > - > - assert(opts->kind == NET_CLIENT_OPTIONS_KIND_TAP); > - tap = opts->tap; > - > - if (tap->has_fd) { > - if (tap->has_ifname || tap->has_script || tap->has_downscript || > - tap->has_vnet_hdr || tap->has_helper) { > - error_report("ifname=, script=, downscript=, vnet_hdr=, " > - "and helper= are invalid with fd="); > - return -1; > - } > - > - fd = monitor_handle_fd_param(cur_mon, tap->fd); > - if (fd == -1) { > - return -1; > - } > - > - fcntl(fd, F_SETFL, O_NONBLOCK); > - > - vnet_hdr = tap_probe_vnet_hdr(fd); > - > - model = "tap"; > - > - } else if (tap->has_helper) { > - if (tap->has_ifname || tap->has_script || tap->has_downscript || > - tap->has_vnet_hdr) { > - error_report("ifname=, script=, downscript=, and vnet_hdr= " > - "are invalid with helper="); > - return -1; > - } > - > - fd = net_bridge_run_helper(tap->helper, DEFAULT_BRIDGE_INTERFACE); > - if (fd == -1) { > - return -1; > - } > +static int tap_fd(const StringList *fd, const char **fds) This function can be dropped if you change it so the "queues" parameter is not used together with "fd". There's no need to pass both: it simply adds more code to check they are consistent and is a pain for human users. Then you can iterate the StringList directly in __net_init_tap() without the needs for the temporary fds[] array. In other words: 1. For multiqueue without fd passing, use queues=<n>. 2. For multiqueue with fd passing, use fd=<fd>. > +{ > + const StringList *c = fd; > + size_t i = 0, num_opts = 0; > > - fcntl(fd, F_SETFL, O_NONBLOCK); > + while (c) { > + num_opts++; > + c = c->next; > + } > > - vnet_hdr = tap_probe_vnet_hdr(fd); > + if (num_opts == 0) { > + return 0; > + } > > - model = "bridge"; > + c = fd; > + while (c) { > + fds[i++] = c->value->str; > + c = c->next; > + } > > - } else { > - script = tap->has_script ? tap->script : DEFAULT_NETWORK_SCRIPT; > - fd = net_tap_init(tap, &vnet_hdr, script, ifname, sizeof ifname); > - if (fd == -1) { > - return -1; > - } > + return num_opts; > +} > > - model = "tap"; > - } > +static int __net_init_tap(const NetdevTapOptions *tap, NetClientState *peer, > + const char *model, const char *name, > + const char *ifname, const char *script, > + const char *downscript, int vnet_hdr, int fd) Function names starting with underscore are avoided in QEMU. According to the C standard these names are reserved. Please rename, how about net_init_one_tap()? > +{ > + TAPState *s; > > s = net_tap_fd_init(peer, model, name, fd, vnet_hdr); Every queue has the same name so qemu_find_netdev() doesn't work anymore. I think we need snprintf(queue_name, "%s.%d", name, queue_index). The model where we have one NetClientState per queue has a few other issues. Maybe you have adressed these in later patches: 1. netdev_del doesn't work because it only deleted 1 queue! 2. set_link changes link up/down for a single queue only 3. info network output will show many more entries now - I doubt management tools like libvirt are prepared to handle this and they may show 1 network interface per queue now! I think it's very likely that this simple 1:1 queue/NetClientState model won't work without more changes. > if (!s) { > @@ -674,11 +638,6 @@ int net_init_tap(const NetClientOptions *opts, const char *name, > snprintf(s->nc.info_str, sizeof(s->nc.info_str), "helper=%s", > tap->helper); > } else { > - const char *downscript; > - > - downscript = tap->has_downscript ? tap->downscript : > - DEFAULT_NETWORK_DOWN_SCRIPT; > - > snprintf(s->nc.info_str, sizeof(s->nc.info_str), > "ifname=%s,script=%s,downscript=%s", ifname, script, > downscript); > @@ -716,9 +675,150 @@ int net_init_tap(const NetClientOptions *opts, const char *name, > return 0; > } > > +int net_init_tap(const NetClientOptions *opts, const char *name, > + NetClientState *peer) > +{ > + const NetdevTapOptions *tap; > + const char *fds[MAX_TAP_QUEUES]; Not a good idea to duplicate a hard-coded value from the tun driver. I suggested how to get rid of fds[] above, that way the tun driver could change this limit in the future without requiring a QEMU change too. > + int fd, vnet_hdr = 0, i, queues; > + /* for the no-fd, no-helper case */ > + const char *script = NULL; /* suppress wrong "uninit'd use" gcc warning */ > + const char *downscript = NULL; > + char ifname[128]; > + > + assert(opts->kind == NET_CLIENT_OPTIONS_KIND_TAP); > + tap = opts->tap; > + queues = tap->has_queues ? tap->queues : 1; > + > + if (tap->has_fd) { > + if (tap->has_ifname || tap->has_script || tap->has_downscript || > + tap->has_vnet_hdr || tap->has_helper) { > + error_report("ifname=, script=, downscript=, vnet_hdr=, " > + "and helper= are invalid with fd="); Please add tap->has_queues here to prevent "queues" and "fd" from being used together. > + return -1; > + } > + > + if (queues != tap_fd(tap->fd, fds)) { > + error_report("the number of fds were not equal to queues"); > + return -1; > + } > + > + for (i = 0; i < queues; i++) { > + fd = monitor_handle_fd_param(cur_mon, fds[i]); > + if (fd == -1) { > + return -1; > + } > + > + fcntl(fd, F_SETFL, O_NONBLOCK); > + > + if (i == 0) { > + vnet_hdr = tap_probe_vnet_hdr(fd); > + } The paranoid thing to do is: if (i == 0) { vnet_hdr = tap_probe_vnet_hdr(fd); } else if (vnet_hdr != tap_probe_vnet_hdr(fd)) { error_report("vnet_hdr not consistent across given tap fds"); return -1; } > + > + if (__net_init_tap(tap, peer, "tap", name, ifname, > + script, downscript, vnet_hdr, fd)) { > + return -1; > + } > + } > + } else if (tap->has_helper) { > + if (tap->has_ifname || tap->has_script || tap->has_downscript || > + tap->has_vnet_hdr) { > + error_report("ifname=, script=, downscript=, and vnet_hdr= " > + "are invalid with helper="); > + return -1; > + } > + > + /* FIXME: correct ? */ > + for (i = 0; i < queues; i++) { > + fd = net_bridge_run_helper(tap->helper, DEFAULT_BRIDGE_INTERFACE); The bridge helper doesn't support multiqueue tap devices (it doesn't use IFF_MULTI_QUEUE). Even if it did, SIOCBRADDIF would fail with EBUSY because the network interface has already been added to the bridge. It seems qemu-bridge-helper.c needs to be extended to support --queues. Right now this code is broken. > + if (fd == -1) { > + return -1; > + } > + > + fcntl(fd, F_SETFL, O_NONBLOCK); > + > + if (i == 0) { > + vnet_hdr = tap_probe_vnet_hdr(fd); > + } > + > + if (__net_init_tap(tap, peer, "bridge", name, ifname, > + script, downscript, vnet_hdr, fd)) { > + return -1; > + } > + } > + } else { > + script = tap->has_script ? tap->script : DEFAULT_NETWORK_SCRIPT; > + downscript = tap->has_downscript ? tap->downscript : > + DEFAULT_NETWORK_DOWN_SCRIPT; > + > + if (tap->has_ifname) { > + pstrcpy(ifname, sizeof ifname, tap->ifname); > + } else { > + ifname[0] = '\0'; > + } > + > + for (i = 0; i < queues; i++) { > + fd = net_tap_init(tap, &vnet_hdr, i >= 1 ? "no" : script, > + ifname, sizeof ifname, queues > 1); > + if (fd == -1) { > + return -1; > + } > + > + if (i == 0 && tap_get_ifname(fd, ifname) != 0) { > + error_report("could not get ifname"); > + return -1; > + } > + > + if (__net_init_tap(tap, peer, "tap", name, ifname, > + i >= 1 ? "no" : script, > + i >= 1 ? "no" : downscript, > + vnet_hdr, fd)) { > + return -1; > + } It's cleaner to avoid passing script/downscript into __net_init_tap() because the fd passing and helper cases don't use it. Move the nc.info_str setting code out of __net_init_tap(). Then the script/downscript arguments are unnecessary and we have fewer if statements checking for tap->has_fd, tap->has_helper, and else. > + } > + } > + > + return 0; > +} > + > VHostNetState *tap_get_vhost_net(NetClientState *nc) > { > TAPState *s = DO_UPCAST(TAPState, nc, nc); > assert(nc->info->type == NET_CLIENT_OPTIONS_KIND_TAP); > return s->vhost_net; > } > + > +int tap_attach(NetClientState *nc) The tap_attach()/tap_detach() naming isn't obvious. I wouldn't be sure what these functions actually do. You called the variable "enabled" - how about tap_enable()/tap_disable()? (Even if you don't rename, please add a doc comment and make the s->enabled variable name consistent with the function naming.) -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html