This patch adds raw socket backend to qemu and is based on Or Gerlitz's patch re-factored and ported to the latest qemu-kvm git tree. It also includes support for vnet_hdr option that enables gso/checksum offload with raw backend. You can find the linux kernel patch to support this feature here. http://thread.gmane.org/gmane.linux.network/150308 Signed-off-by: Sridhar Samudrala <sri@xxxxxxxxxx> diff --git a/Makefile.objs b/Makefile.objs index 357d305..4468124 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -34,6 +34,8 @@ net-nested-$(CONFIG_SOLARIS) += tap-solaris.o net-nested-$(CONFIG_AIX) += tap-aix.o net-nested-$(CONFIG_SLIRP) += slirp.o net-nested-$(CONFIG_VDE) += vde.o +net-nested-$(CONFIG_POSIX) += raw.o +net-nested-$(CONFIG_LINUX) += raw-linux.o net-obj-y += $(addprefix net/, $(net-nested-y)) ###################################################################### diff --git a/hw/virtio-net.c b/hw/virtio-net.c index eba578a..4aa40f2 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -15,6 +15,7 @@ #include "net.h" #include "net/checksum.h" #include "net/tap.h" +#include "net/raw.h" #include "qemu-timer.h" #include "virtio-net.h" @@ -133,6 +134,9 @@ static int peer_has_vnet_hdr(VirtIONet *n) case NET_CLIENT_TYPE_TAP: n->has_vnet_hdr = tap_has_vnet_hdr(n->nic->nc.peer); break; + case NET_CLIENT_TYPE_RAW: + n->has_vnet_hdr = raw_has_vnet_hdr(n->nic->nc.peer); + break; default: return 0; } @@ -149,6 +153,9 @@ static int peer_has_ufo(VirtIONet *n) case NET_CLIENT_TYPE_TAP: n->has_ufo = tap_has_ufo(n->nic->nc.peer); break; + case NET_CLIENT_TYPE_RAW: + n->has_ufo = raw_has_ufo(n->nic->nc.peer); + break; default: return 0; } @@ -165,6 +172,9 @@ static void peer_using_vnet_hdr(VirtIONet *n, int using_vnet_hdr) case NET_CLIENT_TYPE_TAP: tap_using_vnet_hdr(n->nic->nc.peer, using_vnet_hdr); break; + case NET_CLIENT_TYPE_RAW: + raw_using_vnet_hdr(n->nic->nc.peer, using_vnet_hdr); + break; default: break; } @@ -180,6 +190,9 @@ static void peer_set_offload(VirtIONet 
*n, int csum, int tso4, int tso6,
     case NET_CLIENT_TYPE_TAP:
         tap_set_offload(n->nic->nc.peer, csum, tso4, tso6, ecn, ufo);
         break;
+    case NET_CLIENT_TYPE_RAW:
+        raw_set_offload(n->nic->nc.peer, csum, tso4, tso6, ecn, ufo);
+        break;
     default:
         break;
     }
diff --git a/net.c b/net.c
index 6ef93e6..1ca2415 100644
--- a/net.c
+++ b/net.c
@@ -26,6 +26,7 @@
 #include "config-host.h"
 
 #include "net/tap.h"
+#include "net/raw.h"
 #include "net/socket.h"
 #include "net/dump.h"
 #include "net/slirp.h"
@@ -1004,6 +1005,27 @@ static struct {
             },
             { /* end of list */ }
         },
+    }, {
+        .type = "raw",
+        .init = net_init_raw,
+        .desc = {
+            NET_COMMON_PARAMS_DESC,
+            {
+                .name = "fd",
+                .type = QEMU_OPT_STRING,
+                .help = "file descriptor of an already opened raw socket",
+            }, {
+                .name = "ifname",
+                .type = QEMU_OPT_STRING,
+                .help = "interface name",
+            }, {
+                .name = "vnet_hdr",
+                .type = QEMU_OPT_BOOL,
+                .help = "enable PACKET_VNET_HDR option on the raw interface"
+            },
+            { /* end of list */ }
+        },
+
 #ifdef CONFIG_VDE
     }, {
         .type = "vde",
@@ -1076,6 +1098,7 @@ int net_client_init(Monitor *mon, QemuOpts *opts, int is_netdev)
 #ifdef CONFIG_VDE
             strcmp(type, "vde") != 0 &&
 #endif
+            strcmp(type, "raw") != 0 &&
             strcmp(type, "socket") != 0) {
             qemu_error("The '%s' network backend type is not valid with -netdev\n",
                        type);
diff --git a/net.h b/net.h
index 116bb80..4722185 100644
--- a/net.h
+++ b/net.h
@@ -34,7 +34,8 @@ typedef enum {
     NET_CLIENT_TYPE_TAP,
     NET_CLIENT_TYPE_SOCKET,
     NET_CLIENT_TYPE_VDE,
-    NET_CLIENT_TYPE_DUMP
+    NET_CLIENT_TYPE_DUMP,
+    NET_CLIENT_TYPE_RAW,
 } net_client_type;
 
 typedef void (NetPoll)(VLANClientState *, bool enable);
diff --git a/net/raw-linux.c b/net/raw-linux.c
new file mode 100644
index 0000000..9ed2e6a
--- /dev/null
+++ b/net/raw-linux.c
@@ -0,0 +1,125 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "net/raw.h"
+#include "net/raw-linux.h"
+
+#include <netpacket/packet.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <sys/ioctl.h>
+#include <netinet/in.h>
+
+#include "sysemu.h"
+#include "qemu-common.h"
+
+/*
+ * Open a PF_PACKET/SOCK_RAW socket, bind it to the interface named by
+ * @ifname and switch it to non-blocking mode.
+ *
+ * @ifname_size is accepted for interface compatibility but is not used.
+ * @vnet_hdr is in/out: a non-zero value on entry asks for PACKET_VNET_HDR to
+ * be enabled; on return it is 1 if the option was enabled and 0 otherwise.
+ * If @vnet_hdr_required is set, failing to enable the option is an error.
+ *
+ * Returns the socket descriptor, or -1 on any failure.  The socket is closed
+ * before an error return, so no descriptor is leaked.
+ */
+int raw_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required)
+{
+    struct ifreq req;
+    struct sockaddr_ll lladdr;
+    int fd, flags, val;
+
+    fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
+    if (fd < 0) {
+        fprintf(stderr, "packet socket failed\n");
+        return -1;
+    }
+
+    /* req is zeroed first, so the copied name is always NUL-terminated */
+    memset(&req, 0, sizeof(req));
+    strncpy(req.ifr_name, ifname, IFNAMSIZ-1);
+    if (ioctl(fd, SIOCGIFINDEX, &req) < 0) {
+        fprintf(stderr, "SIOCGIFINDEX failed\n");
+        goto fail;
+    }
+
+    memset(&lladdr, 0, sizeof(lladdr));
+    lladdr.sll_family = AF_PACKET;
+    lladdr.sll_protocol = htons(ETH_P_ALL);
+    lladdr.sll_ifindex = req.ifr_ifindex;
+    if (bind(fd, (const struct sockaddr *)&lladdr, sizeof(lladdr)) < 0) {
+        fprintf(stderr, "bind failed\n");
+        goto fail;
+    }
+
+    if (*vnet_hdr) {
+        val = 1;
+        if (setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, (const char *)&val,
+                       sizeof(val)) < 0) {
+            fprintf(stderr, "setsockopt(SOL_PACKET, PACKET_VNET_HDR) failed\n");
+            *vnet_hdr = 0;
+        } else {
+            *vnet_hdr = 1;
+        }
+
+        if (vnet_hdr_required && !*vnet_hdr) {
+            qemu_error("vnet_hdr=1 requested, but kernel "
+                       "doesn't support PACKET_VNET_HDR\n");
+            goto fail;
+        }
+    }
+
+    /* Add O_NONBLOCK to the current status flags; bail out if they
+     * cannot be read rather than OR-ing an error value into them. */
+    flags = fcntl(fd, F_GETFL);
+    if (flags < 0 || fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0) {
+        fprintf(stderr, "O_NONBLOCK set failed\n");
+        goto fail;
+    }
+
+    return fd;
+
+fail:
+    close(fd);
+    return -1;
+}
+
+/*
+ * Report whether PACKET_VNET_HDR is currently enabled on @fd.
+ *
+ * Returns 1 if getsockopt() succeeds and reports a non-zero value, 0 if the
+ * option is disabled or the query fails.
+ */
+int raw_probe_vnet_hdr(int fd)
+{
+    int val = 0;
+    socklen_t len = sizeof(val);
+
+    if (getsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &val, &len) < 0) {
+        return 0;
+    }
+
+    return val != 0;
+}
diff --git a/net/raw-linux.h b/net/raw-linux.h
new file mode 100644
index 0000000..ca463f4
--- /dev/null
+++ b/net/raw-linux.h
@@ -0,0 +1,42 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef QEMU_RAW_LINUX_H
+#define QEMU_RAW_LINUX_H
+
+#include <stdint.h>
+
+/*
+ * Socket option number passed at level SOL_PACKET to setsockopt() and
+ * getsockopt() by raw-linux.c.
+ * NOTE(review): presumably mirrors the kernel's own value for this option;
+ * confirm against the kernel patch referenced in the commit message.
+ */
+#define PACKET_VNET_HDR 15
+
+/*
+ * Header that raw.c places in front of outgoing packets, and skips on
+ * incoming ones, when the socket has PACKET_VNET_HDR enabled but the peer
+ * is not supplying the header itself.  raw.c only ever sends it zeroed.
+ * NOTE(review): field meanings are not visible in this patch; the layout
+ * presumably has to match the kernel's struct of the same name -- confirm.
+ */
+struct virtio_net_hdr
+{
+    uint8_t flags;
+    uint8_t gso_type;
+    uint16_t hdr_len;
+    uint16_t gso_size;
+    uint16_t csum_start;
+    uint16_t csum_offset;
+};
+
+#endif /* QEMU_RAW_LINUX_H */
diff --git a/net/raw.c b/net/raw.c
new file mode 100644
index 0000000..9dbc2f4
--- /dev/null
+++ b/net/raw.c
@@ -0,0 +1,362 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "net/raw.h"
+
+#include "config-host.h"
+
+#include <signal.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <sys/socket.h>
+#include <net/if.h>
+
+#include "net.h"
+#include "sysemu.h"
+#include "qemu-char.h"
+#include "qemu-common.h"
+
+#include "net/raw-linux.h"
+
+/* Maximum GSO packet size (64k) plus plenty of room for
+ * the ethernet and virtio_net headers
+ */
+#define RAW_BUFSIZE (4096 + 65536)
+
+/* State of one "raw" backend instance; nc must stay the embedded base
+ * object that DO_UPCAST() converts from. */
+typedef struct RAWState {
+    VLANClientState nc;
+    int fd;                         /* the raw packet socket */
+    uint8_t buf[RAW_BUFSIZE];       /* scratch buffer filled by raw_send() */
+    int promisc;                    /* not referenced in this file */
+    unsigned int read_poll:1;       /* read handlers currently installed */
+    unsigned int write_poll:1;      /* write handler currently installed */
+    unsigned int has_vnet_hdr:1;    /* socket carries a virtio_net_hdr */
+    unsigned int using_vnet_hdr:1;  /* peer supplies/consumes that header */
+    unsigned int has_ufo:1;         /* value reported by raw_has_ufo() */
+} RAWState;
+
+static int raw_can_send(void *opaque);
+static void raw_send(void *opaque);
+static void raw_writable(void *opaque);
+
+/* Re-register the fd handlers so that they match read_poll/write_poll. */
+static void raw_update_fd_handler(RAWState *s)
+{
+    qemu_set_fd_handler2(s->fd,
+                         s->read_poll ? raw_can_send : NULL,
+                         s->read_poll ? raw_send : NULL,
+                         s->write_poll ? raw_writable : NULL,
+                         s);
+}
+
+/* Enable or disable delivery of socket-readable events to raw_send(). */
+static void raw_read_poll(RAWState *s, int enable)
+{
+    s->read_poll = !!enable;
+    raw_update_fd_handler(s);
+}
+
+/* Enable or disable delivery of socket-writable events to raw_writable(). */
+static void raw_write_poll(RAWState *s, int enable)
+{
+    s->write_poll = !!enable;
+    raw_update_fd_handler(s);
+}
+
+/* The socket became writable again: stop polling for it and retry the
+ * packets that were queued while it was not. */
+static void raw_writable(void *opaque)
+{
+    RAWState *s = opaque;
+
+    raw_write_poll(s, 0);
+    qemu_flush_queued_packets(&s->nc);
+}
+
+/*
+ * Write one packet, described by @iov, to the socket.
+ *
+ * Returns the number of bytes written, 0 if the socket would block (write
+ * polling is then enabled so raw_writable() can flush the queue later), or
+ * -1 on any other error.
+ */
+static ssize_t raw_write_packet(RAWState *s, const struct iovec *iov, int iovcnt)
+{
+    ssize_t len;
+
+    do {
+        len = writev(s->fd, iov, iovcnt);
+    } while (len == -1 && errno == EINTR);
+
+    if (len == -1 && errno == EAGAIN) {
+        raw_write_poll(s, 1);
+        return 0;
+    }
+
+    if (len == -1) {
+        /* diagnostics go to stderr, not into the program's stdout */
+        fprintf(stderr, "raw_write_packet: errno:%d\n", errno);
+    }
+
+    return len;
+}
+
+/* Scatter/gather transmit towards the socket.  A zeroed virtio_net_hdr is
+ * prepended when the socket expects one but the peer does not provide it. */
+static ssize_t raw_receive_iov(VLANClientState *nc, const struct iovec *iov,
+                               int iovcnt)
+{
+    RAWState *s = DO_UPCAST(RAWState, nc, nc);
+    const struct iovec *iovp = iov;
+    struct iovec iov_copy[iovcnt + 1];
+    struct virtio_net_hdr hdr = { 0, };
+
+    if (s->has_vnet_hdr && !s->using_vnet_hdr) {
+        iov_copy[0].iov_base = &hdr;
+        iov_copy[0].iov_len = sizeof(hdr);
+        memcpy(&iov_copy[1], iov, iovcnt * sizeof(*iov));
+        iovp = iov_copy;
+        iovcnt++;
+    }
+
+    return raw_write_packet(s, iovp, iovcnt);
+}
+
+/* Transmit a buffer that never carries a virtio_net_hdr; a zeroed header is
+ * prepended whenever the socket has PACKET_VNET_HDR enabled. */
+static ssize_t raw_receive_raw(VLANClientState *nc, const uint8_t *buf, size_t size)
+{
+    RAWState *s = DO_UPCAST(RAWState, nc, nc);
+    struct iovec iov[2];
+    int iovcnt = 0;
+    struct virtio_net_hdr hdr = { 0, };
+
+    if (s->has_vnet_hdr) {
+        iov[iovcnt].iov_base = &hdr;
+        iov[iovcnt].iov_len = sizeof(hdr);
+        iovcnt++;
+    }
+
+    iov[iovcnt].iov_base = (char *)buf;
+    iov[iovcnt].iov_len = size;
+    iovcnt++;
+
+    return raw_write_packet(s, iov, iovcnt);
+}
+
+/* Linear-buffer transmit towards the socket. */
+static ssize_t raw_receive(VLANClientState *nc, const uint8_t *buf, size_t size)
+{
+    RAWState *s = DO_UPCAST(RAWState, nc, nc);
+    struct iovec iov[1];
+
+    if (s->has_vnet_hdr && !s->using_vnet_hdr) {
+        return raw_receive_raw(nc, buf, size);
+    }
+
+    iov[0].iov_base = (char *)buf;
+    iov[0].iov_len = size;
+
+    return raw_write_packet(s, iov, 1);
+}
+
+/* Read-poll predicate: only read from the socket while the peer accepts
+ * packets. */
+static int raw_can_send(void *opaque)
+{
+    RAWState *s = opaque;
+
+    return qemu_can_send_packet(&s->nc);
+}
+
+/* Thin wrapper around recv(); returns whatever recv() returns. */
+ssize_t raw_read_packet(int rawfd, uint8_t *buf, int maxlen, int flags)
+{
+    int ret;
+
+    ret = recv(rawfd, buf, maxlen, flags);
+    return ret;
+}
+
+/* Completion callback for a packet the peer had queued: resume reading. */
+static void raw_send_completed(VLANClientState *nc, ssize_t len)
+{
+    RAWState *s = DO_UPCAST(RAWState, nc, nc);
+
+    raw_read_poll(s, 1);
+}
+
+/*
+ * Socket-readable handler: read packets from the socket and hand them to the
+ * peer until the socket is drained, the peer stops accepting packets, or the
+ * peer queues one (read polling is then paused until raw_send_completed()).
+ */
+static void raw_send(void *opaque)
+{
+    RAWState *s = opaque;
+    int size;
+
+    do {
+        uint8_t *buf = s->buf;
+
+        size = raw_read_packet(s->fd, s->buf, sizeof(s->buf), MSG_TRUNC);
+        if (size <= 0) {
+            break;
+        }
+
+        /* With MSG_TRUNC recv() reports the packet's real length even when
+         * only part of it fitted into buf.  Such a packet is incomplete, and
+         * forwarding "size" bytes would read past the buffer, so drop it. */
+        if (size > (int)sizeof(s->buf)) {
+            continue;
+        }
+
+        if (s->has_vnet_hdr && !s->using_vnet_hdr) {
+            /* Nothing useful is left once the header is stripped */
+            if (size <= (int)sizeof(struct virtio_net_hdr)) {
+                continue;
+            }
+            buf += sizeof(struct virtio_net_hdr);
+            size -= sizeof(struct virtio_net_hdr);
+        }
+
+        size = qemu_send_packet_async(&s->nc, buf, size, raw_send_completed);
+        if (size == 0) {
+            raw_read_poll(s, 0);
+        }
+
+    } while (size > 0 && qemu_can_send_packet(&s->nc));
+}
+
+/* Return the has_ufo flag of a raw client. */
+int raw_has_ufo(VLANClientState *nc)
+{
+    RAWState *s = DO_UPCAST(RAWState, nc, nc);
+
+    assert(nc->info->type == NET_CLIENT_TYPE_RAW);
+
+    return s->has_ufo;
+}
+
+/* Return whether the socket of a raw client carries a virtio_net_hdr. */
+int raw_has_vnet_hdr(VLANClientState *nc)
+{
+    RAWState *s = DO_UPCAST(RAWState, nc, nc);
+
+    assert(nc->info->type == NET_CLIENT_TYPE_RAW);
+
+    return s->has_vnet_hdr;
+}
+
+/* Record whether the peer supplies and consumes the virtio_net_hdr itself.
+ * The requested value must agree with has_vnet_hdr (asserted). */
+void raw_using_vnet_hdr(VLANClientState *nc, int using_vnet_hdr)
+{
+    RAWState *s = DO_UPCAST(RAWState, nc, nc);
+
+    using_vnet_hdr = using_vnet_hdr != 0;
+
+    assert(nc->info->type == NET_CLIENT_TYPE_RAW);
+    assert(s->has_vnet_hdr == using_vnet_hdr);
+
+    s->using_vnet_hdr = using_vnet_hdr;
+}
+
+/* Offload settings are accepted but currently ignored by this backend. */
+void raw_set_offload(VLANClientState *nc, int csum, int tso4,
+                     int tso6, int ecn, int ufo)
+{
+    return;
+}
+
+/* Tear down a raw client: drop queued packets, unregister the fd handlers
+ * and close the socket. */
+static void raw_cleanup(VLANClientState *nc)
+{
+    RAWState *s = DO_UPCAST(RAWState, nc, nc);
+
+    qemu_purge_queued_packets(nc);
+
+    raw_read_poll(s, 0);
+    raw_write_poll(s, 0);
+    close(s->fd);
+}
+
+/* fd support */
+
+/* NOTE(review): raw_receive_raw() exists but .receive_raw is left NULL, so
+ * it is only reached through raw_receive() -- confirm this is intended. */
+static NetClientInfo net_raw_info = {
+    .type = NET_CLIENT_TYPE_RAW,
+    .size = sizeof(RAWState),
+    .receive = raw_receive,
+    .receive_raw = NULL,
+    .receive_iov = raw_receive_iov,
+    .cleanup = raw_cleanup,
+};
+
+
+/* Create the net client for an already opened raw socket @fd and start
+ * polling it for incoming packets. */
+static RAWState *net_raw_fd_init(VLANState *vlan, const char *model,
+                                 const char *name, int fd, int vnet_hdr)
+{
+    VLANClientState *nc;
+    RAWState *s;
+
+    nc = qemu_new_net_client(&net_raw_info, vlan, NULL, model, name);
+
+    s = DO_UPCAST(RAWState, nc, nc);
+
+    s->fd = fd;
+    s->has_vnet_hdr = vnet_hdr != 0;
+    s->using_vnet_hdr = 0;
+    s->has_ufo = 1;
+    raw_read_poll(s, 1);
+
+    return s;
+}
+
+/*
+ * Open a raw socket for the interface named by the "ifname" option.
+ *
+ * *vnet_hdr is set from the "vnet_hdr" option (default on) and updated by
+ * raw_open(); the header is only treated as mandatory when the user gave the
+ * option explicitly.  Returns the socket descriptor or -1 on error.
+ */
+static int net_raw_init(QemuOpts *opts, int *vnet_hdr)
+{
+    int fd, vnet_hdr_required;
+    char ifname[128] = {0,};
+    const char *opt_ifname = qemu_opt_get(opts, "ifname");
+
+    /* An empty name cannot be resolved to an interface index by raw_open(),
+     * so reject it here with a clear message. */
+    if (!opt_ifname || !*opt_ifname) {
+        qemu_error("raw: ifname= is required when fd= is not given\n");
+        return -1;
+    }
+    pstrcpy(ifname, sizeof(ifname), opt_ifname);
+
+    *vnet_hdr = qemu_opt_get_bool(opts, "vnet_hdr", 1);
+    if (qemu_opt_get(opts, "vnet_hdr")) {
+        vnet_hdr_required = *vnet_hdr;
+    } else {
+        vnet_hdr_required = 0;
+    }
+
+    TFR(fd = raw_open(ifname, sizeof(ifname), vnet_hdr, vnet_hdr_required));
+    if (fd < 0) {
+        return -1;
+    }
+
+    qemu_opt_set(opts, "ifname", ifname);
+
+    return fd;
+}
+
+/*
+ * Entry point for "-net raw" / "-netdev raw".
+ *
+ * Uses the socket passed with fd= (ifname= is rejected in that case) or
+ * opens one for ifname=, then creates the net client on @vlan.
+ * Returns 0 on success and -1 on error.
+ */
+int net_init_raw(QemuOpts *opts, Monitor *mon, const char *name,
+                 VLANState *vlan)
+{
+    RAWState *s;
+    int fd, vnet_hdr = 0;
+
+    if (qemu_opt_get(opts, "fd")) {
+        int flags;
+
+        if (qemu_opt_get(opts, "ifname")) {
+            qemu_error("ifname=, is invalid with fd=\n");
+            return -1;
+        }
+
+        fd = net_handle_fd_param(mon, qemu_opt_get(opts, "fd"));
+        if (fd == -1) {
+            return -1;
+        }
+
+        /* Best effort, as before, but keep the status flags the
+         * descriptor already has instead of replacing them. */
+        flags = fcntl(fd, F_GETFL);
+        fcntl(fd, F_SETFL, (flags == -1 ? 0 : flags) | O_NONBLOCK);
+
+        vnet_hdr = raw_probe_vnet_hdr(fd);
+    } else {
+        fd = net_raw_init(opts, &vnet_hdr);
+        if (fd == -1) {
+            return -1;
+        }
+    }
+
+    s = net_raw_fd_init(vlan, "raw", name, fd, vnet_hdr);
+    if (!s) {
+        close(fd);
+        return -1;
+    }
+
+    if (qemu_opt_get(opts, "fd")) {
+        snprintf(s->nc.info_str, sizeof(s->nc.info_str), "fd=%d", fd);
+    } else {
+        const char *ifname;
+
+        ifname = qemu_opt_get(opts, "ifname");
+        snprintf(s->nc.info_str, sizeof(s->nc.info_str), "ifname=%s", ifname);
+    }
+
+    if (vlan) {
+        vlan->nb_host_devs++;
+    }
+
+    return 0;
+}
diff --git a/net/raw.h b/net/raw.h
new file mode 100644
index 0000000..7260080
--- /dev/null
+++ b/net/raw.h
@@ -0,0 +1,40 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef QEMU_NET_RAW_H
+#define QEMU_NET_RAW_H
+
+#include "qemu-common.h"
+#include "qemu-option.h"
+
+/* Create a "raw" net client from opts (fd= or ifname=, optional vnet_hdr=);
+ * returns 0 on success and -1 on error. */
+int net_init_raw(QemuOpts *opts, Monitor *mon, const char *name, VLANState *vlan);
+/* Open a PF_PACKET raw socket bound to ifname.  *vnet_hdr is in/out: non-zero
+ * on entry requests PACKET_VNET_HDR, on return it tells whether the option
+ * was enabled.  Returns the socket descriptor; -1 is returned if
+ * vnet_hdr_required is set and the option could not be enabled.
+ * ifname_size is not referenced by the implementation. */
+int raw_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required);
+/* Wrapper around recv(rawfd, buf, maxlen, flags); returns recv()'s result. */
+ssize_t raw_read_packet(int rawfd, uint8_t *buf, int maxlen, int flags);
+/* Accessors for the has_ufo / has_vnet_hdr flags of a raw client; both
+ * assert that the client is of type NET_CLIENT_TYPE_RAW. */
+int raw_has_ufo(VLANClientState *vc);
+int raw_has_vnet_hdr(VLANClientState *vc);
+/* Record whether the peer handles the virtio_net_hdr itself; the value is
+ * asserted to agree with the client's has_vnet_hdr flag. */
+void raw_using_vnet_hdr(VLANClientState *vc, int using_vnet_hdr);
+/* Query PACKET_VNET_HDR on fd with getsockopt(); returns 1 or 0. */
+int raw_probe_vnet_hdr(int fd);
+/* Currently a no-op: the arguments are ignored. */
+void raw_set_offload(VLANClientState *vc, int csum, int tso4, int tso6, int ecn, int ufo);
+
+#endif /* QEMU_NET_RAW_H */
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html