On Thu, Apr 01, 2010 at 05:27:18PM +0800, Xin Xiaohui wrote: > Add a device to utilize the vhost-net backend driver for > copy-less data transfer between guest FE and host NIC. > It pins the guest user space to the host memory and > provides proto_ops as sendmsg/recvmsg to vhost-net. > > Signed-off-by: Xin Xiaohui <xiaohui.xin@xxxxxxxxx> > Signed-off-by: Zhao Yu <yzhao81@xxxxxxxxx> > Signed-off-by: Jeff Dike <jdike@xxxxxxxxxxxxxxxxxxxxxx> > --- > > Michael, > Sorry, I did not resolve all your comments this time. > I did not move the device out of vhost directory because I > did not implement real asynchronous read/write operations > to mp device for now. We hope we can do this after the network > code is checked in. Well, placement of code is not such a major issue. It's just that code under drivers/net gets more and better review than code under drivers/vhost. I'll try to get Dave's opinion. > > For the DoS issue, I'm not sure how much the limit get_user_pages() > can pin is reasonable, should we compute the bandwidth to make it? There's a ulimit for locked memory. Can we use this, decreasing the value in the rlimit array? We can decrement it when the backend is enabled and re-increment it when the backend is disabled. > We use get_user_pages_fast() and use set_page_dirty_lock(). > Remove rcu_read_lock()/unlock(), since the ctor pointer is > only changed by BIND/UNBIND ioctl, and during that time, > the NIC is always stopped, all outstanding requests are done, > so the ctor pointer cannot be raced into wrong condition. > > Qemu needs a userspace write, is that a synchronous one or > asynchronous one? It's a synchronous non-blocking write. 
> Thanks > Xiaohui > > drivers/vhost/Kconfig | 5 + > drivers/vhost/Makefile | 2 + > drivers/vhost/mpassthru.c | 1162 +++++++++++++++++++++++++++++++++++++++++++++ > include/linux/mpassthru.h | 29 ++ > 4 files changed, 1198 insertions(+), 0 deletions(-) > create mode 100644 drivers/vhost/mpassthru.c > create mode 100644 include/linux/mpassthru.h > > diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig > index 9f409f4..ee32a3b 100644 > --- a/drivers/vhost/Kconfig > +++ b/drivers/vhost/Kconfig > @@ -9,3 +9,8 @@ config VHOST_NET > To compile this driver as a module, choose M here: the module will > be called vhost_net. > > +config VHOST_PASSTHRU > + tristate "Zerocopy network driver (EXPERIMENTAL)" > + depends on VHOST_NET > + ---help--- > + zerocopy network I/O support > diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile > index 72dd020..3f79c79 100644 > --- a/drivers/vhost/Makefile > +++ b/drivers/vhost/Makefile > @@ -1,2 +1,4 @@ > obj-$(CONFIG_VHOST_NET) += vhost_net.o > vhost_net-y := vhost.o net.o > + > +obj-$(CONFIG_VHOST_PASSTHRU) += mpassthru.o > diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c > new file mode 100644 > index 0000000..6e8fc4d > --- /dev/null > +++ b/drivers/vhost/mpassthru.c > @@ -0,0 +1,1162 @@ > +/* > + * MPASSTHRU - Mediate passthrough device. > + * Copyright (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. 
> + * > + */ > + > +#define DRV_NAME "mpassthru" > +#define DRV_DESCRIPTION "Mediate passthru device driver" > +#define DRV_COPYRIGHT "(C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G" > + > +#include <linux/module.h> > +#include <linux/errno.h> > +#include <linux/kernel.h> > +#include <linux/major.h> > +#include <linux/slab.h> > +#include <linux/smp_lock.h> > +#include <linux/poll.h> > +#include <linux/fcntl.h> > +#include <linux/init.h> > +#include <linux/aio.h> > + > +#include <linux/skbuff.h> > +#include <linux/netdevice.h> > +#include <linux/etherdevice.h> > +#include <linux/miscdevice.h> > +#include <linux/ethtool.h> > +#include <linux/rtnetlink.h> > +#include <linux/if.h> > +#include <linux/if_arp.h> > +#include <linux/if_ether.h> > +#include <linux/crc32.h> > +#include <linux/nsproxy.h> > +#include <linux/uaccess.h> > +#include <linux/virtio_net.h> > +#include <linux/mpassthru.h> > +#include <net/net_namespace.h> > +#include <net/netns/generic.h> > +#include <net/rtnetlink.h> > +#include <net/sock.h> > + > +#include <asm/system.h> > + > +#include "vhost.h" > + > +/* Uncomment to enable debugging */ > +/* #define MPASSTHRU_DEBUG 1 */ > + > +#ifdef MPASSTHRU_DEBUG > +static int debug; > + > +#define DBG if (mp->debug) printk > +#define DBG1 if (debug == 2) printk > +#else > +#define DBG(a...) > +#define DBG1(a...) > +#endif > + > +#define COPY_THRESHOLD (L1_CACHE_BYTES * 4) > +#define COPY_HDR_LEN (L1_CACHE_BYTES < 64 ? 
64 : L1_CACHE_BYTES) > + > +struct frag { > + u16 offset; > + u16 size; > +}; > + > +struct page_ctor { > + struct list_head readq; > + int w_len; > + int r_len; > + spinlock_t read_lock; > + struct kmem_cache *cache; > + struct net_device *dev; > + struct mpassthru_port port; > +}; > + > +struct page_info { > + void *ctrl; > + struct list_head list; > + int header; > + /* indicate the actual length of bytes > + * send/recv in the user space buffers > + */ > + int total; > + int offset; > + struct page *pages[MAX_SKB_FRAGS+1]; > + struct skb_frag_struct frag[MAX_SKB_FRAGS+1]; > + struct sk_buff *skb; > + struct page_ctor *ctor; > + > + /* The pointer relayed to skb, to indicate > + * it's a user space allocated skb or kernel > + */ > + struct skb_user_page user; > + struct skb_shared_info ushinfo; > + > +#define INFO_READ 0 > +#define INFO_WRITE 1 > + unsigned flags; > + unsigned pnum; > + > + /* It's meaningful for receive, means > + * the max length allowed > + */ > + size_t len; > + > + /* The fields after that is for backend > + * driver, now for vhost-net. 
> + */ > + > + struct kiocb *iocb; > + unsigned int desc_pos; > + unsigned int log; > + struct iovec hdr[VHOST_NET_MAX_SG]; > + struct iovec iov[VHOST_NET_MAX_SG]; > +}; > + > +struct mp_struct { > + struct mp_file *mfile; > + struct net_device *dev; > + struct page_ctor *ctor; > + struct socket socket; > + > +#ifdef MPASSTHRU_DEBUG > + int debug; > +#endif > +}; > + > +struct mp_file { > + atomic_t count; > + struct mp_struct *mp; > + struct net *net; > +}; > + > +struct mp_sock { > + struct sock sk; > + struct mp_struct *mp; > +}; > + > +static int mp_dev_change_flags(struct net_device *dev, unsigned flags) > +{ > + int ret = 0; > + > + rtnl_lock(); > + ret = dev_change_flags(dev, flags); > + rtnl_unlock(); > + > + if (ret < 0) > + printk(KERN_ERR "failed to change dev state of %s", dev->name); > + > + return ret; > +} > + > +/* The main function to allocate user space buffers */ > +static struct skb_user_page *page_ctor(struct mpassthru_port *port, > + struct sk_buff *skb, int npages) > +{ > + int i; > + unsigned long flags; > + struct page_ctor *ctor; > + struct page_info *info = NULL; > + > + ctor = container_of(port, struct page_ctor, port); > + > + spin_lock_irqsave(&ctor->read_lock, flags); > + if (!list_empty(&ctor->readq)) { > + info = list_first_entry(&ctor->readq, struct page_info, list); > + list_del(&info->list); > + } > + spin_unlock_irqrestore(&ctor->read_lock, flags); > + if (!info) > + return NULL; > + > + for (i = 0; i < info->pnum; i++) { > + get_page(info->pages[i]); > + info->frag[i].page = info->pages[i]; > + info->frag[i].page_offset = i ? 0 : info->offset; > + info->frag[i].size = port->npages > 1 ? 
PAGE_SIZE : > + port->data_len; > + } > + info->skb = skb; > + info->user.frags = info->frag; > + info->user.ushinfo = &info->ushinfo; > + return &info->user; > +} > + > +static void mp_ki_dtor(struct kiocb *iocb) > +{ > + struct page_info *info = (struct page_info *)(iocb->private); > + int i; > + > + for (i = 0; i < info->pnum; i++) { > + if (info->pages[i]) > + put_page(info->pages[i]); > + } > + > + if (info->flags == INFO_READ) { > + skb_shinfo(info->skb)->destructor_arg = &info->user; > + info->skb->destructor = NULL; > + kfree_skb(info->skb); > + } > + > + kmem_cache_free(info->ctor->cache, info); > + > + return; > +} > + > +static struct kiocb *create_iocb(struct page_info *info, int size) > +{ > + struct kiocb *iocb = NULL; > + > + iocb = info->iocb; > + if (!iocb) > + return iocb; > + iocb->ki_flags = 0; > + iocb->ki_users = 1; > + iocb->ki_key = 0; > + iocb->ki_ctx = NULL; > + iocb->ki_cancel = NULL; > + iocb->ki_retry = NULL; > + iocb->ki_iovec = NULL; > + iocb->ki_eventfd = NULL; > + iocb->private = (void *)info; > + iocb->ki_pos = info->desc_pos; > + iocb->ki_nbytes = size; > + iocb->ki_user_data = info->log; > + iocb->ki_dtor = mp_ki_dtor; > + return iocb; > +} > + > +/* A helper to clean the skb before the kfree_skb() */ > + > +static void page_dtor_prepare(struct page_info *info) > +{ > + if (info->flags == INFO_READ) > + if (info->skb) > + info->skb->head = NULL; > +} > + > +/* The callback to destruct the user space buffers or skb */ > +static void page_dtor(struct skb_user_page *user) > +{ > + struct page_info *info; > + struct page_ctor *ctor; > + struct sock *sk; > + struct sk_buff *skb; > + struct kiocb *iocb = NULL; > + struct vhost_virtqueue *vq = NULL; > + unsigned long flags; > + int i; > + > + if (!user) > + return; > + info = container_of(user, struct page_info, user); > + if (!info) > + return; > + ctor = info->ctor; > + skb = info->skb; > + > + page_dtor_prepare(info); > + > + /* If the info->total is 0, make it to be reused */ > + if 
(!info->total) { > + spin_lock_irqsave(&ctor->read_lock, flags); > + list_add(&info->list, &ctor->readq); > + spin_unlock_irqrestore(&ctor->read_lock, flags); > + return; > + } > + > + if (info->flags == INFO_READ) > + return; > + > + /* For transmit, we should wait for the DMA finish by hardware. > + * Queue the notifier to wake up the backend driver > + */ > + vq = (struct vhost_virtqueue *)info->ctrl; > + iocb = create_iocb(info, info->total); > + > + spin_lock_irqsave(&vq->notify_lock, flags); > + list_add_tail(&iocb->ki_list, &vq->notifier); > + spin_unlock_irqrestore(&vq->notify_lock, flags); > + > + sk = ctor->port.sock->sk; > + sk->sk_write_space(sk); > + > + return; > +} > + > +static int page_ctor_attach(struct mp_struct *mp) > +{ > + int rc; > + struct page_ctor *ctor; > + struct net_device *dev = mp->dev; > + > + /* locked by mp_mutex */ > + if (rcu_dereference(mp->ctor)) > + return -EBUSY; > + > + ctor = kzalloc(sizeof(*ctor), GFP_KERNEL); > + if (!ctor) > + return -ENOMEM; > + rc = netdev_mp_port_prep(dev, &ctor->port); > + if (rc) > + goto fail; > + > + ctor->cache = kmem_cache_create("skb_page_info", > + sizeof(struct page_info), 0, > + SLAB_HWCACHE_ALIGN, NULL); > + > + if (!ctor->cache) > + goto cache_fail; > + > + INIT_LIST_HEAD(&ctor->readq); > + spin_lock_init(&ctor->read_lock); > + > + ctor->w_len = 0; > + ctor->r_len = 0; > + > + dev_hold(dev); > + ctor->dev = dev; > + ctor->port.ctor = page_ctor; > + ctor->port.sock = &mp->socket; > + > + rc = netdev_mp_port_attach(dev, &ctor->port); > + if (rc) > + goto fail; > + > + /* locked by mp_mutex */ > + rcu_assign_pointer(mp->ctor, ctor); > + > + /* XXX:Need we do set_offload here ? 
*/ > + > + return 0; > + > +fail: > + kmem_cache_destroy(ctor->cache); > +cache_fail: > + kfree(ctor); > + dev_put(dev); > + > + return rc; > +} > + > +struct page_info *info_dequeue(struct page_ctor *ctor) > +{ > + unsigned long flags; > + struct page_info *info = NULL; > + spin_lock_irqsave(&ctor->read_lock, flags); > + if (!list_empty(&ctor->readq)) { > + info = list_first_entry(&ctor->readq, > + struct page_info, list); > + list_del(&info->list); > + } > + spin_unlock_irqrestore(&ctor->read_lock, flags); > + return info; > +} > + > +static int page_ctor_detach(struct mp_struct *mp) > +{ > + struct page_ctor *ctor; > + struct page_info *info; > + struct vhost_virtqueue *vq = NULL; > + struct kiocb *iocb = NULL; > + int i; > + unsigned long flags; > + > + /* locked by mp_mutex */ > + ctor = rcu_dereference(mp->ctor); > + if (!ctor) > + return -ENODEV; > + > + while ((info = info_dequeue(ctor))) { > + for (i = 0; i < info->pnum; i++) > + if (info->pages[i]) > + put_page(info->pages[i]); > + vq = (struct vhost_virtqueue *)(info->ctrl); > + iocb = create_iocb(info, 0); > + > + spin_lock_irqsave(&vq->notify_lock, flags); > + list_add_tail(&iocb->ki_list, &vq->notifier); > + spin_unlock_irqrestore(&vq->notify_lock, flags); > + > + kmem_cache_free(ctor->cache, info); > + } > + kmem_cache_destroy(ctor->cache); > + netdev_mp_port_detach(ctor->dev); > + dev_put(ctor->dev); > + > + /* locked by mp_mutex */ > + rcu_assign_pointer(mp->ctor, NULL); > + synchronize_rcu(); > + > + kfree(ctor); > + return 0; > +} > + > +/* For small user space buffers transmit, we don't need to call > + * get_user_pages(). 
> + */ > +static struct page_info *alloc_small_page_info(struct page_ctor *ctor, > + struct kiocb *iocb, int total) > +{ > + struct page_info *info = kmem_cache_zalloc(ctor->cache, GFP_KERNEL); > + > + if (!info) > + return NULL; > + info->total = total; > + info->user.dtor = page_dtor; > + info->ctor = ctor; > + info->flags = INFO_WRITE; > + info->iocb = iocb; > + return info; > +} > + > +/* The main function to transform the guest user space address > + * to host kernel address via get_user_pages(). Thus the hardware > + * can do DMA directly to the user space address. > + */ > +static struct page_info *alloc_page_info(struct page_ctor *ctor, > + struct kiocb *iocb, struct iovec *iov, > + int count, struct frag *frags, > + int npages, int total) > +{ > + int rc; > + int i, j, n = 0; > + int len; > + unsigned long base; > + struct page_info *info = kmem_cache_zalloc(ctor->cache, GFP_KERNEL); > + > + if (!info) > + return NULL; > + > + for (i = j = 0; i < count; i++) { > + base = (unsigned long)iov[i].iov_base; > + len = iov[i].iov_len; > + > + if (!len) > + continue; > + n = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; > + > + rc = get_user_pages_fast(base, n, npages ? 
1 : 0, > + &info->pages[j]); > + if (rc != n) > + goto failed; > + > + while (n--) { > + frags[j].offset = base & ~PAGE_MASK; > + frags[j].size = min_t(int, len, > + PAGE_SIZE - frags[j].offset); > + len -= frags[j].size; > + base += frags[j].size; > + j++; > + } > + } > + > +#ifdef CONFIG_HIGHMEM > + if (npages && !(dev->features & NETIF_F_HIGHDMA)) { > + for (i = 0; i < j; i++) { > + if (PageHighMem(info->pages[i])) > + goto failed; > + } > + } > +#endif > + > + info->total = total; > + info->user.dtor = page_dtor; > + info->ctor = ctor; > + info->pnum = j; > + info->iocb = iocb; > + if (!npages) > + info->flags = INFO_WRITE; > + if (info->flags == INFO_READ) { > + info->user.start = (u8 *)(((unsigned long) > + (pfn_to_kaddr(page_to_pfn(info->pages[0]))) + > + frags[0].offset) - NET_IP_ALIGN - NET_SKB_PAD); > + info->user.size = iov[0].iov_len + NET_IP_ALIGN + NET_SKB_PAD; > + for (i = 0; i < j; i++) > + set_page_dirty_lock(info->pages[i]); > + } > + return info; > + > +failed: > + for (i = 0; i < j; i++) > + put_page(info->pages[i]); > + > + kmem_cache_free(ctor->cache, info); > + > + return NULL; > +} > + > +static int mp_sendmsg(struct kiocb *iocb, struct socket *sock, > + struct msghdr *m, size_t total_len) > +{ > + struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp; > + struct page_ctor *ctor; > + struct vhost_virtqueue *vq = (struct vhost_virtqueue *)(iocb->private); > + struct iovec *iov = m->msg_iov; > + struct page_info *info = NULL; > + struct frag frags[MAX_SKB_FRAGS]; > + struct sk_buff *skb; > + int count = m->msg_iovlen; > + int total = 0, header, n, i, len, rc; > + unsigned long base; > + > + ctor = rcu_dereference(mp->ctor); > + if (!ctor) > + return -ENODEV; > + > + total = iov_length(iov, count); > + > + if (total < ETH_HLEN) > + return -EINVAL; > + > + if (total <= COPY_THRESHOLD) > + goto copy; > + > + n = 0; > + for (i = 0; i < count; i++) { > + base = (unsigned long)iov[i].iov_base; > + len = iov[i].iov_len; > + if (!len) 
> + continue; > + n += ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; > + if (n > MAX_SKB_FRAGS) > + return -EINVAL; > + } > + > +copy: > + header = total > COPY_THRESHOLD ? COPY_HDR_LEN : total; > + > + skb = alloc_skb(header + NET_IP_ALIGN, GFP_ATOMIC); > + if (!skb) > + goto drop; > + > + skb_reserve(skb, NET_IP_ALIGN); > + > + skb_set_network_header(skb, ETH_HLEN); > + > + memcpy_fromiovec(skb->data, iov, header); > + skb_put(skb, header); > + skb->protocol = *((__be16 *)(skb->data) + ETH_ALEN); > + > + if (header == total) { > + rc = total; > + info = alloc_small_page_info(ctor, iocb, total); > + } else { > + info = alloc_page_info(ctor, iocb, iov, count, frags, 0, total); > + if (info) > + for (i = 0; info->pages[i]; i++) { > + skb_add_rx_frag(skb, i, info->pages[i], > + frags[i].offset, frags[i].size); > + info->pages[i] = NULL; > + } > + } > + if (info != NULL) { > + info->desc_pos = iocb->ki_pos; > + info->ctrl = vq; > + info->total = total; > + info->skb = skb; > + skb_shinfo(skb)->destructor_arg = &info->user; > + skb->dev = mp->dev; > + dev_queue_xmit(skb); > + mp->dev->stats.tx_packets++; > + mp->dev->stats.tx_bytes += total; > + return 0; > + } > +drop: > + kfree_skb(skb); > + if (info) { > + for (i = 0; info->pages[i]; i++) > + put_page(info->pages[i]); > + kmem_cache_free(info->ctor->cache, info); > + } > + mp->dev->stats.tx_dropped++; > + return -ENOMEM; > +} > + > + > +static void mp_recvmsg_notify(struct vhost_virtqueue *vq) > +{ > + struct socket *sock = vq->private_data; > + struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp; > + struct page_ctor *ctor = NULL; > + struct sk_buff *skb = NULL; > + struct page_info *info = NULL; > + struct ethhdr *eth; > + struct kiocb *iocb = NULL; > + int len, i; > + unsigned long flags; > + > + struct virtio_net_hdr hdr = { > + .flags = 0, > + .gso_type = VIRTIO_NET_HDR_GSO_NONE > + }; > + > + ctor = rcu_dereference(mp->ctor); > + if (!ctor) > + return; > + > + while ((skb = 
skb_dequeue(&sock->sk->sk_receive_queue)) != NULL) { > + if (skb_shinfo(skb)->destructor_arg) { > + info = container_of(skb_shinfo(skb)->destructor_arg, > + struct page_info, user); > + info->skb = skb; > + if (skb->len > info->len) { > + mp->dev->stats.rx_dropped++; > + DBG(KERN_INFO "Discarded truncated rx packet: " > + " len %d > %zd\n", skb->len, info->len); > + info->total = skb->len; > + goto clean; > + } else { > + int i; > + struct skb_shared_info *gshinfo = > + (struct skb_shared_info *)(&info->ushinfo); > + struct skb_shared_info *hshinfo = > + skb_shinfo(skb); > + > + if (gshinfo->nr_frags < hshinfo->nr_frags) > + goto clean; > + eth = eth_hdr(skb); > + skb_push(skb, ETH_HLEN); > + > + hdr.hdr_len = skb_headlen(skb); > + info->total = skb->len; > + > + for (i = 0; i < gshinfo->nr_frags; i++) > + gshinfo->frags[i].size = 0; > + for (i = 0; i < hshinfo->nr_frags; i++) > + gshinfo->frags[i].size = > + hshinfo->frags[i].size; > + memcpy(skb_shinfo(skb), &info->ushinfo, > + sizeof(struct skb_shared_info)); > + } > + } else { > + /* The skb composed with kernel buffers > + * in case user space buffers are not sufficent. > + * The case should be rare. 
> + */ > + unsigned long flags; > + int i; > + struct skb_shared_info *gshinfo = NULL; > + > + info = NULL; > + > + spin_lock_irqsave(&ctor->read_lock, flags); > + if (!list_empty(&ctor->readq)) { > + info = list_first_entry(&ctor->readq, > + struct page_info, list); > + list_del(&info->list); > + } > + spin_unlock_irqrestore(&ctor->read_lock, flags); > + if (!info) { > + DBG(KERN_INFO "No user buffer avaliable %p\n", > + skb); > + skb_queue_head(&sock->sk->sk_receive_queue, > + skb); > + break; > + } > + info->skb = skb; > + /* compute the guest skb frags info */ > + gshinfo = (struct skb_shared_info *)(info->user.start + > + SKB_DATA_ALIGN(info->user.size)); > + > + if (gshinfo->nr_frags < skb_shinfo(skb)->nr_frags) > + goto clean; > + > + eth = eth_hdr(skb); > + skb_push(skb, ETH_HLEN); > + info->total = skb->len; > + > + for (i = 0; i < gshinfo->nr_frags; i++) > + gshinfo->frags[i].size = 0; > + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) > + gshinfo->frags[i].size = > + skb_shinfo(skb)->frags[i].size; > + hdr.hdr_len = min_t(int, skb->len, > + info->iov[1].iov_len); > + skb_copy_datagram_iovec(skb, 0, info->iov, skb->len); > + } > + > + len = memcpy_toiovec(info->hdr, (unsigned char *)&hdr, > + sizeof hdr); > + if (len) { > + DBG(KERN_INFO > + "Unable to write vnet_hdr at addr %p: %d\n", > + info->hdr->iov_base, len); > + goto clean; > + } > + iocb = create_iocb(info, skb->len + sizeof(hdr)); > + > + spin_lock_irqsave(&vq->notify_lock, flags); > + list_add_tail(&iocb->ki_list, &vq->notifier); > + spin_unlock_irqrestore(&vq->notify_lock, flags); > + continue; > + > +clean: > + kfree_skb(skb); > + for (i = 0; info->pages[i]; i++) > + put_page(info->pages[i]); > + kmem_cache_free(ctor->cache, info); > + } > + return; > +} > + > +static int mp_recvmsg(struct kiocb *iocb, struct socket *sock, > + struct msghdr *m, size_t total_len, > + int flags) > +{ > + struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp; > + struct page_ctor *ctor; > + 
struct vhost_virtqueue *vq = (struct vhost_virtqueue *)(iocb->private); > + struct iovec *iov = m->msg_iov; > + int count = m->msg_iovlen; > + int npages, payload; > + struct page_info *info; > + struct frag frags[MAX_SKB_FRAGS]; > + unsigned long base; > + int i, len; > + unsigned long flag; > + > + if (!(flags & MSG_DONTWAIT)) > + return -EINVAL; > + > + ctor = rcu_dereference(mp->ctor); > + if (!ctor) > + return -EINVAL; > + > + /* Error detections in case invalid user space buffer */ > + if (count > 2 && iov[1].iov_len < ctor->port.hdr_len && > + mp->dev->features & NETIF_F_SG) { > + return -EINVAL; > + } > + > + npages = ctor->port.npages; > + payload = ctor->port.data_len; > + > + /* If KVM guest virtio-net FE driver use SG feature */ > + if (count > 2) { > + for (i = 2; i < count; i++) { > + base = (unsigned long)iov[i].iov_base & ~PAGE_MASK; > + len = iov[i].iov_len; > + if (npages == 1) > + len = min_t(int, len, PAGE_SIZE - base); > + else if (base) > + break; > + payload -= len; > + if (payload <= 0) > + goto proceed; > + if (npages == 1 || (len & ~PAGE_MASK)) > + break; > + } > + } > + > + if ((((unsigned long)iov[1].iov_base & ~PAGE_MASK) > + - NET_SKB_PAD - NET_IP_ALIGN) >= 0) > + goto proceed; > + > + return -EINVAL; > + > +proceed: > + /* skip the virtnet head */ > + iov++; > + count--; > + > + /* Translate address to kernel */ > + info = alloc_page_info(ctor, iocb, iov, count, frags, npages, 0); > + if (!info) > + return -ENOMEM; > + info->len = total_len; > + info->hdr[0].iov_base = vq->hdr[0].iov_base; > + info->hdr[0].iov_len = vq->hdr[0].iov_len; > + info->offset = frags[0].offset; > + info->desc_pos = iocb->ki_pos; > + info->log = iocb->ki_user_data; > + info->ctrl = vq; > + > + iov--; > + count++; > + > + memcpy(info->iov, vq->iov, sizeof(struct iovec) * count); > + > + spin_lock_irqsave(&ctor->read_lock, flag); > + list_add_tail(&info->list, &ctor->readq); > + spin_unlock_irqrestore(&ctor->read_lock, flag); > + > + if (!vq->receiver) > + 
vq->receiver = mp_recvmsg_notify; > + > + return 0; > +} > + > +static void __mp_detach(struct mp_struct *mp) > +{ > + mp->mfile = NULL; > + > + mp_dev_change_flags(mp->dev, mp->dev->flags & ~IFF_UP); > + page_ctor_detach(mp); > + mp_dev_change_flags(mp->dev, mp->dev->flags | IFF_UP); > + > + /* Drop the extra count on the net device */ > + dev_put(mp->dev); > +} > + > +static DEFINE_MUTEX(mp_mutex); > + > +static void mp_detach(struct mp_struct *mp) > +{ > + mutex_lock(&mp_mutex); > + __mp_detach(mp); > + mutex_unlock(&mp_mutex); > +} > + > +static void mp_put(struct mp_file *mfile) > +{ > + if (atomic_dec_and_test(&mfile->count)) > + mp_detach(mfile->mp); > +} > + > +static int mp_release(struct socket *sock) > +{ > + struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp; > + struct mp_file *mfile = mp->mfile; > + > + mp_put(mfile); > + sock_put(mp->socket.sk); > + put_net(mfile->net); > + > + return 0; > +} > + > +/* Ops structure to mimic raw sockets with mp device */ > +static const struct proto_ops mp_socket_ops = { > + .sendmsg = mp_sendmsg, > + .recvmsg = mp_recvmsg, > + .release = mp_release, > +}; > + > +static struct proto mp_proto = { > + .name = "mp", > + .owner = THIS_MODULE, > + .obj_size = sizeof(struct mp_sock), > +}; > + > +static int mp_chr_open(struct inode *inode, struct file * file) > +{ > + struct mp_file *mfile; > + cycle_kernel_lock(); > + DBG1(KERN_INFO "mp: mp_chr_open\n"); > + > + mfile = kzalloc(sizeof(*mfile), GFP_KERNEL); > + if (!mfile) > + return -ENOMEM; > + atomic_set(&mfile->count, 0); > + mfile->mp = NULL; > + mfile->net = get_net(current->nsproxy->net_ns); > + file->private_data = mfile; > + return 0; > +} > + > + > +static struct mp_struct *mp_get(struct mp_file *mfile) > +{ > + struct mp_struct *mp = NULL; > + if (atomic_inc_not_zero(&mfile->count)) > + mp = mfile->mp; > + > + return mp; > +} > + > + > +static int mp_attach(struct mp_struct *mp, struct file *file) > +{ > + struct mp_file *mfile = 
file->private_data; > + int err; > + > + netif_tx_lock_bh(mp->dev); > + > + err = -EINVAL; > + > + if (mfile->mp) > + goto out; > + > + err = -EBUSY; > + if (mp->mfile) > + goto out; > + > + err = 0; > + mfile->mp = mp; > + mp->mfile = mfile; > + mp->socket.file = file; > + dev_hold(mp->dev); > + sock_hold(mp->socket.sk); > + atomic_inc(&mfile->count); > + > +out: > + netif_tx_unlock_bh(mp->dev); > + return err; > +} > + > +static void mp_sock_destruct(struct sock *sk) > +{ > + struct mp_struct *mp = container_of(sk, struct mp_sock, sk)->mp; > + kfree(mp); > +} > + > +static int do_unbind(struct mp_file *mfile) > +{ > + struct mp_struct *mp = mp_get(mfile); > + > + if (!mp) > + return -EINVAL; > + > + mp_detach(mp); > + sock_put(mp->socket.sk); > + mp_put(mfile); > + return 0; > +} > + > +static void mp_sock_data_ready(struct sock *sk, int len) > +{ > + if (sk_has_sleeper(sk)) > + wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN); > +} > + > +static void mp_sock_write_space(struct sock *sk) > +{ > + if (sk_has_sleeper(sk)) > + wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT); > +} > + > +static long mp_chr_ioctl(struct file *file, unsigned int cmd, > + unsigned long arg) > +{ > + struct mp_file *mfile = file->private_data; > + struct mp_struct *mp; > + struct net_device *dev; > + void __user* argp = (void __user *)arg; > + struct ifreq ifr; > + struct sock *sk; > + int ret; > + > + ret = -EINVAL; > + > + switch (cmd) { > + case MPASSTHRU_BINDDEV: > + ret = -EFAULT; > + if (copy_from_user(&ifr, argp, sizeof ifr)) > + break; > + > + ifr.ifr_name[IFNAMSIZ-1] = '\0'; > + > + ret = -EBUSY; > + > + if (ifr.ifr_flags & IFF_MPASSTHRU_EXCL) > + break; > + > + ret = -ENODEV; > + dev = dev_get_by_name(mfile->net, ifr.ifr_name); > + if (!dev) > + break; > + > + mutex_lock(&mp_mutex); > + > + ret = -EBUSY; > + mp = mfile->mp; > + if (mp) > + goto err_dev_put; > + > + mp = kzalloc(sizeof(*mp), GFP_KERNEL); > + if (!mp) { > + ret = -ENOMEM; > + goto err_dev_put; > + } > 
+ mp->dev = dev; > + ret = -ENOMEM; > + > + sk = sk_alloc(mfile->net, AF_UNSPEC, GFP_KERNEL, &mp_proto); > + if (!sk) > + goto err_free_mp; > + > + init_waitqueue_head(&mp->socket.wait); > + mp->socket.ops = &mp_socket_ops; > + sock_init_data(&mp->socket, sk); > + sk->sk_sndbuf = INT_MAX; > + container_of(sk, struct mp_sock, sk)->mp = mp; > + > + sk->sk_destruct = mp_sock_destruct; > + sk->sk_data_ready = mp_sock_data_ready; > + sk->sk_write_space = mp_sock_write_space; > + > + ret = mp_attach(mp, file); > + if (ret < 0) > + goto err_free_sk; > + > + ret = page_ctor_attach(mp); > + if (ret < 0) > + goto err_free_sk; > + > + ifr.ifr_flags |= IFF_MPASSTHRU_EXCL; > + mp_dev_change_flags(mp->dev, mp->dev->flags | IFF_UP); > +out: > + mutex_unlock(&mp_mutex); > + break; > +err_free_sk: > + sk_free(sk); > +err_free_mp: > + kfree(mp); > +err_dev_put: > + dev_put(dev); > + goto out; > + > + case MPASSTHRU_UNBINDDEV: > + ret = do_unbind(mfile); > + break; > + > + default: > + break; > + } > + return ret; > +} > + > +static unsigned int mp_chr_poll(struct file *file, poll_table * wait) > +{ > + struct mp_file *mfile = file->private_data; > + struct mp_struct *mp = mp_get(mfile); > + struct sock *sk; > + unsigned int mask = 0; > + > + if (!mp) > + return POLLERR; > + > + sk = mp->socket.sk; > + > + poll_wait(file, &mp->socket.wait, wait); > + > + if (!skb_queue_empty(&sk->sk_receive_queue)) > + mask |= POLLIN | POLLRDNORM; > + > + if (sock_writeable(sk) || > + (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags) && > + sock_writeable(sk))) > + mask |= POLLOUT | POLLWRNORM; > + > + if (mp->dev->reg_state != NETREG_REGISTERED) > + mask = POLLERR; > + > + mp_put(mfile); > + return mask; > +} > + > +static int mp_chr_close(struct inode *inode, struct file *file) > +{ > + struct mp_file *mfile = file->private_data; > + > + /* > + * Ignore return value since an error only means there was nothing to > + * do > + */ > + do_unbind(mfile); > + > + put_net(mfile->net); > + 
kfree(mfile); > + > + return 0; > +} > + > +static const struct file_operations mp_fops = { > + .owner = THIS_MODULE, > + .llseek = no_llseek, > + .poll = mp_chr_poll, > + .unlocked_ioctl = mp_chr_ioctl, > + .open = mp_chr_open, > + .release = mp_chr_close, > +}; > + > +static struct miscdevice mp_miscdev = { > + .minor = MISC_DYNAMIC_MINOR, > + .name = "mp", > + .nodename = "net/mp", > + .fops = &mp_fops, > +}; > + > +static int mp_device_event(struct notifier_block *unused, > + unsigned long event, void *ptr) > +{ > + struct net_device *dev = ptr; > + struct mpassthru_port *port; > + struct mp_struct *mp = NULL; > + struct socket *sock = NULL; > + > + port = dev->mp_port; > + if (port == NULL) > + return NOTIFY_DONE; > + > + switch (event) { > + case NETDEV_UNREGISTER: > + sock = dev->mp_port->sock; > + mp = container_of(sock->sk, struct mp_sock, sk)->mp; > + do_unbind(mp->mfile); > + break; > + } > + return NOTIFY_DONE; > +} > + > +static struct notifier_block mp_notifier_block __read_mostly = { > + .notifier_call = mp_device_event, > +}; > + > +static int mp_init(void) > +{ > + int ret = 0; > + > + ret = misc_register(&mp_miscdev); > + if (ret) > + printk(KERN_ERR "mp: Can't register misc device\n"); > + else { > + printk(KERN_INFO "Registering mp misc device - minor = %d\n", > + mp_miscdev.minor); > + register_netdevice_notifier(&mp_notifier_block); > + } > + return ret; > +} > + > +void mp_cleanup(void) > +{ > + unregister_netdevice_notifier(&mp_notifier_block); > + misc_deregister(&mp_miscdev); > +} > + > +/* Get an underlying socket object from mp file. Returns error unless file is > + * attached to a device. The returned object works like a packet socket, it > + * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for > + * holding a reference to the file for as long as the socket is in use. 
*/ > +struct socket *mp_get_socket(struct file *file) > +{ > + struct mp_file *mfile = file->private_data; > + struct mp_struct *mp; > + > + if (file->f_op != &mp_fops) > + return ERR_PTR(-EINVAL); > + mp = mp_get(mfile); > + if (!mp) > + return ERR_PTR(-EBADFD); > + mp_put(mfile); > + return &mp->socket; > +} > +EXPORT_SYMBOL_GPL(mp_get_socket); > + > +module_init(mp_init); > +module_exit(mp_cleanup); > +MODULE_AUTHOR(DRV_COPYRIGHT); > +MODULE_DESCRIPTION(DRV_DESCRIPTION); > +MODULE_LICENSE("GPL v2"); > diff --git a/include/linux/mpassthru.h b/include/linux/mpassthru.h > new file mode 100644 > index 0000000..2be21c5 > --- /dev/null > +++ b/include/linux/mpassthru.h > @@ -0,0 +1,29 @@ > +#ifndef __MPASSTHRU_H > +#define __MPASSTHRU_H > + > +#include <linux/types.h> > +#include <linux/if_ether.h> > + > +/* ioctl defines */ > +#define MPASSTHRU_BINDDEV _IOW('M', 213, int) > +#define MPASSTHRU_UNBINDDEV _IOW('M', 214, int) > + > +/* MPASSTHRU ifc flags */ > +#define IFF_MPASSTHRU 0x0001 > +#define IFF_MPASSTHRU_EXCL 0x0002 > + > +#ifdef __KERNEL__ > +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE) > +struct socket *mp_get_socket(struct file *); > +#else > +#include <linux/err.h> > +#include <linux/errno.h> > +struct file; > +struct socket; > +static inline struct socket *mp_get_socket(struct file *f) > +{ > + return ERR_PTR(-EINVAL); > +} > +#endif /* CONFIG_VHOST_PASSTHRU */ > +#endif /* __KERNEL__ */ > +#endif /* __MPASSTHRU_H */ > -- > 1.5.4.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html