apply the patch, and disable CONFIG_SCHED_DEBUG, the result is worse, the cpu rate of host os is 143%. CONFIG_WORKQUEUE_TRACER is already disable. 2010/11/24 Sridhar Samudrala <sri@xxxxxxxxxx>: > On 11/23/2010 5:41 AM, Michael S. Tsirkin wrote: >> >> On Tue, Nov 23, 2010 at 09:23:41PM +0800, lidong chen wrote: >>> >>> At this point, I'd suggest testing vhost-net on the upstream kernel, >>> not on rhel kernels. The change that introduced per-device threads is: >>> c23f3445e68e1db0e74099f264bc5ff5d55ebdeb >>> i will try this tomorrow. >>> >>> Is CONFIG_SCHED_DEBUG set? >>> yes. CONFIG_SCHED_DEBUG=y. >> >> Disable it. Either debug scheduler or perf-test it :) > > Another debug option to disable is CONFIG_WORKQUEUE_TRACER if it is set > when using old rhel6 kernels. > > -Sridhar > >>> 2010/11/23 Michael S. Tsirkin<mst@xxxxxxxxxx>: >>>> >>>> On Tue, Nov 23, 2010 at 10:13:43AM +0800, lidong chen wrote: >>>>> >>>>> I test the performance between per-vhost kthread disable and enable. >>>>> >>>>> Test method: >>>>> Send the same traffic load between per-vhost kthread disable and >>>>> enable, and compare the cpu rate of host os. >>>>> I run five vm on kvm, each of them have five nic. >>>>> the vhost version which per-vhost kthread disable we used is rhel6 >>>>> beta 2(2.6.32.60). >>>>> the vhost version which per-vhost kthread enable we used is rhel6 >>>>> (2.6.32-71). >>>> >>>> At this point, I'd suggest testing vhost-net on the upstream kernel, >>>> not on rhel kernels. The change that introduced per-device threads is: >>>> c23f3445e68e1db0e74099f264bc5ff5d55ebdeb >>>> >>>>> Test result: >>>>> with per-vhost kthread disable, the cpu rate of host os is 110%. >>>>> with per-vhost kthread enable, the cpu rate of host os is 130%. >>>> >>>> Is CONFIG_SCHED_DEBUG set? We are stressing the scheduler a lot with >>>> vhost-net. >>>> >>>>> In 2.6.32.60,the whole system only have a kthread. >>>>> [root@rhel6-kvm1 ~]# ps -ef | grep vhost >>>>> root 973 2 0 Nov22 ? 00:00:00 [vhost] >>>>> >>>>> In 2.6.32.71,the whole system have 25 kthread. >>>>> [root@kvm-4slot ~]# ps -ef | grep vhost- >>>>> root 12896 2 0 10:26 ? 00:00:00 [vhost-12842] >>>>> root 12897 2 0 10:26 ? 00:00:00 [vhost-12842] >>>>> root 12898 2 0 10:26 ? 00:00:00 [vhost-12842] >>>>> root 12899 2 0 10:26 ? 00:00:00 [vhost-12842] >>>>> root 12900 2 0 10:26 ? 00:00:00 [vhost-12842] >>>>> >>>>> root 13022 2 0 10:26 ? 00:00:00 [vhost-12981] >>>>> root 13023 2 0 10:26 ? 00:00:00 [vhost-12981] >>>>> root 13024 2 0 10:26 ? 00:00:00 [vhost-12981] >>>>> root 13025 2 0 10:26 ? 00:00:00 [vhost-12981] >>>>> root 13026 2 0 10:26 ? 00:00:00 [vhost-12981] >>>>> >>>>> root 13146 2 0 10:26 ? 00:00:00 [vhost-13088] >>>>> root 13147 2 0 10:26 ? 00:00:00 [vhost-13088] >>>>> root 13148 2 0 10:26 ? 00:00:00 [vhost-13088] >>>>> root 13149 2 0 10:26 ? 00:00:00 [vhost-13088] >>>>> root 13150 2 0 10:26 ? 00:00:00 [vhost-13088] >>>>> ... >>>>> >>>>> Code difference: >>>>> In 2.6.32.60,in function vhost_init, create the kthread for vhost. >>>>> vhost_workqueue = create_singlethread_workqueue("vhost"); >>>>> >>>>> In 2.6.32.71,in function vhost_dev_set_owner, create the kthread for >>>>> each nic interface. >>>>> dev->wq = create_singlethread_workqueue(vhost_name); >>>>> >>>>> Conclusion: >>>>> with per-vhost kthread enable, the system can more throughput. >>>>> but deal the same traffic load with per-vhost kthread enable, it waste >>>>> more cpu resource. >>>>> >>>>> In my application scene, the cpu resource is more important, and one >>>>> kthread for deal with traffic load is enough. >>>>> >>>>> So i think we should add a param to control this. >>>>> for the CPU-bound system, this param disable per-vhost kthread. >>>>> for the I/O-bound system, this param enable per-vhost kthread. >>>>> the default value of this param is enable. >>>>> >>>>> If my opinion is right, i will give a patch for this. >>>> >>>> Let's try to figure out what the issue is, first. >>>> >>>> -- >>>> MST >>>> > > >
#ifndef _VHOST_H #define _VHOST_H #include <linux/eventfd.h> #include <linux/vhost.h> #include <linux/mm.h> #include <linux/mutex.h> #include <linux/workqueue.h> #include <linux/poll.h> #include <linux/file.h> #include <linux/skbuff.h> #include <linux/uio.h> #include <linux/virtio_config.h> #include <linux/virtio_ring.h> #include <linux/virtio_net.h> #include <asm/atomic.h> struct vhost_device; struct vhost_work; typedef void (*vhost_work_fn_t)(struct vhost_work *work); struct vhost_work { struct list_head node; vhost_work_fn_t fn; wait_queue_head_t done; int flushing; unsigned queue_seq; unsigned done_seq; }; /* Poll a file (eventfd or socket) */ /* Note: there's nothing vhost specific about this structure. */ struct vhost_poll { poll_table table; wait_queue_head_t *wqh; wait_queue_t wait; /* struct which will handle all actual work. */ struct vhost_work work; unsigned long mask; struct vhost_dev *dev; }; void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t func, unsigned long mask, struct vhost_dev *dev); void vhost_poll_start(struct vhost_poll *poll, struct file *file); void vhost_poll_stop(struct vhost_poll *poll); void vhost_poll_flush(struct vhost_poll *poll); void vhost_poll_queue(struct vhost_poll *poll); struct vhost_log { u64 addr; u64 len; }; /* The virtqueue structure describes a queue attached to a device. */ struct vhost_virtqueue { struct vhost_dev *dev; /* The actual ring of buffers. */ struct mutex mutex; unsigned int num; struct vring_desc __user *desc; struct vring_avail __user *avail; struct vring_used __user *used; struct file *kick; struct file *call; struct file *error; struct eventfd_ctx *call_ctx; struct eventfd_ctx *error_ctx; struct eventfd_ctx *log_ctx; struct vhost_poll poll; /* The routine to call when the Guest pings us, or timeout. */ vhost_work_fn_t handle_kick; /* Last available index we saw. */ u16 last_avail_idx; /* Caches available index value from user. */ u16 avail_idx; /* Last index we used. */ u16 last_used_idx; /* Used flags */ u16 used_flags; /* Log writes to used structure. */ bool log_used; u64 log_addr; struct iovec iov[UIO_MAXIOV]; /* hdr is used to store the virtio header. * Since each iovec entry has >= 1 byte length, we never need more than * header length entries to store the header. */ struct iovec hdr[sizeof(struct virtio_net_hdr_mrg_rxbuf)]; struct iovec *indirect; size_t hdr_size; /* We use a kind of RCU to access private pointer. * All readers access it from workqueue, which makes it possible to * flush the workqueue instead of synchronize_rcu. Therefore readers do * not need to call rcu_read_lock/rcu_read_unlock: the beginning of * work item execution acts instead of rcu_read_lock() and the end of * work item execution acts instead of rcu_read_lock(). * Writers use virtqueue mutex. */ void *private_data; /* Log write descriptors */ void __user *log_base; struct vhost_log *log; }; struct vhost_dev { /* Readers use RCU to access memory table pointer * log base pointer and features. * Writers use mutex below.*/ struct vhost_memory *memory; struct mm_struct *mm; struct mutex mutex; unsigned acked_features; struct vhost_virtqueue *vqs; int nvqs; struct file *log_file; struct eventfd_ctx *log_ctx; spinlock_t work_lock; struct list_head work_list; struct task_struct *worker; }; long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs); long vhost_dev_check_owner(struct vhost_dev *); long vhost_dev_reset_owner(struct vhost_dev *); void vhost_dev_cleanup(struct vhost_dev *); long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, unsigned long arg); int vhost_vq_access_ok(struct vhost_virtqueue *vq); int vhost_log_access_ok(struct vhost_dev *); int vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *, struct iovec iov[], unsigned int iov_count, unsigned int *out_num, unsigned int *in_num, struct vhost_log *log, unsigned int *log_num); void vhost_discard_vq_desc(struct vhost_virtqueue *); int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len); void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *); void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *, unsigned int head, int len); void vhost_disable_notify(struct vhost_virtqueue *); bool vhost_enable_notify(struct vhost_virtqueue *); int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, unsigned int log_num, u64 len); int vhost_init(void); void vhost_cleanup(void); #define vq_err(vq, fmt, ...) do { \ pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \ if ((vq)->error_ctx) \ eventfd_signal((vq)->error_ctx, 1);\ } while (0) enum { VHOST_FEATURES = (1 << VIRTIO_F_NOTIFY_ON_EMPTY) | (1 << VIRTIO_RING_F_INDIRECT_DESC) | (1 << VHOST_F_LOG_ALL) | (1 << VHOST_NET_F_VIRTIO_NET_HDR), }; static inline int vhost_has_feature(struct vhost_dev *dev, int bit) { unsigned acked_features = rcu_dereference(dev->acked_features); return acked_features & (1 << bit); } #endif
/* Copyright (C) 2009 Red Hat, Inc. * Author: Michael S. Tsirkin <mst@xxxxxxxxxx> * * This work is licensed under the terms of the GNU GPL, version 2. * * virtio-net server in host kernel. */ #include <linux/compat.h> #include <linux/eventfd.h> #include <linux/vhost.h> #include <linux/virtio_net.h> #include <linux/mmu_context.h> #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/workqueue.h> #include <linux/rcupdate.h> #include <linux/file.h> #include <linux/net.h> #include <linux/if_packet.h> #include <linux/if_arp.h> #include <linux/if_tun.h> #include <linux/if_macvlan.h> #include <net/sock.h> #include "vhost.h" /* Max number of bytes transferred before requeueing the job. * Using this limit prevents one virtqueue from starving others. */ #define VHOST_NET_WEIGHT 0x80000 enum { VHOST_NET_VQ_RX = 0, VHOST_NET_VQ_TX = 1, VHOST_NET_VQ_MAX = 2, }; enum vhost_net_poll_state { VHOST_NET_POLL_DISABLED = 0, VHOST_NET_POLL_STARTED = 1, VHOST_NET_POLL_STOPPED = 2, }; struct vhost_net { struct vhost_dev dev; struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX]; struct vhost_poll poll[VHOST_NET_VQ_MAX]; /* Tells us whether we are polling a socket for TX. * We only do this when socket buffer fills up. * Protected by tx vq lock. */ enum vhost_net_poll_state tx_poll_state; }; /* Pop first len bytes from iovec. Return number of segments used. */ static int move_iovec_hdr(struct iovec *from, struct iovec *to, size_t len, int iov_count) { int seg = 0; size_t size; while (len && seg < iov_count) { size = min(from->iov_len, len); to->iov_base = from->iov_base; to->iov_len = size; from->iov_len -= size; from->iov_base += size; len -= size; ++from; ++to; ++seg; } return seg; } /* Caller must have TX VQ lock */ static void tx_poll_stop(struct vhost_net *net) { if (likely(net->tx_poll_state != VHOST_NET_POLL_STARTED)) return; vhost_poll_stop(net->poll + VHOST_NET_VQ_TX); net->tx_poll_state = VHOST_NET_POLL_STOPPED; } /* Caller must have TX VQ lock */ static void tx_poll_start(struct vhost_net *net, struct socket *sock) { if (unlikely(net->tx_poll_state != VHOST_NET_POLL_STOPPED)) return; vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file); net->tx_poll_state = VHOST_NET_POLL_STARTED; } /* Expects to be always run from workqueue - which acts as * read-size critical section for our kind of RCU. */ static void handle_tx(struct vhost_net *net) { struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX]; unsigned out, in, s; int head; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, .msg_control = NULL, .msg_controllen = 0, .msg_iov = vq->iov, .msg_flags = MSG_DONTWAIT, }; size_t len, total_len = 0; int err, wmem; size_t hdr_size; struct socket *sock = rcu_dereference(vq->private_data); if (!sock) return; wmem = atomic_read(&sock->sk->sk_wmem_alloc); if (wmem >= sock->sk->sk_sndbuf) { mutex_lock(&vq->mutex); tx_poll_start(net, sock); mutex_unlock(&vq->mutex); return; } use_mm(net->dev.mm); mutex_lock(&vq->mutex); vhost_disable_notify(vq); if (wmem < sock->sk->sk_sndbuf / 2) tx_poll_stop(net); hdr_size = vq->hdr_size; for (;;) { head = vhost_get_vq_desc(&net->dev, vq, vq->iov, ARRAY_SIZE(vq->iov), &out, &in, NULL, NULL); /* On error, stop handling until the next kick. */ if (unlikely(head < 0)) break; /* Nothing new? Wait for eventfd to tell us they refilled. */ if (head == vq->num) { wmem = atomic_read(&sock->sk->sk_wmem_alloc); if (wmem >= sock->sk->sk_sndbuf * 3 / 4) { tx_poll_start(net, sock); set_bit(SOCK_ASYNC_NOSPACE, &sock->flags); break; } if (unlikely(vhost_enable_notify(vq))) { vhost_disable_notify(vq); continue; } break; } if (in) { vq_err(vq, "Unexpected descriptor format for TX: " "out %d, int %d\n", out, in); break; } /* Skip header. TODO: support TSO. */ s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out); msg.msg_iovlen = out; len = iov_length(vq->iov, out); /* Sanity check */ if (!len) { vq_err(vq, "Unexpected header len for TX: " "%zd expected %zd\n", iov_length(vq->hdr, s), hdr_size); break; } /* TODO: Check specific error and bomb out unless ENOBUFS? */ err = sock->ops->sendmsg(NULL, sock, &msg, len); if (unlikely(err < 0)) { vhost_discard_vq_desc(vq); tx_poll_start(net, sock); break; } if (err != len) pr_debug("Truncated TX packet: " " len %d != %zd\n", err, len); vhost_add_used_and_signal(&net->dev, vq, head, 0); total_len += len; if (unlikely(total_len >= VHOST_NET_WEIGHT)) { vhost_poll_queue(&vq->poll); break; } } mutex_unlock(&vq->mutex); unuse_mm(net->dev.mm); } /* Expects to be always run from workqueue - which acts as * read-size critical section for our kind of RCU. */ static void handle_rx(struct vhost_net *net) { struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX]; unsigned out, in, log, s; int head; struct vhost_log *vq_log; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, .msg_control = NULL, /* FIXME: get and handle RX aux data. */ .msg_controllen = 0, .msg_iov = vq->iov, .msg_flags = MSG_DONTWAIT, }; struct virtio_net_hdr hdr = { .flags = 0, .gso_type = VIRTIO_NET_HDR_GSO_NONE }; size_t len, total_len = 0; int err; size_t hdr_size; struct socket *sock = rcu_dereference(vq->private_data); if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue)) return; use_mm(net->dev.mm); mutex_lock(&vq->mutex); vhost_disable_notify(vq); hdr_size = vq->hdr_size; vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL; for (;;) { head = vhost_get_vq_desc(&net->dev, vq, vq->iov, ARRAY_SIZE(vq->iov), &out, &in, vq_log, &log); /* On error, stop handling until the next kick. */ if (unlikely(head < 0)) break; /* OK, now we need to know about added descriptors. */ if (head == vq->num) { if (unlikely(vhost_enable_notify(vq))) { /* They have slipped one in as we were * doing that: check again. */ vhost_disable_notify(vq); continue; } /* Nothing new? Wait for eventfd to tell us * they refilled. */ break; } /* We don't need to be notified again. */ if (out) { vq_err(vq, "Unexpected descriptor format for RX: " "out %d, int %d\n", out, in); break; } /* Skip header. TODO: support TSO/mergeable rx buffers. */ s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in); msg.msg_iovlen = in; len = iov_length(vq->iov, in); /* Sanity check */ if (!len) { vq_err(vq, "Unexpected header len for RX: " "%zd expected %zd\n", iov_length(vq->hdr, s), hdr_size); break; } err = sock->ops->recvmsg(NULL, sock, &msg, len, MSG_DONTWAIT | MSG_TRUNC); /* TODO: Check specific error and bomb out unless EAGAIN? */ if (err < 0) { vhost_discard_vq_desc(vq); break; } /* TODO: Should check and handle checksum. */ if (err > len) { pr_debug("Discarded truncated rx packet: " " len %d > %zd\n", err, len); vhost_discard_vq_desc(vq); continue; } len = err; err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size); if (err) { vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n", vq->iov->iov_base, err); break; } len += hdr_size; vhost_add_used_and_signal(&net->dev, vq, head, len); if (unlikely(vq_log)) vhost_log_write(vq, vq_log, log, len); total_len += len; if (unlikely(total_len >= VHOST_NET_WEIGHT)) { vhost_poll_queue(&vq->poll); break; } } mutex_unlock(&vq->mutex); unuse_mm(net->dev.mm); } static void handle_tx_kick(struct vhost_work *work) { struct vhost_virtqueue *vq; struct vhost_net *net; vq = container_of(work, struct vhost_virtqueue, poll.work); net = container_of(vq->dev, struct vhost_net, dev); handle_tx(net); } static void handle_rx_kick(struct vhost_work *work) { struct vhost_virtqueue *vq; struct vhost_net *net; vq = container_of(work, struct vhost_virtqueue, poll.work); net = container_of(vq->dev, struct vhost_net, dev); handle_rx(net); } static void handle_tx_net(struct vhost_work *work) { struct vhost_net *net; net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work); handle_tx(net); } static void handle_rx_net(struct vhost_work *work) { struct vhost_net *net; net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work); handle_rx(net); } static int vhost_net_open(struct inode *inode, struct file *f) { struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL); int r; if (!n) return -ENOMEM; n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick; n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick; r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX); if (r < 0) { kfree(n); return r; } vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, &n->dev); vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, &n->dev); n->tx_poll_state = VHOST_NET_POLL_DISABLED; f->private_data = n; return 0; } static void vhost_net_disable_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { if (!vq->private_data) return; if (vq == n->vqs + VHOST_NET_VQ_TX) { tx_poll_stop(n); n->tx_poll_state = VHOST_NET_POLL_DISABLED; } else vhost_poll_stop(n->poll + VHOST_NET_VQ_RX); } static void vhost_net_enable_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { struct socket *sock = vq->private_data; if (!sock) return; if (vq == n->vqs + VHOST_NET_VQ_TX) { n->tx_poll_state = VHOST_NET_POLL_STOPPED; tx_poll_start(n, sock); } else vhost_poll_start(n->poll + VHOST_NET_VQ_RX, sock->file); } static struct socket *vhost_net_stop_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { struct socket *sock; mutex_lock(&vq->mutex); sock = vq->private_data; vhost_net_disable_vq(n, vq); rcu_assign_pointer(vq->private_data, NULL); mutex_unlock(&vq->mutex); return sock; } static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock, struct socket **rx_sock) { *tx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_TX); *rx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_RX); } static void vhost_net_flush_vq(struct vhost_net *n, int index) { vhost_poll_flush(n->poll + index); vhost_poll_flush(&n->dev.vqs[index].poll); } static void vhost_net_flush(struct vhost_net *n) { vhost_net_flush_vq(n, VHOST_NET_VQ_TX); vhost_net_flush_vq(n, VHOST_NET_VQ_RX); } static int vhost_net_release(struct inode *inode, struct file *f) { struct vhost_net *n = f->private_data; struct socket *tx_sock; struct socket *rx_sock; vhost_net_stop(n, &tx_sock, &rx_sock); vhost_net_flush(n); vhost_dev_cleanup(&n->dev); if (tx_sock) fput(tx_sock->file); if (rx_sock) fput(rx_sock->file); /* We do an extra flush before freeing memory, * since jobs can re-queue themselves. */ vhost_net_flush(n); kfree(n); return 0; } static struct socket *get_raw_socket(int fd) { struct { struct sockaddr_ll sa; char buf[MAX_ADDR_LEN]; } uaddr; int uaddr_len = sizeof uaddr, r; struct socket *sock = sockfd_lookup(fd, &r); if (!sock) return ERR_PTR(-ENOTSOCK); /* Parameter checking */ if (sock->sk->sk_type != SOCK_RAW) { r = -ESOCKTNOSUPPORT; goto err; } r = sock->ops->getname(sock, (struct sockaddr *)&uaddr.sa, &uaddr_len, 0); if (r) goto err; if (uaddr.sa.sll_family != AF_PACKET) { r = -EPFNOSUPPORT; goto err; } return sock; err: fput(sock->file); return ERR_PTR(r); } static struct socket *get_tap_socket(int fd) { struct file *file = fget(fd); struct socket *sock; if (!file) return ERR_PTR(-EBADF); sock = tun_get_socket(file); if (!IS_ERR(sock)) return sock; sock = macvtap_get_socket(file); if (IS_ERR(sock)) fput(file); return sock; } static struct socket *get_socket(int fd) { struct socket *sock; /* special case to disable backend */ if (fd == -1) return NULL; sock = get_raw_socket(fd); if (!IS_ERR(sock)) return sock; sock = get_tap_socket(fd); if (!IS_ERR(sock)) return sock; return ERR_PTR(-ENOTSOCK); } static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) { struct socket *sock, *oldsock; struct vhost_virtqueue *vq; int r; mutex_lock(&n->dev.mutex); r = vhost_dev_check_owner(&n->dev); if (r) goto err; if (index >= VHOST_NET_VQ_MAX) { r = -ENOBUFS; goto err; } vq = n->vqs + index; mutex_lock(&vq->mutex); /* Verify that ring has been setup correctly. */ if (!vhost_vq_access_ok(vq)) { r = -EFAULT; goto err_vq; } sock = get_socket(fd); if (IS_ERR(sock)) { r = PTR_ERR(sock); goto err_vq; } /* start polling new socket */ oldsock = vq->private_data; if (sock == oldsock) goto done; vhost_net_disable_vq(n, vq); rcu_assign_pointer(vq->private_data, sock); vhost_net_enable_vq(n, vq); done: mutex_unlock(&vq->mutex); if (oldsock) { vhost_net_flush_vq(n, index); fput(oldsock->file); } mutex_unlock(&n->dev.mutex); return 0; err_vq: mutex_unlock(&vq->mutex); err: mutex_unlock(&n->dev.mutex); return r; } static long vhost_net_reset_owner(struct vhost_net *n) { struct socket *tx_sock = NULL; struct socket *rx_sock = NULL; long err; mutex_lock(&n->dev.mutex); err = vhost_dev_check_owner(&n->dev); if (err) goto done; vhost_net_stop(n, &tx_sock, &rx_sock); vhost_net_flush(n); err = vhost_dev_reset_owner(&n->dev); done: mutex_unlock(&n->dev.mutex); if (tx_sock) fput(tx_sock->file); if (rx_sock) fput(rx_sock->file); return err; } static int vhost_net_set_features(struct vhost_net *n, u64 features) { size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ? sizeof(struct virtio_net_hdr) : 0; int i; mutex_lock(&n->dev.mutex); if ((features & (1 << VHOST_F_LOG_ALL)) && !vhost_log_access_ok(&n->dev)) { mutex_unlock(&n->dev.mutex); return -EFAULT; } n->dev.acked_features = features; smp_wmb(); for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { mutex_lock(&n->vqs[i].mutex); n->vqs[i].hdr_size = hdr_size; mutex_unlock(&n->vqs[i].mutex); } vhost_net_flush(n); mutex_unlock(&n->dev.mutex); return 0; } static long vhost_net_ioctl(struct file *f, unsigned int ioctl, unsigned long arg) { struct vhost_net *n = f->private_data; void __user *argp = (void __user *)arg; u64 __user *featurep = argp; struct vhost_vring_file backend; u64 features; int r; switch (ioctl) { case VHOST_NET_SET_BACKEND: if (copy_from_user(&backend, argp, sizeof backend)) return -EFAULT; return vhost_net_set_backend(n, backend.index, backend.fd); case VHOST_GET_FEATURES: features = VHOST_FEATURES; if (copy_to_user(featurep, &features, sizeof features)) return -EFAULT; return 0; case VHOST_SET_FEATURES: if (copy_from_user(&features, featurep, sizeof features)) return -EFAULT; if (features & ~VHOST_FEATURES) return -EOPNOTSUPP; return vhost_net_set_features(n, features); case VHOST_RESET_OWNER: return vhost_net_reset_owner(n); default: mutex_lock(&n->dev.mutex); r = vhost_dev_ioctl(&n->dev, ioctl, arg); vhost_net_flush(n); mutex_unlock(&n->dev.mutex); return r; } } #ifdef CONFIG_COMPAT static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl, unsigned long arg) { return vhost_net_ioctl(f, ioctl, (unsigned long)compat_ptr(arg)); } #endif const static struct file_operations vhost_net_fops = { .owner = THIS_MODULE, .release = vhost_net_release, .unlocked_ioctl = vhost_net_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = vhost_net_compat_ioctl, #endif .open = vhost_net_open, }; static struct miscdevice vhost_net_misc = { VHOST_NET_MINOR, "vhost-net", &vhost_net_fops, }; int vhost_net_init(void) { int r = vhost_init(); if (r) goto err_init; r = misc_register(&vhost_net_misc); if (r) goto err_reg; return 0; err_reg: vhost_cleanup(); err_init: return r; } module_init(vhost_net_init); void vhost_net_exit(void) { misc_deregister(&vhost_net_misc); vhost_cleanup(); } module_exit(vhost_net_exit); MODULE_VERSION("0.0.1"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Michael S. Tsirkin"); MODULE_DESCRIPTION("Host kernel accelerator for virtio net");
/* Copyright (C) 2009 Red Hat, Inc. * Copyright (C) 2006 Rusty Russell IBM Corporation * * Author: Michael S. Tsirkin <mst@xxxxxxxxxx> * * Inspiration, some code, and most witty comments come from * Documentation/lguest/lguest.c, by Rusty Russell * * This work is licensed under the terms of the GNU GPL, version 2. * * Generic code for virtio server in host kernel. */ #include <linux/eventfd.h> #include <linux/vhost.h> #include <linux/virtio_net.h> #include <linux/mm.h> #include <linux/miscdevice.h> #include <linux/mutex.h> #include <linux/rcupdate.h> #include <linux/poll.h> #include <linux/file.h> #include <linux/highmem.h> #include <linux/cgroup.h> #include <linux/kthread.h> #include <linux/net.h> #include <linux/if_packet.h> #include <linux/if_arp.h> #include <net/sock.h> #include "vhost.h" enum { VHOST_MEMORY_MAX_NREGIONS = 64, VHOST_MEMORY_F_LOG = 0x1, }; static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt) { struct vhost_poll *poll; poll = container_of(pt, struct vhost_poll, table); poll->wqh = wqh; add_wait_queue(wqh, &poll->wait); } static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) { struct vhost_poll *poll; poll = container_of(wait, struct vhost_poll, wait); if (!((unsigned long)key & poll->mask)) return 0; /* queue_work(poll->dev->wq, &poll->work); */ vhost_poll_queue(poll); return 0; } /* Init poll structure */ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t func, unsigned long mask, struct vhost_dev *dev) { /* INIT_WORK(&poll->work, func); */ struct vhost_work *work = &poll->work; init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup); init_poll_funcptr(&poll->table, vhost_poll_func); poll->mask = mask; poll->dev = dev; INIT_LIST_HEAD(&work->node); work->fn = func; init_waitqueue_head(&work->done); work->flushing = 0; work->queue_seq = work->done_seq = 0; } /* Start polling a file. We add ourselves to file's wait queue. The caller must * keep a reference to a file until after vhost_poll_stop is called. */ void vhost_poll_start(struct vhost_poll *poll, struct file *file) { unsigned long mask; mask = file->f_op->poll(file, &poll->table); if (mask) vhost_poll_wakeup(&poll->wait, 0, 0, (void *)mask); } /* Stop polling a file. After this function returns, it becomes safe to drop the * file reference. You must also flush afterwards. */ void vhost_poll_stop(struct vhost_poll *poll) { remove_wait_queue(poll->wqh, &poll->wait); } /* Flush any work that has been scheduled. When calling this, don't hold any * locks that are also used by the callback. */ void vhost_poll_flush(struct vhost_poll *poll) { /* flush_work(&poll->work); */ struct vhost_work *work = &poll->work; unsigned seq; int left; int flushing; spin_lock_irq(&poll->dev->work_lock); seq = work->queue_seq; work->flushing++; spin_unlock_irq(&poll->dev->work_lock); wait_event(work->done, ({ spin_lock_irq(&poll->dev->work_lock); left = seq - work->done_seq <= 0; spin_unlock_irq(&poll->dev->work_lock); left; })); spin_lock_irq(&poll->dev->work_lock); flushing = --work->flushing; spin_unlock_irq(&poll->dev->work_lock); BUG_ON(flushing < 0); } void vhost_poll_queue(struct vhost_poll *poll) { /* queue_work(poll->dev->wq, &poll->work); */ struct vhost_dev *dev = poll->dev; struct vhost_work *work = &poll->work; unsigned long flags; spin_lock_irqsave(&dev->work_lock, flags); if (list_empty(&work->node)) { list_add_tail(&work->node, &dev->work_list); work->queue_seq++; wake_up_process(dev->worker); } spin_unlock_irqrestore(&dev->work_lock, flags); } static void vhost_vq_reset(struct vhost_dev *dev, struct vhost_virtqueue *vq) { vq->num = 1; vq->desc = NULL; vq->avail = NULL; vq->used = NULL; vq->last_avail_idx = 0; vq->avail_idx = 0; vq->last_used_idx = 0; vq->used_flags = 0; vq->used_flags = 0; vq->log_used = false; vq->log_addr = -1ull; vq->hdr_size = 0; vq->private_data = NULL; vq->log_base = NULL; vq->error_ctx = NULL; vq->error = NULL; vq->kick = NULL; vq->call_ctx = NULL; vq->call = NULL; vq->log_ctx = NULL; } static int vhost_worker(void *data) { struct vhost_dev *dev = data; struct vhost_work *work = NULL; unsigned uninitialized_var(seq); for (;;) { /* mb paired w/ kthread_stop */ set_current_state(TASK_INTERRUPTIBLE); spin_lock_irq(&dev->work_lock); if (work) { work->done_seq = seq; if (work->flushing) wake_up_all(&work->done); } if (kthread_should_stop()) { spin_unlock_irq(&dev->work_lock); __set_current_state(TASK_RUNNING); return 0; } if (!list_empty(&dev->work_list)) { work = list_first_entry(&dev->work_list, struct vhost_work, node); list_del_init(&work->node); seq = work->queue_seq; } else work = NULL; spin_unlock_irq(&dev->work_lock); if (work) { __set_current_state(TASK_RUNNING); work->fn(work); } else schedule(); } } /* Helper to allocate iovec buffers for all vqs. */ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev) { int i; for (i = 0; i < dev->nvqs; ++i) { dev->vqs[i].indirect = kmalloc(GFP_KERNEL, UIO_MAXIOV * sizeof *dev->vqs[i].indirect); dev->vqs[i].log = kmalloc(GFP_KERNEL, UIO_MAXIOV * sizeof *dev->vqs[i].log); if (!dev->vqs[i].indirect || !dev->vqs[i].log) goto err_nomem; } return 0; err_nomem: for (; i >= 0; --i) { kfree(dev->vqs[i].indirect); kfree(dev->vqs[i].log); } return -ENOMEM; } long vhost_dev_init(struct vhost_dev *dev, struct vhost_virtqueue *vqs, int nvqs) { int i, ret; dev->vqs = vqs; dev->nvqs = nvqs; mutex_init(&dev->mutex); dev->log_ctx = NULL; dev->log_file = NULL; dev->memory = NULL; dev->mm = NULL; //dev->wq = NULL; spin_lock_init(&dev->work_lock); INIT_LIST_HEAD(&dev->work_list); dev->worker = NULL; ret = vhost_dev_alloc_iovecs(dev); if (ret) return ret; for (i = 0; i < dev->nvqs; ++i) { dev->vqs[i].dev = dev; mutex_init(&dev->vqs[i].mutex); vhost_vq_reset(dev, dev->vqs + i); if (dev->vqs[i].handle_kick) vhost_poll_init(&dev->vqs[i].poll, dev->vqs[i].handle_kick, POLLIN, dev); } return 0; } /* Caller should have device mutex */ long vhost_dev_check_owner(struct vhost_dev *dev) { /* Are you the owner? If not, I don't think you mean to do that */ return dev->mm == current->mm ? 0 : -EPERM; } /* struct vhost_attach_cgroups_struct { struct work_struct work; struct task_struct *owner; int ret; }; static void vhost_attach_cgroups_work(struct work_struct *work) { struct vhost_attach_cgroups_struct *s; s = container_of(work, struct vhost_attach_cgroups_struct, work); s->ret = cgroup_attach_task_all(s->owner, current); } static int vhost_attach_cgroups(struct workqueue_struct *wq) { struct vhost_attach_cgroups_struct attach; attach.owner = current; INIT_WORK(&attach.work, vhost_attach_cgroups_work); queue_work(wq, &attach.work); flush_work(&attach.work); return attach.ret; } */ /* Caller should have device mutex */ static long vhost_dev_set_owner(struct vhost_dev *dev) { struct task_struct *worker; int err; /* Is there an owner already? */ if (dev->mm) { err = -EBUSY; goto err; } /* No owner, become one */ dev->mm = get_task_mm(current); /* Initialize the workqueue. */ //snprintf(vhost_name, sizeof vhost_name, "vhost-%d", current->pid); //dev->wq = create_singlethread_workqueue(vhost_name); //if (!dev->wq) { // err = -ENOMEM; // goto err_wq; //} worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid); if (IS_ERR(worker)) { err = PTR_ERR(worker); goto err_worker; } dev->worker = worker; wake_up_process(worker); /* err = vhost_attach_cgroups(dev->wq); if (err) goto err_cgroups; */ return 0; /* err_cgroups: destroy_workqueue(dev->wq); dev->wq = NULL; */ err_worker: if (dev->mm) mmput(dev->mm); dev->mm = NULL; err: return err; } /* Caller should have device mutex */ long vhost_dev_reset_owner(struct vhost_dev *dev) { struct vhost_memory *memory; /* Restore memory to default empty mapping. */ memory = kmalloc(offsetof(struct vhost_memory, regions), GFP_KERNEL); if (!memory) return -ENOMEM; vhost_dev_cleanup(dev); memory->nregions = 0; dev->memory = memory; return 0; } /* Caller should have device mutex */ void vhost_dev_cleanup(struct vhost_dev *dev) { int i; for (i = 0; i < dev->nvqs; ++i) { if (dev->vqs[i].kick && dev->vqs[i].handle_kick) { vhost_poll_stop(&dev->vqs[i].poll); vhost_poll_flush(&dev->vqs[i].poll); } if (dev->vqs[i].error_ctx) eventfd_ctx_put(dev->vqs[i].error_ctx); if (dev->vqs[i].error) fput(dev->vqs[i].error); if (dev->vqs[i].kick) fput(dev->vqs[i].kick); if (dev->vqs[i].call_ctx) eventfd_ctx_put(dev->vqs[i].call_ctx); if (dev->vqs[i].call) fput(dev->vqs[i].call); vhost_vq_reset(dev, dev->vqs + i); kfree(dev->vqs[i].indirect); kfree(dev->vqs[i].log); } if (dev->log_ctx) eventfd_ctx_put(dev->log_ctx); dev->log_ctx = NULL; if (dev->log_file) fput(dev->log_file); dev->log_file = NULL; /* No one will access memory at this point */ kfree(dev->memory); dev->memory = NULL; if (dev->mm) mmput(dev->mm); dev->mm = NULL; /* if (dev->wq) destroy_workqueue(dev->wq); dev->wq = NULL; */ WARN_ON(!list_empty(&dev->work_list)); kthread_stop(dev->worker); } static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz) { u64 a = addr / VHOST_PAGE_SIZE / 8; /* Make sure 64 bit math will not overflow. */ if (a > ULONG_MAX - (unsigned long)log_base || a + (unsigned long)log_base > ULONG_MAX) return -EFAULT; return access_ok(VERIFY_WRITE, log_base + a, (sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8); } /* Caller should have vq mutex and device mutex. */ static int vq_memory_access_ok(void __user *log_base, struct vhost_memory *mem, int log_all) { int i; if (!mem) return 0; for (i = 0; i < mem->nregions; ++i) { struct vhost_memory_region *m = mem->regions + i; unsigned long a = m->userspace_addr; if (m->memory_size > ULONG_MAX) return 0; else if (!access_ok(VERIFY_WRITE, (void __user *)a, m->memory_size)) return 0; else if (log_all && !log_access_ok(log_base, m->guest_phys_addr, m->memory_size)) return 0; } return 1; } /* Can we switch to this memory table? */ /* Caller should have device mutex but not vq mutex */ static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem, int log_all) { int i; for (i = 0; i < d->nvqs; ++i) { int ok; mutex_lock(&d->vqs[i].mutex); /* If ring is inactive, will check when it's enabled. */ if (d->vqs[i].private_data) ok = vq_memory_access_ok(d->vqs[i].log_base, mem, log_all); else ok = 1; mutex_unlock(&d->vqs[i].mutex); if (!ok) return 0; } return 1; } static int vq_access_ok(unsigned int num, struct vring_desc __user *desc, struct vring_avail __user *avail, struct vring_used __user *used) { return access_ok(VERIFY_READ, desc, num * sizeof *desc) && access_ok(VERIFY_READ, avail, sizeof *avail + num * sizeof *avail->ring) && access_ok(VERIFY_WRITE, used, sizeof *used + num * sizeof *used->ring); } /* Can we log writes? */ /* Caller should have device mutex but not vq mutex */ int vhost_log_access_ok(struct vhost_dev *dev) { return memory_access_ok(dev, dev->memory, 1); } /* Verify access for write logging. */ /* Caller should have vq mutex and device mutex */ static int vq_log_access_ok(struct vhost_virtqueue *vq, void __user *log_base) { return vq_memory_access_ok(log_base, vq->dev->memory, vhost_has_feature(vq->dev, VHOST_F_LOG_ALL)) && (!vq->log_used || log_access_ok(log_base, vq->log_addr, sizeof *vq->used + vq->num * sizeof *vq->used->ring)); } /* Can we start vq? */ /* Caller should have vq mutex and device mutex */ int vhost_vq_access_ok(struct vhost_virtqueue *vq) { return vq_access_ok(vq->num, vq->desc, vq->avail, vq->used) && vq_log_access_ok(vq, vq->log_base); } static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) { struct vhost_memory mem, *newmem, *oldmem; unsigned long size = offsetof(struct vhost_memory, regions); if (copy_from_user(&mem, m, size)) return -EFAULT; if (mem.padding) return -EOPNOTSUPP; if (mem.nregions > VHOST_MEMORY_MAX_NREGIONS) return -E2BIG; newmem = kmalloc(size + mem.nregions * sizeof *m->regions, GFP_KERNEL); if (!newmem) return -ENOMEM; memcpy(newmem, &mem, size); if (copy_from_user(newmem->regions, m->regions, mem.nregions * sizeof *m->regions)) { kfree(newmem); return -EFAULT; } if (!memory_access_ok(d, newmem, vhost_has_feature(d, VHOST_F_LOG_ALL))) { kfree(newmem); return -EFAULT; } oldmem = d->memory; rcu_assign_pointer(d->memory, newmem); synchronize_rcu(); kfree(oldmem); return 0; } static int init_used(struct vhost_virtqueue *vq, struct vring_used __user *used) { int r = put_user(vq->used_flags, &used->flags); if (r) return r; return get_user(vq->last_used_idx, &used->idx); } static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp) { struct file *eventfp, *filep = NULL, *pollstart = NULL, *pollstop = NULL; struct eventfd_ctx *ctx = NULL; u32 __user *idxp = argp; struct vhost_virtqueue *vq; struct vhost_vring_state s; struct vhost_vring_file f; struct vhost_vring_addr a; u32 idx; long r; r = get_user(idx, idxp); if (r < 0) return r; if (idx >= d->nvqs) return -ENOBUFS; vq = d->vqs + idx; mutex_lock(&vq->mutex); switch (ioctl) { case VHOST_SET_VRING_NUM: /* Resizing ring with an active backend? * You don't want to do that. */ if (vq->private_data) { r = -EBUSY; break; } if (copy_from_user(&s, argp, sizeof s)) { r = -EFAULT; break; } if (!s.num || s.num > 0xffff || (s.num & (s.num - 1))) { r = -EINVAL; break; } vq->num = s.num; break; case VHOST_SET_VRING_BASE: /* Moving base with an active backend? * You don't want to do that. */ if (vq->private_data) { r = -EBUSY; break; } if (copy_from_user(&s, argp, sizeof s)) { r = -EFAULT; break; } if (s.num > 0xffff) { r = -EINVAL; break; } vq->last_avail_idx = s.num; /* Forget the cached index value. */ vq->avail_idx = vq->last_avail_idx; break; case VHOST_GET_VRING_BASE: s.index = idx; s.num = vq->last_avail_idx; if (copy_to_user(argp, &s, sizeof s)) r = -EFAULT; break; case VHOST_SET_VRING_ADDR: if (copy_from_user(&a, argp, sizeof a)) { r = -EFAULT; break; } if (a.flags & ~(0x1 << VHOST_VRING_F_LOG)) { r = -EOPNOTSUPP; break; } /* For 32bit, verify that the top 32bits of the user data are set to zero. */ if ((u64)(unsigned long)a.desc_user_addr != a.desc_user_addr || (u64)(unsigned long)a.used_user_addr != a.used_user_addr || (u64)(unsigned long)a.avail_user_addr != a.avail_user_addr) { r = -EFAULT; break; } if ((a.avail_user_addr & (sizeof *vq->avail->ring - 1)) || (a.used_user_addr & (sizeof *vq->used->ring - 1)) || (a.log_guest_addr & (sizeof *vq->used->ring - 1))) { r = -EINVAL; break; } /* We only verify access here if backend is configured. * If it is not, we don't as size might not have been setup. * We will verify when backend is configured. */ if (vq->private_data) { if (!vq_access_ok(vq->num, (void __user *)(unsigned long)a.desc_user_addr, (void __user *)(unsigned long)a.avail_user_addr, (void __user *)(unsigned long)a.used_user_addr)) { r = -EINVAL; break; } /* Also validate log access for used ring if enabled. */ if ((a.flags & (0x1 << VHOST_VRING_F_LOG)) && !log_access_ok(vq->log_base, a.log_guest_addr, sizeof *vq->used + vq->num * sizeof *vq->used->ring)) { r = -EINVAL; break; } } r = init_used(vq, (struct vring_used __user *)(unsigned long) a.used_user_addr); if (r) break; vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG)); vq->desc = (void __user *)(unsigned long)a.desc_user_addr; vq->avail = (void __user *)(unsigned long)a.avail_user_addr; vq->log_addr = a.log_guest_addr; vq->used = (void __user *)(unsigned long)a.used_user_addr; break; case VHOST_SET_VRING_KICK: if (copy_from_user(&f, argp, sizeof f)) { r = -EFAULT; break; } eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); if (IS_ERR(eventfp)) { r = PTR_ERR(eventfp); break; } if (eventfp != vq->kick) { pollstop = filep = vq->kick; pollstart = vq->kick = eventfp; } else filep = eventfp; break; case VHOST_SET_VRING_CALL: if (copy_from_user(&f, argp, sizeof f)) { r = -EFAULT; break; } eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); if (IS_ERR(eventfp)) { r = PTR_ERR(eventfp); break; } if (eventfp != vq->call) { filep = vq->call; ctx = vq->call_ctx; vq->call = eventfp; vq->call_ctx = eventfp ? eventfd_ctx_fileget(eventfp) : NULL; } else filep = eventfp; break; case VHOST_SET_VRING_ERR: if (copy_from_user(&f, argp, sizeof f)) { r = -EFAULT; break; } eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); if (IS_ERR(eventfp)) { r = PTR_ERR(eventfp); break; } if (eventfp != vq->error) { filep = vq->error; vq->error = eventfp; ctx = vq->error_ctx; vq->error_ctx = eventfp ? eventfd_ctx_fileget(eventfp) : NULL; } else filep = eventfp; break; default: r = -ENOIOCTLCMD; } if (pollstop && vq->handle_kick) vhost_poll_stop(&vq->poll); if (ctx) eventfd_ctx_put(ctx); if (filep) fput(filep); if (pollstart && vq->handle_kick) vhost_poll_start(&vq->poll, vq->kick); mutex_unlock(&vq->mutex); if (pollstop && vq->handle_kick) vhost_poll_flush(&vq->poll); return r; } /* Caller must have device mutex */ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg) { void __user *argp = (void __user *)arg; struct file *eventfp, *filep = NULL; struct eventfd_ctx *ctx = NULL; u64 p; long r; int i, fd; /* If you are not the owner, you can become one */ if (ioctl == VHOST_SET_OWNER) { r = vhost_dev_set_owner(d); goto done; } /* You must be the owner to do anything else */ r = vhost_dev_check_owner(d); if (r) goto done; switch (ioctl) { case VHOST_SET_MEM_TABLE: r = vhost_set_memory(d, argp); break; case VHOST_SET_LOG_BASE: if (copy_from_user(&p, argp, sizeof p)) { r = -EFAULT; break; } if ((u64)(unsigned long)p != p) { r = -EFAULT; break; } for (i = 0; i < d->nvqs; ++i) { struct vhost_virtqueue *vq; void __user *base = (void __user *)(unsigned long)p; vq = d->vqs + i; mutex_lock(&vq->mutex); /* If ring is inactive, will check when it's enabled. */ if (vq->private_data && !vq_log_access_ok(vq, base)) r = -EFAULT; else vq->log_base = base; mutex_unlock(&vq->mutex); } break; case VHOST_SET_LOG_FD: r = get_user(fd, (int __user *)argp); if (r < 0) break; eventfp = fd == -1 ? NULL : eventfd_fget(fd); if (IS_ERR(eventfp)) { r = PTR_ERR(eventfp); break; } if (eventfp != d->log_file) { filep = d->log_file; ctx = d->log_ctx; d->log_ctx = eventfp ? eventfd_ctx_fileget(eventfp) : NULL; } else filep = eventfp; for (i = 0; i < d->nvqs; ++i) { mutex_lock(&d->vqs[i].mutex); d->vqs[i].log_ctx = d->log_ctx; mutex_unlock(&d->vqs[i].mutex); } if (ctx) eventfd_ctx_put(ctx); if (filep) fput(filep); break; default: r = vhost_set_vring(d, ioctl, argp); break; } done: return r; } static const struct vhost_memory_region *find_region(struct vhost_memory *mem, __u64 addr, __u32 len) { struct vhost_memory_region *reg; int i; /* linear search is not brilliant, but we really have on the order of 6 * regions in practice */ for (i = 0; i < mem->nregions; ++i) { reg = mem->regions + i; if (reg->guest_phys_addr <= addr && reg->guest_phys_addr + reg->memory_size - 1 >= addr) return reg; } return NULL; } /* TODO: This is really inefficient. We need something like get_user() * (instruction directly accesses the data, with an exception table entry * returning -EFAULT). See Documentation/x86/exception-tables.txt. */ static int set_bit_to_user(int nr, void __user *addr) { unsigned long log = (unsigned long)addr; struct page *page; void *base; int bit = nr + (log % PAGE_SIZE) * 8; int r; r = get_user_pages_fast(log, 1, 1, &page); if (r < 0) return r; BUG_ON(r != 1); base = kmap_atomic(page, KM_USER0); set_bit(bit, base); kunmap_atomic(base, KM_USER0); set_page_dirty_lock(page); put_page(page); return 0; } static int log_write(void __user *log_base, u64 write_address, u64 write_length) { int r; if (!write_length) return 0; write_address /= VHOST_PAGE_SIZE; for (;;) { u64 base = (u64)(unsigned long)log_base; u64 log = base + write_address / 8; int bit = write_address % 8; if ((u64)(unsigned long)log != log) return -EFAULT; r = set_bit_to_user(bit, (void __user *)(unsigned long)log); if (r < 0) return r; if (write_length <= VHOST_PAGE_SIZE) break; write_length -= VHOST_PAGE_SIZE; write_address += VHOST_PAGE_SIZE; } return r; } int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, unsigned int log_num, u64 len) { int i, r; /* Make sure data written is seen before log. */ smp_wmb(); for (i = 0; i < log_num; ++i) { u64 l = min(log[i].len, len); r = log_write(vq->log_base, log[i].addr, l); if (r < 0) return r; len -= l; if (!len) return 0; } if (vq->log_ctx) eventfd_signal(vq->log_ctx, 1); /* Length written exceeds what we have stored. This is a bug. */ BUG(); return 0; } int translate_desc(struct vhost_dev *dev, u64 addr, u32 len, struct iovec iov[], int iov_size) { const struct vhost_memory_region *reg; struct vhost_memory *mem; struct iovec *_iov; u64 s = 0; int ret = 0; rcu_read_lock(); mem = rcu_dereference(dev->memory); while ((u64)len > s) { u64 size; if (unlikely(ret >= iov_size)) { ret = -ENOBUFS; break; } reg = find_region(mem, addr, len); if (unlikely(!reg)) { ret = -EFAULT; break; } _iov = iov + ret; size = reg->memory_size - addr + reg->guest_phys_addr; _iov->iov_len = min((u64)len, size); _iov->iov_base = (void *)(unsigned long) (reg->userspace_addr + addr - reg->guest_phys_addr); s += size; addr += size; ++ret; } rcu_read_unlock(); return ret; } /* Each buffer in the virtqueues is actually a chain of descriptors. This * function returns the next descriptor in the chain, * or -1U if we're at the end. */ static unsigned next_desc(struct vring_desc *desc) { unsigned int next; /* If this descriptor says it doesn't chain, we're done. */ if (!(desc->flags & VRING_DESC_F_NEXT)) return -1U; /* Check they're not leading us off end of descriptors. */ next = desc->next; /* Make sure compiler knows to grab that: we don't want it changing! */ /* We will use the result as an index in an array, so most * architectures only need a compiler barrier here. */ read_barrier_depends(); return next; } static int get_indirect(struct vhost_dev *dev, struct vhost_virtqueue *vq, struct iovec iov[], unsigned int iov_size, unsigned int *out_num, unsigned int *in_num, struct vhost_log *log, unsigned int *log_num, struct vring_desc *indirect) { struct vring_desc desc; unsigned int i = 0, count, found = 0; int ret; /* Sanity check */ if (unlikely(indirect->len % sizeof desc)) { vq_err(vq, "Invalid length in indirect descriptor: " "len 0x%llx not multiple of 0x%zx\n", (unsigned long long)indirect->len, sizeof desc); return -EINVAL; } ret = translate_desc(dev, indirect->addr, indirect->len, vq->indirect, UIO_MAXIOV); if (unlikely(ret < 0)) { vq_err(vq, "Translation failure %d in indirect.\n", ret); return ret; } /* We will use the result as an address to read from, so most * architectures only need a compiler barrier here. */ read_barrier_depends(); count = indirect->len / sizeof desc; /* Buffers are chained via a 16 bit next field, so * we can have at most 2^16 of these. */ if (unlikely(count > USHORT_MAX + 1)) { vq_err(vq, "Indirect buffer length too big: %d\n", indirect->len); return -E2BIG; } do { unsigned iov_count = *in_num + *out_num; if (unlikely(++found > count)) { vq_err(vq, "Loop detected: last one at %u " "indirect size %u\n", i, count); return -EINVAL; } if (unlikely(memcpy_fromiovec((unsigned char *)&desc, vq->indirect, sizeof desc))) { vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n", i, (size_t)indirect->addr + i * sizeof desc); return -EINVAL; } if (unlikely(desc.flags & VRING_DESC_F_INDIRECT)) { vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n", i, (size_t)indirect->addr + i * sizeof desc); return -EINVAL; } ret = translate_desc(dev, desc.addr, desc.len, iov + iov_count, iov_size - iov_count); if (unlikely(ret < 0)) { vq_err(vq, "Translation failure %d indirect idx %d\n", ret, i); return ret; } /* If this is an input descriptor, increment that count. */ if (desc.flags & VRING_DESC_F_WRITE) { *in_num += ret; if (unlikely(log)) { log[*log_num].addr = desc.addr; log[*log_num].len = desc.len; ++*log_num; } } else { /* If it's an output descriptor, they're all supposed * to come before any input descriptors. */ if (unlikely(*in_num)) { vq_err(vq, "Indirect descriptor " "has out after in: idx %d\n", i); return -EINVAL; } *out_num += ret; } } while ((i = next_desc(&desc)) != -1); return 0; } /* This looks in the virtqueue and for the first available buffer, and converts * it to an iovec for convenient access. Since descriptors consist of some * number of output then some number of input descriptors, it's actually two * iovecs, but we pack them into one and note how many of each there were. * * This function returns the descriptor number found, or vq->num (which is * never a valid descriptor number) if none was found. A negative code is * returned on error. */ int vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq, struct iovec iov[], unsigned int iov_size, unsigned int *out_num, unsigned int *in_num, struct vhost_log *log, unsigned int *log_num) { struct vring_desc desc; unsigned int i, head, found = 0; u16 last_avail_idx; int ret; /* Check it isn't doing very strange things with descriptor numbers. */ last_avail_idx = vq->last_avail_idx; if (unlikely(get_user(vq->avail_idx, &vq->avail->idx))) { vq_err(vq, "Failed to access avail idx at %p\n", &vq->avail->idx); return -EFAULT; } if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) { vq_err(vq, "Guest moved used index from %u to %u", last_avail_idx, vq->avail_idx); return -EFAULT; } /* If there's nothing new since last we looked, return invalid. */ if (vq->avail_idx == last_avail_idx) return vq->num; /* Only get avail ring entries after they have been exposed by guest. */ smp_rmb(); /* Grab the next descriptor number they're advertising, and increment * the index we've seen. */ if (unlikely(get_user(head, &vq->avail->ring[last_avail_idx % vq->num]))) { vq_err(vq, "Failed to read head: idx %d address %p\n", last_avail_idx, &vq->avail->ring[last_avail_idx % vq->num]); return -EFAULT; } /* If their number is silly, that's an error. */ if (unlikely(head >= vq->num)) { vq_err(vq, "Guest says index %u > %u is available", head, vq->num); return -EINVAL; } /* When we start there are none of either input nor output. */ *out_num = *in_num = 0; if (unlikely(log)) *log_num = 0; i = head; do { unsigned iov_count = *in_num + *out_num; if (unlikely(i >= vq->num)) { vq_err(vq, "Desc index is %u > %u, head = %u", i, vq->num, head); return -EINVAL; } if (unlikely(++found > vq->num)) { vq_err(vq, "Loop detected: last one at %u " "vq size %u head %u\n", i, vq->num, head); return -EINVAL; } ret = copy_from_user(&desc, vq->desc + i, sizeof desc); if (unlikely(ret)) { vq_err(vq, "Failed to get descriptor: idx %d addr %p\n", i, vq->desc + i); return -EFAULT; } if (desc.flags & VRING_DESC_F_INDIRECT) { ret = get_indirect(dev, vq, iov, iov_size, out_num, in_num, log, log_num, &desc); if (unlikely(ret < 0)) { vq_err(vq, "Failure detected " "in indirect descriptor at idx %d\n", i); return ret; } continue; } ret = translate_desc(dev, desc.addr, desc.len, iov + iov_count, iov_size - iov_count); if (unlikely(ret < 0)) { vq_err(vq, "Translation failure %d descriptor idx %d\n", ret, i); return ret; } if (desc.flags & VRING_DESC_F_WRITE) { /* If this is an input descriptor, * increment that count. */ *in_num += ret; if (unlikely(log)) { log[*log_num].addr = desc.addr; log[*log_num].len = desc.len; ++*log_num; } } else { /* If it's an output descriptor, they're all supposed * to come before any input descriptors. */ if (unlikely(*in_num)) { vq_err(vq, "Descriptor has out after in: " "idx %d\n", i); return -EINVAL; } *out_num += ret; } } while ((i = next_desc(&desc)) != -1); /* On success, increment avail index. */ vq->last_avail_idx++; return head; } /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */ void vhost_discard_vq_desc(struct vhost_virtqueue *vq) { vq->last_avail_idx--; } /* After we've used one of their buffers, we tell them about it. We'll then * want to notify the guest, using eventfd. */ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len) { struct vring_used_elem *used; /* The virtqueue contains a ring of used buffers. Get a pointer to the * next entry in that used ring. */ used = &vq->used->ring[vq->last_used_idx % vq->num]; if (put_user(head, &used->id)) { vq_err(vq, "Failed to write used id"); return -EFAULT; } if (put_user(len, &used->len)) { vq_err(vq, "Failed to write used len"); return -EFAULT; } /* Make sure buffer is written before we update index. */ smp_wmb(); if (put_user(vq->last_used_idx + 1, &vq->used->idx)) { vq_err(vq, "Failed to increment used idx"); return -EFAULT; } if (unlikely(vq->log_used)) { /* Make sure data is seen before log. */ smp_wmb(); /* Log used ring entry write. */ log_write(vq->log_base, vq->log_addr + ((void *)used - (void *)vq->used), sizeof *used); /* Log used index update. */ log_write(vq->log_base, vq->log_addr + offsetof(struct vring_used, idx), sizeof vq->used->idx); if (vq->log_ctx) eventfd_signal(vq->log_ctx, 1); } vq->last_used_idx++; return 0; } /* This actually signals the guest, using eventfd. */ void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq) { __u16 flags; /* Flush out used index updates. This is paired * with the barrier that the Guest executes when enabling * interrupts. */ smp_mb(); if (get_user(flags, &vq->avail->flags)) { vq_err(vq, "Failed to get flags"); return; } /* If they don't want an interrupt, don't signal, unless empty. */ if ((flags & VRING_AVAIL_F_NO_INTERRUPT) && (vq->avail_idx != vq->last_avail_idx || !vhost_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY))) return; /* Signal the Guest tell them we used something up. */ if (vq->call_ctx) eventfd_signal(vq->call_ctx, 1); } /* And here's the combo meal deal. Supersize me! */ void vhost_add_used_and_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq, unsigned int head, int len) { vhost_add_used(vq, head, len); vhost_signal(dev, vq); } /* OK, now we need to know about added descriptors. */ bool vhost_enable_notify(struct vhost_virtqueue *vq) { u16 avail_idx; int r; if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY)) return false; vq->used_flags &= ~VRING_USED_F_NO_NOTIFY; r = put_user(vq->used_flags, &vq->used->flags); if (r) { vq_err(vq, "Failed to enable notification at %p: %d\n", &vq->used->flags, r); return false; } /* They could have slipped one in as we were doing that: make * sure it's written, then check again. */ smp_mb(); r = get_user(avail_idx, &vq->avail->idx); if (r) { vq_err(vq, "Failed to check avail idx at %p: %d\n", &vq->avail->idx, r); return false; } return avail_idx != vq->last_avail_idx; } /* We don't need to be notified again. */ void vhost_disable_notify(struct vhost_virtqueue *vq) { int r; if (vq->used_flags & VRING_USED_F_NO_NOTIFY) return; vq->used_flags |= VRING_USED_F_NO_NOTIFY; r = put_user(vq->used_flags, &vq->used->flags); if (r) vq_err(vq, "Failed to enable notification at %p: %d\n", &vq->used->flags, r); } int vhost_init(void) { return 0; } void vhost_cleanup(void) { }