On 07/21/2014 09:23 PM, Razya Ladelsky wrote:
> Hello All,
>
> When vhost is waiting for buffers from the guest driver (e.g., more
> packets to send in vhost-net's transmit queue), it normally goes to
> sleep and waits for the guest to "kick" it. This kick involves a PIO in
> the guest, and therefore an exit (and possibly userspace involvement in
> translating this PIO exit into a file descriptor event), all of which
> hurts performance.
>
> If the system is under-utilized (has cpu time to spare), vhost can
> continuously poll the virtqueues for new buffers, and avoid asking
> the guest to kick us.
> This patch adds an optional polling mode to vhost, which can be enabled
> via a kernel module parameter, "poll_start_rate".
>
> When polling is active for a virtqueue, the guest is asked to
> disable notification (kicks), and the worker thread continuously checks
> for new buffers. When it does discover new buffers, it simulates a
> "kick" by invoking the underlying backend driver (such as vhost-net),
> which thinks it got a real kick from the guest, and acts accordingly.
> If the underlying driver asks not to be kicked, we disable polling on
> this virtqueue.
>
> We start polling on a virtqueue when we notice it has work to do.
> Polling on this virtqueue is later disabled after 3 seconds of polling
> turning up no new work, as in this case we are better off returning to
> the exit-based notification mechanism. The default timeout of 3 seconds
> can be changed with the "poll_stop_idle" kernel module parameter.
>
> This polling approach makes a lot of sense for new HW with posted
> interrupts, for which we have exitless host-to-guest notifications.
> But even with support for posted interrupts, guest-to-host
> communication still causes exits. Polling adds the missing part.
>
> When systems are overloaded, there won't be enough cpu time for the
> various vhost threads to poll their guests' devices. For these
> scenarios, we plan to add support for vhost threads that can be shared
> by multiple devices, even of multiple VMs.
> Our ultimate goal is to implement the I/O acceleration features
> described in:
> KVM Forum 2013: Efficient and Scalable Virtio (by Abel Gordon)
> https://www.youtube.com/watch?v=9EyweibHfEs
> and
> https://www.mail-archive.com/kvm@xxxxxxxxxxxxxxx/msg98179.html
>
> Comments are welcome,
> Thank you,
> Razya

Thanks for the work. Do you have perf numbers for this?

Also, it looks like the patch only polls the virtqueues. In the future,
it may be worth adding callbacks so that vhost-net can poll its socket
as well. Combined with rx busy polling in the host, that may speed up
the rx path too.
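To make that second point concrete, I'm thinking of something along
these lines (just a sketch; the callback name and hook point are made
up, but vhost-net does keep its socket in vq->private_data):

static bool vhost_net_rx_poll(struct vhost_virtqueue *vq)
{
        struct socket *sock = vq->private_data;

        /* rx work is pending if the host socket has queued packets */
        return sock && !skb_queue_empty(&sock->sk->sk_receive_queue);
}

The polling loop could then invoke such a backend callback in addition
to checking the avail ring.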
> From: Razya Ladelsky <razya@xxxxxxxxxx>
>
> Add an optional polling mode to continuously poll the virtqueues
> for new buffers, and avoid asking the guest to kick us.
>
> Signed-off-by: Razya Ladelsky <razya@xxxxxxxxxx>
> ---
>  drivers/vhost/net.c   |    6 +-
>  drivers/vhost/scsi.c  |    5 +-
>  drivers/vhost/vhost.c |  247 +++++++++++++++++++++++++++++++++++++++++++++++--
>  drivers/vhost/vhost.h |   37 +++++++-
>  4 files changed, 277 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 971a760..558aecb 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -742,8 +742,10 @@ static int vhost_net_open(struct inode *inode, struct file *f)
>  	}
>  	vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX);
>
> -	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
> -	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);
> +	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT,
> +			vqs[VHOST_NET_VQ_TX]);
> +	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN,
> +			vqs[VHOST_NET_VQ_RX]);
>
>  	f->private_data = n;
>
> diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
> index 4f4ffa4..56f0233 100644
> --- a/drivers/vhost/scsi.c
> +++ b/drivers/vhost/scsi.c
> @@ -1528,9 +1528,8 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
>  	if (!vqs)
>  		goto err_vqs;
>
> -	vhost_work_init(&vs->vs_completion_work, vhost_scsi_complete_cmd_work);
> -	vhost_work_init(&vs->vs_event_work, tcm_vhost_evt_work);
> -
> +	vhost_work_init(&vs->vs_completion_work, NULL,
> +			vhost_scsi_complete_cmd_work);
> +	vhost_work_init(&vs->vs_event_work, NULL, tcm_vhost_evt_work);
>  	vs->vs_events_nr = 0;
>  	vs->vs_events_missed = false;
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index c90f437..678d766 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -24,9 +24,17 @@
>  #include <linux/slab.h>
>  #include <linux/kthread.h>
>  #include <linux/cgroup.h>
> +#include <linux/jiffies.h>
>  #include <linux/module.h>
>
>  #include "vhost.h"
> +static int poll_start_rate = 0;
> +module_param(poll_start_rate, int, S_IRUGO|S_IWUSR);
> +MODULE_PARM_DESC(poll_start_rate, "Start continuous polling of virtqueue when rate of events is at least this number per jiffy. If 0, never start polling.");
> +
> +static int poll_stop_idle = 3*HZ; /* 3 seconds */
> +module_param(poll_stop_idle, int, S_IRUGO|S_IWUSR);
> +MODULE_PARM_DESC(poll_stop_idle, "Stop continuous polling of virtqueue after this many jiffies of no work.");

I'm not sure using jiffies is good enough here, since users need to
know the HZ value to set these parameters. It may be worth looking at
sk_busy_loop(), which uses sched_clock() and microseconds instead.
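For example (untested sketch; the "_us" parameter and the helper are
illustrative names, not existing vhost code):

/* A microsecond-based knob, mirroring what sk_busy_loop() does. */
static unsigned int poll_stop_idle_us = 3 * USEC_PER_SEC;
module_param(poll_stop_idle_us, uint, S_IRUGO|S_IWUSR);
MODULE_PARM_DESC(poll_stop_idle_us, "Stop continuous polling of virtqueue after this many microseconds of no work.");

static inline unsigned long vhost_poll_now_us(void)
{
        /* sched_clock() is in ns; >> 10 approximates /1000, which is
         * the same cheap conversion the socket busy-poll code uses. */
        return (unsigned long)(sched_clock() >> 10);
}

roundrobin_poll() would then compare vhost_poll_now_us() against the
last time the vq had work, and users never need to know HZ.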
>  enum {
>  	VHOST_MEMORY_MAX_NREGIONS = 64,
> @@ -58,27 +66,27 @@ static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync,
>  	return 0;
>  }
>
> -void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
> +void vhost_work_init(struct vhost_work *work, struct vhost_virtqueue *vq, vhost_work_fn_t fn)
>  {
>  	INIT_LIST_HEAD(&work->node);
>  	work->fn = fn;
>  	init_waitqueue_head(&work->done);
>  	work->flushing = 0;
>  	work->queue_seq = work->done_seq = 0;
> +	work->vq = vq;
>  }
>  EXPORT_SYMBOL_GPL(vhost_work_init);
>
>  /* Init poll structure */
>  void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
> -		     unsigned long mask, struct vhost_dev *dev)
> +		     unsigned long mask, struct vhost_virtqueue *vq)
>  {
>  	init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
>  	init_poll_funcptr(&poll->table, vhost_poll_func);
>  	poll->mask = mask;
> -	poll->dev = dev;
> +	poll->dev = vq->dev;
>  	poll->wqh = NULL;
> -
> -	vhost_work_init(&poll->work, fn);
> +	vhost_work_init(&poll->work, vq, fn);
>  }
>  EXPORT_SYMBOL_GPL(vhost_poll_init);
>
> @@ -174,6 +182,86 @@ void vhost_poll_queue(struct vhost_poll *poll)
>  }
>  EXPORT_SYMBOL_GPL(vhost_poll_queue);
>
> +/* Enable or disable virtqueue polling (vqpoll.enabled) for a virtqueue.
> + *
> + * Enabling this mode tells the guest not to notify ("kick") us when it
> + * has made more work available on this virtqueue; rather, we will
> + * continuously poll this virtqueue in the worker thread. If multiple
> + * virtqueues are polled, the worker thread polls them all, e.g., in a
> + * round-robin fashion.
> + * Note that vqpoll.enabled doesn't always mean that this virtqueue is
> + * actually being polled: The backend (e.g., net.c) may temporarily
> + * disable it using vhost_disable/enable_notify(), while vqpoll.enabled
> + * is unchanged.
> + *
> + * It is assumed that these functions are called relatively rarely, when
> + * vhost notices that this virtqueue's usage pattern significantly
> + * changed in a way that makes polling more efficient than notification,
> + * or vice versa.
> + * Also, we assume that vhost_vq_disable_vqpoll() is always called on vq
> + * cleanup, so any allocations done by vhost_vq_enable_vqpoll() can be
> + * reclaimed.
> + */
> +static void vhost_vq_enable_vqpoll(struct vhost_virtqueue *vq)
> +{
> +	if (vq->vqpoll.enabled)
> +		return; /* already enabled, nothing to do */
> +	if (!vq->handle_kick)
> +		return; /* polling will be a waste of time if no callback! */
> +	if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY)) {
> +		/* vq has guest notifications enabled. Disable them,
> +		   and instead add vq to the polling list */
> +		list_add_tail(&vq->vqpoll.link, &vq->dev->vqpoll_list);

This may work when there are at most two vqs in the list. But consider
that you may want to poll a lot of vqs in the future; it may then take
a long time for this vq to get polled. So we can probably just keep
used_flags untouched: if the vq gets kicked, it can still be served
soon.
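Something like this (a sketch of the variant I mean, not the patch's
code):

static void vhost_vq_enable_vqpoll(struct vhost_virtqueue *vq)
{
        if (vq->vqpoll.enabled)
                return;
        if (!vq->handle_kick)
                return;
        /* Keep used_flags (and thus guest kicks) untouched; polling
         * just runs in parallel, so a kicked vq is still served
         * quickly even when the polling list is long. */
        list_add_tail(&vq->vqpoll.link, &vq->dev->vqpoll_list);
        vq->vqpoll.jiffies_last_kick = jiffies;
        vq->vqpoll.enabled = true;
}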
> +	}
> +	vq->vqpoll.jiffies_last_kick = jiffies;
> +	__get_user(vq->avail_idx, &vq->avail->idx);
> +	vq->vqpoll.enabled = true;
> +
> +	/* Map userspace's vq->avail to the kernel's memory space. */
> +	if (get_user_pages_fast((unsigned long)vq->avail, 1, 0,
> +		&vq->vqpoll.avail_page) != 1) {
> +		/* TODO: can this happen, as we check access
> +		   to vq->avail in advance? */
> +		BUG();
> +	}
> +	vq->vqpoll.avail_mapped = (struct vring_avail *) (
> +		(unsigned long)kmap(vq->vqpoll.avail_page) |
> +		((unsigned long)vq->avail & ~PAGE_MASK));

Is it a must to map the avail page here? Isn't it enough to do
__get_user(vq->avail_idx, &vq->avail->idx) and then compare it with
vq->last_avail_idx in roundrobin_poll()?
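I.e., something like this (sketch; vq_has_new_work() is a made-up
helper name):

static bool vq_has_new_work(struct vhost_virtqueue *vq)
{
        u16 avail_idx;

        /* No kmap()/get_user_pages_fast() needed; just read the avail
         * index from guest memory on every poll. */
        if (__get_user(avail_idx, &vq->avail->idx))
                return false; /* transient fault; retry on next poll */
        return avail_idx != vq->last_avail_idx;
}

roundrobin_poll() could call this instead of dereferencing
vqpoll.avail_mapped.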
> +}
> +
> +/*
> + * This function doesn't always succeed in changing the mode. Sometimes
> + * a temporary race condition prevents turning on guest notifications,
> + * so vq should be polled next time again.
> + */
> +static void vhost_vq_disable_vqpoll(struct vhost_virtqueue *vq)
> +{
> +	if (!vq->vqpoll.enabled) {
> +		return; /* already disabled, nothing to do */
> +	}
> +	vq->vqpoll.enabled = false;
> +
> +	if (!list_empty(&vq->vqpoll.link)) {
> +		/* vq is on the polling list, remove it from this list and
> +		 * instead enable guest notifications. */
> +		list_del_init(&vq->vqpoll.link);
> +		if (unlikely(vhost_enable_notify(vq->dev, vq))
> +			&& !vq->vqpoll.shutdown) {
> +			/* Race condition: guest wrote before we enabled
> +			 * notification, so we'll never get a notification for
> +			 * this work - so continue polling mode for a while. */
> +			vhost_disable_notify(vq->dev, vq);
> +			vq->vqpoll.enabled = true;
> +			vhost_enable_notify(vq->dev, vq);
> +			return;
> +		}
> +	}
> +
> +	if (vq->vqpoll.avail_mapped) {
> +		kunmap(vq->vqpoll.avail_page);
> +		put_page(vq->vqpoll.avail_page);
> +		vq->vqpoll.avail_mapped = 0;
> +	}
> +}
> +
>  static void vhost_vq_reset(struct vhost_dev *dev,
>  			   struct vhost_virtqueue *vq)
>  {
> @@ -199,6 +287,48 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>  	vq->call = NULL;
>  	vq->log_ctx = NULL;
>  	vq->memory = NULL;
> +	INIT_LIST_HEAD(&vq->vqpoll.link);
> +	vq->vqpoll.enabled = false;
> +	vq->vqpoll.shutdown = false;
> +	vq->vqpoll.avail_mapped = NULL;
> +}
> +
> +/* roundrobin_poll() takes worker->vqpoll_list, and returns one of the
> + * virtqueues which the caller should kick, or NULL in case none should
> + * be kicked. roundrobin_poll() also disables polling on a virtqueue
> + * which has been polled for too long without success.
> + *
> + * This current implementation (the "round-robin" implementation) only
> + * polls the first vq in the list, returning it or NULL as appropriate,
> + * and moves this vq to the end of the list, so next time a different
> + * one is polled.
> + */
> +static struct vhost_virtqueue *roundrobin_poll(struct list_head *list) {
> +	struct vhost_virtqueue *vq;
> +	u16 avail_idx;
> +
> +	if (list_empty(list))
> +		return NULL;
> +
> +	vq = list_first_entry(list, struct vhost_virtqueue, vqpoll.link);
> +	WARN_ON(!vq->vqpoll.enabled);
> +	list_move_tail(&vq->vqpoll.link, list);
> +
> +	/* See if there is any new work available from the guest. */
> +	/* TODO: can check the optional idx feature, and if we haven't
> +	 * reached that idx yet, don't kick... */
> +	avail_idx = vq->vqpoll.avail_mapped->idx;
> +	if (avail_idx != vq->last_avail_idx) {
> +		return vq;
> +	}
> +	if (jiffies > vq->vqpoll.jiffies_last_kick + poll_stop_idle) {
> +		/* We've been polling this virtqueue for a long time with no
> +		 * results, so switch back to guest notification
> +		 */
> +		vhost_vq_disable_vqpoll(vq);
> +	}
> +	return NULL;
> +}
>
>  static int vhost_worker(void *data)
> @@ -237,12 +367,66 @@ static int vhost_worker(void *data)
>  		spin_unlock_irq(&dev->work_lock);
>
>  		if (work) {
> +			struct vhost_virtqueue *vq = work->vq;
>  			__set_current_state(TASK_RUNNING);
>  			work->fn(work);
> +			/* Keep track of the work rate, for deciding when to
> +			 * enable polling */
> +			if (vq) {
> +				if (vq->vqpoll.jiffies_last_work != jiffies) {
> +					vq->vqpoll.jiffies_last_work = jiffies;
> +					vq->vqpoll.work_this_jiffy = 0;
> +				}
> +				vq->vqpoll.work_this_jiffy++;
> +			}
> +			/* If vq is in the round-robin list of virtqueues being
> +			 * constantly checked by this thread, move vq to the
> +			 * end of the queue, because it had its fair chance
> +			 * now.
> +			 */
> +			if (vq && !list_empty(&vq->vqpoll.link)) {
> +				list_move_tail(&vq->vqpoll.link,
> +					&dev->vqpoll_list);
> +			}
> +			/* Otherwise, if this vq is looking for notifications
> +			 * but vq polling is not enabled for it, do it now.
> +			 */
> +			else if (poll_start_rate && vq && vq->handle_kick &&
> +				!vq->vqpoll.enabled &&
> +				!vq->vqpoll.shutdown &&
> +				!(vq->used_flags & VRING_USED_F_NO_NOTIFY) &&
> +				vq->vqpoll.work_this_jiffy >=
> +					poll_start_rate) {
> +				vhost_vq_enable_vqpoll(vq);
> +			}
> +		}
> +		/* Check one virtqueue from the round-robin list */
> +		if (!list_empty(&dev->vqpoll_list)) {

If we still have other work in the work_list, we may want to serve it
first, so polling never starves queued work (see the sketch at the end
of this mail).

[...]

>  struct vhost_dev {
> @@ -123,6 +151,7 @@ struct vhost_dev {
>  	spinlock_t work_lock;
>  	struct list_head work_list;
>  	struct task_struct *worker;
> +	struct list_head vqpoll_list;
>  };
>
>  void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs);
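Regarding serving the work_list first, I mean roughly this in
vhost_worker() (sketch, not the patch's code):

        /* Poll only when no queued work is pending, so the work_list
         * is always served before polled vqs. */
        if (list_empty(&dev->work_list) &&
            !list_empty(&dev->vqpoll_list)) {
                struct vhost_virtqueue *vq;

                vq = roundrobin_poll(&dev->vqpoll_list);
                if (vq) {
                        __set_current_state(TASK_RUNNING);
                        vq->handle_kick(&vq->poll.work);
                }
        }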