On Sun, May 30, 2010 at 10:24:01PM +0200, Tejun Heo wrote: > Replace vhost_workqueue with per-vhost kthread. Other than callback > argument change from struct work_struct * to struct vhost_poll *, > there's no visible change to vhost_poll_*() interface. I would prefer a substructure vhost_work, even just to make the code easier to review and compare to workqueue.c. > This conversion is to make each vhost use a dedicated kthread so that > resource control via cgroup can be applied. > > Partially based on Sridhar Samudrala's patch. > > Cc: Michael S. Tsirkin <mst@xxxxxxxxxx> > Cc: Sridhar Samudrala <samudrala.sridhar@xxxxxxxxx> > --- > Okay, here is three patch series to convert vhost to use per-vhost > kthread, add cgroup_attach_task_current_cg() and apply it to the vhost > kthreads. The conversion is mostly straight forward although flush is > slightly tricky. > > The problem is that I have no idea how to test this. It's a 3 step process: 1. Install qemu-kvm under fc13, or build recent one from source, get it from here: git://git.kernel.org/pub/scm/virt/kvm/qemu-kvm.git 2. install guest under it: qemu-img create -f qcow2 disk.qcow2 100G Now get some image (e.g. fedora 13 in fc13.iso) and install guest: qemu-kvm -enable-kvm -m 1G -cdrom fc13.iso -drive file=disk.qcow2 3. set up networking. I usually simply do host to guest on a special subnet for testing purposes: Set up a bridge named mstbr0: ifconfig mstbr0 down brctl delbr mstbr0 brctl addbr mstbr0 brctl setfd mstbr0 0 ifconfig mstbr0 11.0.0.1 cat > ifup << EOF #!/bin/sh -x /sbin/ifconfig msttap0 0.0.0.0 up brctl addif mstbr0 msttap0 EOF qemu-kvm -enable-kvm -m 1G -cdrom fc13.iso -drive file=disk.qcow2 -net nic,model=virtio,netdev=foo -netdev tap,id=foo,ifname=msttap0,script=/home/mst/ifup,downscript=no,vhost=on after you set up the guest, log into it and ifconfig eth0 11.0.0.2 You should now be able to ping guest to host and back. Use something like netperf to stress the connection. Close qemu with kill -9 and unload module to test flushing code. > Index: work/drivers/vhost/vhost.c > =================================================================== > --- work.orig/drivers/vhost/vhost.c > +++ work/drivers/vhost/vhost.c ... > @@ -125,10 +139,50 @@ static void vhost_vq_reset(struct vhost_ > vq->log_ctx = NULL; > } > > +static int vhost_poller(void *data) > +{ > + struct vhost_dev *dev = data; > + struct vhost_poll *poll; > + > +repeat: > + set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ > + > + if (kthread_should_stop()) { > + __set_current_state(TASK_RUNNING); > + return 0; > + } > + > + poll = NULL; > + spin_lock(&dev->poller_lock); > + if (!list_empty(&dev->poll_list)) { > + poll = list_first_entry(&dev->poll_list, > + struct vhost_poll, node); > + list_del_init(&poll->node); > + } > + spin_unlock(&dev->poller_lock); > + > + if (poll) { > + __set_current_state(TASK_RUNNING); > + poll->fn(poll); > + smp_wmb(); /* paired with rmb in vhost_poll_flush() */ > + poll->done_seq = poll->queue_seq; > + wake_up_all(&poll->done); This seems to add wakeups on data path, which uses spinlocks etc. OTOH workqueue.c adds a special barrier entry which only does a wakeup when needed. Right? > + } else > + schedule(); > + > + goto repeat; > +} > + > long vhost_dev_init(struct vhost_dev *dev, > struct vhost_virtqueue *vqs, int nvqs) > { > + struct task_struct *poller; > int i; > + > + poller = kthread_create(vhost_poller, dev, "vhost-%d", current->pid); > + if (IS_ERR(poller)) > + return PTR_ERR(poller); > + > dev->vqs = vqs; > dev->nvqs = nvqs; > mutex_init(&dev->mutex); > @@ -136,6 +190,9 @@ long vhost_dev_init(struct vhost_dev *de > dev->log_file = NULL; > dev->memory = NULL; > dev->mm = NULL; > + spin_lock_init(&dev->poller_lock); > + INIT_LIST_HEAD(&dev->poll_list); > + dev->poller = poller; > > for (i = 0; i < dev->nvqs; ++i) { > dev->vqs[i].dev = dev; > @@ -143,8 +200,7 @@ long vhost_dev_init(struct vhost_dev *de > vhost_vq_reset(dev, dev->vqs + i); > if (dev->vqs[i].handle_kick) > vhost_poll_init(&dev->vqs[i].poll, > - dev->vqs[i].handle_kick, > - POLLIN); > + dev->vqs[i].handle_kick, POLLIN, dev); > } > return 0; > } > @@ -217,6 +273,8 @@ void vhost_dev_cleanup(struct vhost_dev > if (dev->mm) > mmput(dev->mm); > dev->mm = NULL; > + > + kthread_stop(dev->poller); > } > > static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz) > @@ -1113,16 +1171,3 @@ void vhost_disable_notify(struct vhost_v > vq_err(vq, "Failed to enable notification at %p: %d\n", > &vq->used->flags, r); > } > - > -int vhost_init(void) > -{ > - vhost_workqueue = create_singlethread_workqueue("vhost"); > - if (!vhost_workqueue) > - return -ENOMEM; > - return 0; > -} > - > -void vhost_cleanup(void) > -{ > - destroy_workqueue(vhost_workqueue); I note that destroy_workqueue does a flush, kthread_stop doesn't. Right? Sure we don't need to check nothing is in one of the lists? Maybe add a BUG_ON? > -} > Index: work/drivers/vhost/vhost.h > =================================================================== > --- work.orig/drivers/vhost/vhost.h > +++ work/drivers/vhost/vhost.h > @@ -5,7 +5,6 @@ > #include <linux/vhost.h> > #include <linux/mm.h> > #include <linux/mutex.h> > -#include <linux/workqueue.h> > #include <linux/poll.h> > #include <linux/file.h> > #include <linux/skbuff.h> > @@ -20,19 +19,26 @@ enum { > VHOST_NET_MAX_SG = MAX_SKB_FRAGS + 2, > }; > > +struct vhost_poll; > +typedef void (*vhost_poll_fn_t)(struct vhost_poll *poll); > + > /* Poll a file (eventfd or socket) */ > /* Note: there's nothing vhost specific about this structure. */ > struct vhost_poll { > + vhost_poll_fn_t fn; > poll_table table; > wait_queue_head_t *wqh; > wait_queue_t wait; > - /* struct which will handle all actual work. */ > - struct work_struct work; > + struct list_head node; > + wait_queue_head_t done; > unsigned long mask; > + struct vhost_dev *dev; > + int queue_seq; > + int done_seq; > }; > > -void vhost_poll_init(struct vhost_poll *poll, work_func_t func, > - unsigned long mask); > +void vhost_poll_init(struct vhost_poll *poll, vhost_poll_fn_t fn, > + unsigned long mask, struct vhost_dev *dev); > void vhost_poll_start(struct vhost_poll *poll, struct file *file); > void vhost_poll_stop(struct vhost_poll *poll); > void vhost_poll_flush(struct vhost_poll *poll); > @@ -63,7 +69,7 @@ struct vhost_virtqueue { > struct vhost_poll poll; > > /* The routine to call when the Guest pings us, or timeout. */ > - work_func_t handle_kick; > + vhost_poll_fn_t handle_kick; > > /* Last available index we saw. */ > u16 last_avail_idx; > @@ -86,11 +92,11 @@ struct vhost_virtqueue { > struct iovec hdr[VHOST_NET_MAX_SG]; > size_t hdr_size; > /* We use a kind of RCU to access private pointer. > - * All readers access it from workqueue, which makes it possible to > - * flush the workqueue instead of synchronize_rcu. Therefore readers do > + * All readers access it from poller, which makes it possible to > + * flush the vhost_poll instead of synchronize_rcu. Therefore readers do > * not need to call rcu_read_lock/rcu_read_unlock: the beginning of > - * work item execution acts instead of rcu_read_lock() and the end of > - * work item execution acts instead of rcu_read_lock(). > + * vhost_poll execution acts instead of rcu_read_lock() and the end of > + * vhost_poll execution acts instead of rcu_read_lock(). > * Writers use virtqueue mutex. */ > void *private_data; > /* Log write descriptors */ > @@ -110,6 +116,9 @@ struct vhost_dev { > int nvqs; > struct file *log_file; > struct eventfd_ctx *log_ctx; > + spinlock_t poller_lock; > + struct list_head poll_list; > + struct task_struct *poller; > }; > > long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs); > @@ -136,9 +145,6 @@ bool vhost_enable_notify(struct vhost_vi > int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, > unsigned int log_num, u64 len); > > -int vhost_init(void); > -void vhost_cleanup(void); > - > #define vq_err(vq, fmt, ...) do { \ > pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \ > if ((vq)->error_ctx) \ -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html