"Michael S. Tsirkin" <mst@xxxxxxxxxx> writes: > On Mon, Jul 13, 2015 at 12:07:32AM -0400, Bandan Das wrote: >> vhost threads are per-device, but in most cases a single thread >> is enough. This change creates a single thread that is used to >> serve all guests. >> >> However, this complicates cgroups associations. The current policy >> is to attach the per-device thread to all cgroups of the parent process >> that the device is associated with. This is no longer possible if we >> have a single thread. So, we end up moving the thread around to >> cgroups of whichever device needs servicing. This is a very >> inefficient protocol but seems to be the only way to integrate >> cgroups support. >> >> Signed-off-by: Razya Ladelsky <razya@xxxxxxxxxx> >> Signed-off-by: Bandan Das <bsd@xxxxxxxxxx> > > BTW, how does this interact with virtio net MQ? > It would seem that MQ gains from more parallelism and > CPU locality. Hmm... Good point. As of this version, this design will always have one worker thread servicing a guest. Now suppose we have 10 virtio queues for a guest; surely we could benefit from spawning off another worker, just like we are doing in the case of a new guest/device with the devs_per_worker parameter.
>> --- >> drivers/vhost/scsi.c | 15 +++-- >> drivers/vhost/vhost.c | 150 ++++++++++++++++++++++++-------------------------- >> drivers/vhost/vhost.h | 19 +++++-- >> 3 files changed, 97 insertions(+), 87 deletions(-) >> >> diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c >> index ea32b38..6c42936 100644 >> --- a/drivers/vhost/scsi.c >> +++ b/drivers/vhost/scsi.c >> @@ -535,7 +535,7 @@ static void vhost_scsi_complete_cmd(struct vhost_scsi_cmd *cmd) >> >> llist_add(&cmd->tvc_completion_list, &vs->vs_completion_list); >> >> - vhost_work_queue(&vs->dev, &vs->vs_completion_work); >> + vhost_work_queue(vs->dev.worker, &vs->vs_completion_work); >> } >> >> static int vhost_scsi_queue_data_in(struct se_cmd *se_cmd) >> @@ -1282,7 +1282,7 @@ vhost_scsi_send_evt(struct vhost_scsi *vs, >> } >> >> llist_add(&evt->list, &vs->vs_event_list); >> - vhost_work_queue(&vs->dev, &vs->vs_event_work); >> + vhost_work_queue(vs->dev.worker, &vs->vs_event_work); >> } >> >> static void vhost_scsi_evt_handle_kick(struct vhost_work *work) >> @@ -1335,8 +1335,8 @@ static void vhost_scsi_flush(struct vhost_scsi *vs) >> /* Flush both the vhost poll and vhost work */ >> for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) >> vhost_scsi_flush_vq(vs, i); >> - vhost_work_flush(&vs->dev, &vs->vs_completion_work); >> - vhost_work_flush(&vs->dev, &vs->vs_event_work); >> + vhost_work_flush(vs->dev.worker, &vs->vs_completion_work); >> + vhost_work_flush(vs->dev.worker, &vs->vs_event_work); >> >> /* Wait for all reqs issued before the flush to be finished */ >> for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) >> @@ -1584,8 +1584,11 @@ static int vhost_scsi_open(struct inode *inode, struct file *f) >> if (!vqs) >> goto err_vqs; >> >> - vhost_work_init(&vs->vs_completion_work, vhost_scsi_complete_cmd_work); >> - vhost_work_init(&vs->vs_event_work, vhost_scsi_evt_work); >> + vhost_work_init(&vs->dev, &vs->vs_completion_work, >> + vhost_scsi_complete_cmd_work); >> + >> + vhost_work_init(&vs->dev, &vs->vs_event_work, >> + 
vhost_scsi_evt_work); >> >> vs->vs_events_nr = 0; >> vs->vs_events_missed = false; >> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c >> index 2ee2826..951c96b 100644 >> --- a/drivers/vhost/vhost.c >> +++ b/drivers/vhost/vhost.c >> @@ -11,6 +11,8 @@ >> * Generic code for virtio server in host kernel. >> */ >> >> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt >> + >> #include <linux/eventfd.h> >> #include <linux/vhost.h> >> #include <linux/uio.h> >> @@ -28,6 +30,9 @@ >> >> #include "vhost.h" >> >> +/* Just one worker thread to service all devices */ >> +static struct vhost_worker *worker; >> + >> enum { >> VHOST_MEMORY_MAX_NREGIONS = 64, >> VHOST_MEMORY_F_LOG = 0x1, >> @@ -58,13 +63,15 @@ static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync, >> return 0; >> } >> >> -void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn) >> +void vhost_work_init(struct vhost_dev *dev, >> + struct vhost_work *work, vhost_work_fn_t fn) >> { >> INIT_LIST_HEAD(&work->node); >> work->fn = fn; >> init_waitqueue_head(&work->done); >> work->flushing = 0; >> work->queue_seq = work->done_seq = 0; >> + work->dev = dev; >> } >> EXPORT_SYMBOL_GPL(vhost_work_init); >> >> @@ -78,7 +85,7 @@ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, >> poll->dev = dev; >> poll->wqh = NULL; >> >> - vhost_work_init(&poll->work, fn); >> + vhost_work_init(dev, &poll->work, fn); >> } >> EXPORT_SYMBOL_GPL(vhost_poll_init); >> >> @@ -116,30 +123,30 @@ void vhost_poll_stop(struct vhost_poll *poll) >> } >> EXPORT_SYMBOL_GPL(vhost_poll_stop); >> >> -static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work, >> - unsigned seq) >> +static bool vhost_work_seq_done(struct vhost_worker *worker, >> + struct vhost_work *work, unsigned seq) >> { >> int left; >> >> - spin_lock_irq(&dev->work_lock); >> + spin_lock_irq(&worker->work_lock); >> left = seq - work->done_seq; >> - spin_unlock_irq(&dev->work_lock); >> + spin_unlock_irq(&worker->work_lock); >> 
return left <= 0; >> } >> >> -void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) >> +void vhost_work_flush(struct vhost_worker *worker, struct vhost_work *work) >> { >> unsigned seq; >> int flushing; >> >> - spin_lock_irq(&dev->work_lock); >> + spin_lock_irq(&worker->work_lock); >> seq = work->queue_seq; >> work->flushing++; >> - spin_unlock_irq(&dev->work_lock); >> - wait_event(work->done, vhost_work_seq_done(dev, work, seq)); >> - spin_lock_irq(&dev->work_lock); >> + spin_unlock_irq(&worker->work_lock); >> + wait_event(work->done, vhost_work_seq_done(worker, work, seq)); >> + spin_lock_irq(&worker->work_lock); >> flushing = --work->flushing; >> - spin_unlock_irq(&dev->work_lock); >> + spin_unlock_irq(&worker->work_lock); >> BUG_ON(flushing < 0); >> } >> EXPORT_SYMBOL_GPL(vhost_work_flush); >> @@ -148,29 +155,30 @@ EXPORT_SYMBOL_GPL(vhost_work_flush); >> * locks that are also used by the callback. */ >> void vhost_poll_flush(struct vhost_poll *poll) >> { >> - vhost_work_flush(poll->dev, &poll->work); >> + vhost_work_flush(poll->dev->worker, &poll->work); >> } >> EXPORT_SYMBOL_GPL(vhost_poll_flush); >> >> -void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) >> +void vhost_work_queue(struct vhost_worker *worker, >> + struct vhost_work *work) >> { >> unsigned long flags; >> >> - spin_lock_irqsave(&dev->work_lock, flags); >> + spin_lock_irqsave(&worker->work_lock, flags); >> if (list_empty(&work->node)) { >> - list_add_tail(&work->node, &dev->work_list); >> + list_add_tail(&work->node, &worker->work_list); >> work->queue_seq++; >> - spin_unlock_irqrestore(&dev->work_lock, flags); >> - wake_up_process(dev->worker); >> + spin_unlock_irqrestore(&worker->work_lock, flags); >> + wake_up_process(worker->thread); >> } else { >> - spin_unlock_irqrestore(&dev->work_lock, flags); >> + spin_unlock_irqrestore(&worker->work_lock, flags); >> } >> } >> EXPORT_SYMBOL_GPL(vhost_work_queue); >> >> void vhost_poll_queue(struct vhost_poll *poll) >> { 
>> - vhost_work_queue(poll->dev, &poll->work); >> + vhost_work_queue(poll->dev->worker, &poll->work); >> } >> EXPORT_SYMBOL_GPL(vhost_poll_queue); >> >> @@ -203,19 +211,18 @@ static void vhost_vq_reset(struct vhost_dev *dev, >> >> static int vhost_worker(void *data) >> { >> - struct vhost_dev *dev = data; >> + struct vhost_worker *worker = data; >> struct vhost_work *work = NULL; >> unsigned uninitialized_var(seq); >> mm_segment_t oldfs = get_fs(); >> >> set_fs(USER_DS); >> - use_mm(dev->mm); >> >> for (;;) { >> /* mb paired w/ kthread_stop */ >> set_current_state(TASK_INTERRUPTIBLE); >> >> - spin_lock_irq(&dev->work_lock); >> + spin_lock_irq(&worker->work_lock); >> if (work) { >> work->done_seq = seq; >> if (work->flushing) >> @@ -223,21 +230,35 @@ static int vhost_worker(void *data) >> } >> >> if (kthread_should_stop()) { >> - spin_unlock_irq(&dev->work_lock); >> + spin_unlock_irq(&worker->work_lock); >> __set_current_state(TASK_RUNNING); >> break; >> } >> - if (!list_empty(&dev->work_list)) { >> - work = list_first_entry(&dev->work_list, >> + if (!list_empty(&worker->work_list)) { >> + work = list_first_entry(&worker->work_list, >> struct vhost_work, node); >> list_del_init(&work->node); >> seq = work->queue_seq; >> } else >> work = NULL; >> - spin_unlock_irq(&dev->work_lock); >> + spin_unlock_irq(&worker->work_lock); >> >> if (work) { >> + struct vhost_dev *dev = work->dev; >> + >> __set_current_state(TASK_RUNNING); >> + >> + if (current->mm != dev->mm) { >> + unuse_mm(current->mm); >> + use_mm(dev->mm); >> + } >> + >> + /* TODO: Consider a more elegant solution */ >> + if (worker->owner != dev->owner) { >> + /* Should check for return value */ >> + cgroup_attach_task_all(dev->owner, current); >> + worker->owner = dev->owner; >> + } >> work->fn(work); >> if (need_resched()) >> schedule(); >> @@ -245,7 +266,6 @@ static int vhost_worker(void *data) >> schedule(); >> >> } >> - unuse_mm(dev->mm); >> set_fs(oldfs); >> return 0; >> } >> @@ -304,9 +324,8 @@ void 
vhost_dev_init(struct vhost_dev *dev, >> dev->log_file = NULL; >> dev->memory = NULL; >> dev->mm = NULL; >> - spin_lock_init(&dev->work_lock); >> - INIT_LIST_HEAD(&dev->work_list); >> - dev->worker = NULL; >> + dev->worker = worker; >> + dev->owner = current; >> >> for (i = 0; i < dev->nvqs; ++i) { >> vq = dev->vqs[i]; >> @@ -331,31 +350,6 @@ long vhost_dev_check_owner(struct vhost_dev *dev) >> } >> EXPORT_SYMBOL_GPL(vhost_dev_check_owner); >> >> -struct vhost_attach_cgroups_struct { >> - struct vhost_work work; >> - struct task_struct *owner; >> - int ret; >> -}; >> - >> -static void vhost_attach_cgroups_work(struct vhost_work *work) >> -{ >> - struct vhost_attach_cgroups_struct *s; >> - >> - s = container_of(work, struct vhost_attach_cgroups_struct, work); >> - s->ret = cgroup_attach_task_all(s->owner, current); >> -} >> - >> -static int vhost_attach_cgroups(struct vhost_dev *dev) >> -{ >> - struct vhost_attach_cgroups_struct attach; >> - >> - attach.owner = current; >> - vhost_work_init(&attach.work, vhost_attach_cgroups_work); >> - vhost_work_queue(dev, &attach.work); >> - vhost_work_flush(dev, &attach.work); >> - return attach.ret; >> -} >> - >> /* Caller should have device mutex */ >> bool vhost_dev_has_owner(struct vhost_dev *dev) >> { >> @@ -366,7 +360,6 @@ EXPORT_SYMBOL_GPL(vhost_dev_has_owner); >> /* Caller should have device mutex */ >> long vhost_dev_set_owner(struct vhost_dev *dev) >> { >> - struct task_struct *worker; >> int err; >> >> /* Is there an owner already? 
*/ >> @@ -377,28 +370,15 @@ long vhost_dev_set_owner(struct vhost_dev *dev) >> >> /* No owner, become one */ >> dev->mm = get_task_mm(current); >> - worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid); >> - if (IS_ERR(worker)) { >> - err = PTR_ERR(worker); >> - goto err_worker; >> - } >> - >> dev->worker = worker; >> - wake_up_process(worker); /* avoid contributing to loadavg */ >> - >> - err = vhost_attach_cgroups(dev); >> - if (err) >> - goto err_cgroup; >> >> err = vhost_dev_alloc_iovecs(dev); >> if (err) >> - goto err_cgroup; >> + goto err_alloc; >> >> return 0; >> -err_cgroup: >> - kthread_stop(worker); >> +err_alloc: >> dev->worker = NULL; >> -err_worker: >> if (dev->mm) >> mmput(dev->mm); >> dev->mm = NULL; >> @@ -472,11 +452,6 @@ void vhost_dev_cleanup(struct vhost_dev *dev, bool locked) >> /* No one will access memory at this point */ >> kfree(dev->memory); >> dev->memory = NULL; >> - WARN_ON(!list_empty(&dev->work_list)); >> - if (dev->worker) { >> - kthread_stop(dev->worker); >> - dev->worker = NULL; >> - } >> if (dev->mm) >> mmput(dev->mm); >> dev->mm = NULL; >> @@ -1567,11 +1542,32 @@ EXPORT_SYMBOL_GPL(vhost_disable_notify); >> >> static int __init vhost_init(void) >> { >> + struct vhost_worker *w = >> + kzalloc(sizeof(*w), GFP_KERNEL); >> + if (!w) >> + return -ENOMEM; >> + >> + w->thread = kthread_create(vhost_worker, >> + w, "vhost-worker"); >> + if (IS_ERR(w->thread)) >> + return PTR_ERR(w->thread); >> + >> + worker = w; >> + spin_lock_init(&worker->work_lock); >> + INIT_LIST_HEAD(&worker->work_list); >> + wake_up_process(worker->thread); >> + pr_info("Created universal thread to service requests\n"); >> + >> return 0; >> } >> >> static void __exit vhost_exit(void) >> { >> + if (worker) { >> + kthread_stop(worker->thread); >> + WARN_ON(!list_empty(&worker->work_list)); >> + kfree(worker); >> + } >> } >> >> module_init(vhost_init); >> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h >> index 8c1c792..2f204ce 100644 >> --- 
a/drivers/vhost/vhost.h >> +++ b/drivers/vhost/vhost.h >> @@ -22,6 +22,7 @@ struct vhost_work { >> int flushing; >> unsigned queue_seq; >> unsigned done_seq; >> + struct vhost_dev *dev; >> }; >> >> /* Poll a file (eventfd or socket) */ >> @@ -35,8 +36,8 @@ struct vhost_poll { >> struct vhost_dev *dev; >> }; >> >> -void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn); >> -void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work); >> +void vhost_work_init(struct vhost_dev *dev, >> + struct vhost_work *work, vhost_work_fn_t fn); >> >> void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, >> unsigned long mask, struct vhost_dev *dev); >> @@ -44,7 +45,6 @@ int vhost_poll_start(struct vhost_poll *poll, struct file *file); >> void vhost_poll_stop(struct vhost_poll *poll); >> void vhost_poll_flush(struct vhost_poll *poll); >> void vhost_poll_queue(struct vhost_poll *poll); >> -void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work); >> long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp); >> >> struct vhost_log { >> @@ -116,11 +116,22 @@ struct vhost_dev { >> int nvqs; >> struct file *log_file; >> struct eventfd_ctx *log_ctx; >> + /* vhost shared worker */ >> + struct vhost_worker *worker; >> + /* for cgroup support */ >> + struct task_struct *owner; >> +}; >> + >> +struct vhost_worker { >> spinlock_t work_lock; >> struct list_head work_list; >> - struct task_struct *worker; >> + struct task_struct *thread; >> + struct task_struct *owner; >> }; >> >> +void vhost_work_queue(struct vhost_worker *worker, >> + struct vhost_work *work); >> +void vhost_work_flush(struct vhost_worker *worker, struct vhost_work *work); >> void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs); >> long vhost_dev_set_owner(struct vhost_dev *dev); >> bool vhost_dev_has_owner(struct vhost_dev *dev); >> -- >> 2.4.3 -- To unsubscribe from this list: send the line "unsubscribe cgroups" in the body of a 
message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html