Re: [PATCH v6 01/11] vhost: add vhost_worker pointer to vhost_virtqueue

On Mon, Mar 27, 2023 at 09:17:07PM -0500, Mike Christie wrote:
> This patchset allows userspace to map vqs to different workers. This
> patch adds a worker pointer to the vq so we can store that info.
> 
> Signed-off-by: Mike Christie <michael.christie@xxxxxxxxxx>

Thanks! Conflicts with a bunch of refactorings upstream:
could you rebase this on my tree and repost?
I need to queue this soon so it gets time in -next.

> ---
>  drivers/vhost/vhost.c | 24 +++++++++++++-----------
>  drivers/vhost/vhost.h |  1 +
>  2 files changed, 14 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 4368ee9b999c..e041e116afee 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -486,6 +486,7 @@ void vhost_dev_init(struct vhost_dev *dev,
>  		vq->log = NULL;
>  		vq->indirect = NULL;
>  		vq->heads = NULL;
> +		vq->worker = NULL;
>  		vq->dev = dev;
>  		mutex_init(&vq->mutex);
>  		vhost_vq_reset(dev, vq);
> @@ -554,16 +555,15 @@ static void vhost_worker_free(struct vhost_dev *dev)
>  	kfree(worker);
>  }
>  
> -static int vhost_worker_create(struct vhost_dev *dev)
> +static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
>  {
>  	struct vhost_worker *worker;
>  	struct vhost_task *vtsk;
>  	char name[TASK_COMM_LEN];
> -	int ret;
>  
>  	worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
>  	if (!worker)
> -		return -ENOMEM;
> +		return NULL;
>  
>  	dev->worker = worker;
>  	worker->kcov_handle = kcov_common_handle();
> @@ -571,25 +571,24 @@ static int vhost_worker_create(struct vhost_dev *dev)
>  	snprintf(name, sizeof(name), "vhost-%d", current->pid);
>  
>  	vtsk = vhost_task_create(vhost_worker, worker, name);
> -	if (!vtsk) {
> -		ret = -ENOMEM;
> +	if (!vtsk)
>  		goto free_worker;
> -	}
>  
>  	worker->vtsk = vtsk;
>  	vhost_task_start(vtsk);
> -	return 0;
> +	return worker;
>  
>  free_worker:
>  	kfree(worker);
>  	dev->worker = NULL;
> -	return ret;
> +	return NULL;
>  }
>  
>  /* Caller should have device mutex */
>  long vhost_dev_set_owner(struct vhost_dev *dev)
>  {
> -	int err;
> +	struct vhost_worker *worker;
> +	int err, i;
>  
>  	/* Is there an owner already? */
>  	if (vhost_dev_has_owner(dev)) {
> @@ -600,9 +599,12 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
>  	vhost_attach_mm(dev);
>  
>  	if (dev->use_worker) {
> -		err = vhost_worker_create(dev);
> -		if (err)
> +		worker = vhost_worker_create(dev);
> +		if (!worker)
>  			goto err_worker;
> +
> +		for (i = 0; i < dev->nvqs; i++)
> +			dev->vqs[i]->worker = worker;
>  	}
>  
>  	err = vhost_dev_alloc_iovecs(dev);
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 0308638cdeee..e72b665ba3a5 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -74,6 +74,7 @@ struct vhost_vring_call {
>  /* The virtqueue structure describes a queue attached to a device. */
>  struct vhost_virtqueue {
>  	struct vhost_dev *dev;
> +	struct vhost_worker *worker;
>  
>  	/* The actual ring of buffers. */
>  	struct mutex mutex;
> -- 
> 2.25.1

On Mon, Mar 27, 2023 at 09:17:08PM -0500, Mike Christie wrote:
> In the next patches each vq might have a different worker, so one vq
> could have work pending while the others do not. For net, we only want
> to check specific vqs, so this adds a helper to check if a vq has work
> pending and converts vhost-net to use it.
> 
> Signed-off-by: Mike Christie <michael.christie@xxxxxxxxxx>
> ---
>  drivers/vhost/net.c   | 2 +-
>  drivers/vhost/vhost.c | 6 +++---
>  drivers/vhost/vhost.h | 2 +-
>  3 files changed, 5 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 07181cd8d52e..8ed63651b9eb 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -546,7 +546,7 @@ static void vhost_net_busy_poll(struct vhost_net *net,
>  	endtime = busy_clock() + busyloop_timeout;
>  
>  	while (vhost_can_busy_poll(endtime)) {
> -		if (vhost_has_work(&net->dev)) {
> +		if (vhost_vq_has_work(vq)) {
>  			*busyloop_intr = true;
>  			break;
>  		}
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index e041e116afee..6567aed69ebb 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -262,11 +262,11 @@ void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
>  EXPORT_SYMBOL_GPL(vhost_work_queue);
>  
>  /* A lockless hint for busy polling code to exit the loop */
> -bool vhost_has_work(struct vhost_dev *dev)
> +bool vhost_vq_has_work(struct vhost_virtqueue *vq)
>  {
> -	return dev->worker && !llist_empty(&dev->worker->work_list);
> +	return vq->worker && !llist_empty(&vq->worker->work_list);
>  }
> -EXPORT_SYMBOL_GPL(vhost_has_work);
> +EXPORT_SYMBOL_GPL(vhost_vq_has_work);
>  
>  void vhost_poll_queue(struct vhost_poll *poll)
>  {
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index e72b665ba3a5..0dde119fb0ee 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -45,7 +45,6 @@ struct vhost_poll {
>  
>  void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
>  void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
> -bool vhost_has_work(struct vhost_dev *dev);
>  
>  void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
>  		     __poll_t mask, struct vhost_dev *dev);
> @@ -195,6 +194,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
>  		      struct vhost_log *log, unsigned int *log_num);
>  void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
>  
> +bool vhost_vq_has_work(struct vhost_virtqueue *vq);
>  bool vhost_vq_is_setup(struct vhost_virtqueue *vq);
>  int vhost_vq_init_access(struct vhost_virtqueue *);
>  int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
> -- 
> 2.25.1

On Mon, Mar 27, 2023 at 09:17:09PM -0500, Mike Christie wrote:
> This patch has the core work queueing function take a worker for when we
> support multiple workers. It also adds a helper that takes a vq during
> queueing so modules can control which vq/worker to queue work on.
> 
> This temporarily leaves vhost_work_queue in place. It will be removed
> when the drivers are converted in the next patches.
> 
> Signed-off-by: Mike Christie <michael.christie@xxxxxxxxxx>
> ---
>  drivers/vhost/vhost.c | 44 +++++++++++++++++++++++++++----------------
>  drivers/vhost/vhost.h |  1 +
>  2 files changed, 29 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 6567aed69ebb..cc2628ba9a77 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -231,6 +231,34 @@ void vhost_poll_stop(struct vhost_poll *poll)
>  }
>  EXPORT_SYMBOL_GPL(vhost_poll_stop);
>  
> +static void vhost_work_queue_on(struct vhost_worker *worker,
> +				struct vhost_work *work)
> +{
> +	if (!worker)
> +		return;
> +
> +	if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
> +		/* We can only add the work to the list after we're
> +		 * sure it was not in the list.
> +		 * test_and_set_bit() implies a memory barrier.
> +		 */
> +		llist_add(&work->node, &worker->work_list);
> +		wake_up_process(worker->vtsk->task);
> +	}
> +}
> +
> +void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
> +{
> +	vhost_work_queue_on(dev->worker, work);
> +}
> +EXPORT_SYMBOL_GPL(vhost_work_queue);
> +
> +void vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work)
> +{
> +	vhost_work_queue_on(vq->worker, work);
> +}
> +EXPORT_SYMBOL_GPL(vhost_vq_work_queue);
> +
>  void vhost_dev_flush(struct vhost_dev *dev)
>  {
>  	struct vhost_flush_struct flush;
> @@ -245,22 +273,6 @@ void vhost_dev_flush(struct vhost_dev *dev)
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_flush);
>  
> -void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
> -{
> -	if (!dev->worker)
> -		return;
> -
> -	if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
> -		/* We can only add the work to the list after we're
> -		 * sure it was not in the list.
> -		 * test_and_set_bit() implies a memory barrier.
> -		 */
> -		llist_add(&work->node, &dev->worker->work_list);
> -		wake_up_process(dev->worker->vtsk->task);
> -	}
> -}
> -EXPORT_SYMBOL_GPL(vhost_work_queue);
> -
>  /* A lockless hint for busy polling code to exit the loop */
>  bool vhost_vq_has_work(struct vhost_virtqueue *vq)
>  {
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 0dde119fb0ee..b64ee4ef387d 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -194,6 +194,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
>  		      struct vhost_log *log, unsigned int *log_num);
>  void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
>  
> +void vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work);
>  bool vhost_vq_has_work(struct vhost_virtqueue *vq);
>  bool vhost_vq_is_setup(struct vhost_virtqueue *vq);
>  int vhost_vq_init_access(struct vhost_virtqueue *);
> -- 
> 2.25.1
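
For drivers built on top of this, usage is a one-liner once the work struct
is initialized. A minimal sketch (my_dev and my_work_fn are made-up names
for illustration, not part of the patch):

	static void my_work_fn(struct vhost_work *work)
	{
		/* runs in the vhost_worker task bound to the vq */
	}

	/* once, at setup time */
	vhost_work_init(&my_dev->work, my_work_fn);

	/* later: queue onto whichever worker is serving this vq */
	vhost_vq_work_queue(vq, &my_dev->work);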

On Mon, Mar 27, 2023 at 09:17:10PM -0500, Mike Christie wrote:
> This patch has the core work flush function take a worker. When we
> support multiple workers we can then flush each worker during device
> removal, stoppage, etc.
> 
> Signed-off-by: Mike Christie <michael.christie@xxxxxxxxxx>
> ---
>  drivers/vhost/vhost.c | 24 +++++++++++++++---------
>  1 file changed, 15 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index cc2628ba9a77..6160aa1cc922 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -247,6 +247,20 @@ static void vhost_work_queue_on(struct vhost_worker *worker,
>  	}
>  }
>  
> +static void vhost_work_flush_on(struct vhost_worker *worker)
> +{
> +	struct vhost_flush_struct flush;
> +
> +	if (!worker)
> +		return;
> +
> +	init_completion(&flush.wait_event);
> +	vhost_work_init(&flush.work, vhost_flush_work);
> +
> +	vhost_work_queue_on(worker, &flush.work);
> +	wait_for_completion(&flush.wait_event);
> +}
> +
>  void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
>  {
>  	vhost_work_queue_on(dev->worker, work);
> @@ -261,15 +275,7 @@ EXPORT_SYMBOL_GPL(vhost_vq_work_queue);
>  
>  void vhost_dev_flush(struct vhost_dev *dev)
>  {
> -	struct vhost_flush_struct flush;
> -
> -	if (dev->worker) {
> -		init_completion(&flush.wait_event);
> -		vhost_work_init(&flush.work, vhost_flush_work);
> -
> -		vhost_work_queue(dev, &flush.work);
> -		wait_for_completion(&flush.wait_event);
> -	}
> +	vhost_work_flush_on(dev->worker);
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_flush);
>  
> -- 
> 2.25.1

On Mon, Mar 27, 2023 at 09:17:11PM -0500, Mike Christie wrote:
> This has the drivers pass in their poll-to-vq mapping and then converts
> the core poll code to use the vq-based helpers. In the next patches we
> will allow vqs to be handled by different workers, so to allow drivers
> to execute operations like queue, stop, and flush on specific polls/vqs
> we need to know the mappings.
> 
> Signed-off-by: Mike Christie <michael.christie@xxxxxxxxxx>
> ---
>  drivers/vhost/net.c   | 6 ++++--
>  drivers/vhost/vhost.c | 8 +++++---
>  drivers/vhost/vhost.h | 4 +++-
>  3 files changed, 12 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 8ed63651b9eb..4a9b757071a2 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -1342,8 +1342,10 @@ static int vhost_net_open(struct inode *inode, struct file *f)
>  		       VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT, true,
>  		       NULL);
>  
> -	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev);
> -	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev);
> +	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev,
> +			vqs[VHOST_NET_VQ_TX]);
> +	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev,
> +			vqs[VHOST_NET_VQ_RX]);
>  
>  	f->private_data = n;
>  	n->page_frag.page = NULL;
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 6160aa1cc922..6968f8fc17e8 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -187,13 +187,15 @@ EXPORT_SYMBOL_GPL(vhost_work_init);
>  
>  /* Init poll structure */
>  void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
> -		     __poll_t mask, struct vhost_dev *dev)
> +		     __poll_t mask, struct vhost_dev *dev,
> +		     struct vhost_virtqueue *vq)
>  {
>  	init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
>  	init_poll_funcptr(&poll->table, vhost_poll_func);
>  	poll->mask = mask;
>  	poll->dev = dev;
>  	poll->wqh = NULL;
> +	poll->vq = vq;
>  
>  	vhost_work_init(&poll->work, fn);
>  }
> @@ -288,7 +290,7 @@ EXPORT_SYMBOL_GPL(vhost_vq_has_work);
>  
>  void vhost_poll_queue(struct vhost_poll *poll)
>  {
> -	vhost_work_queue(poll->dev, &poll->work);
> +	vhost_vq_work_queue(poll->vq, &poll->work);
>  }
>  EXPORT_SYMBOL_GPL(vhost_poll_queue);
>  
> @@ -510,7 +512,7 @@ void vhost_dev_init(struct vhost_dev *dev,
>  		vhost_vq_reset(dev, vq);
>  		if (vq->handle_kick)
>  			vhost_poll_init(&vq->poll, vq->handle_kick,
> -					EPOLLIN, dev);
> +					EPOLLIN, dev, vq);
>  	}
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_init);
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index b64ee4ef387d..d9b8abbe3a26 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -41,13 +41,15 @@ struct vhost_poll {
>  	struct vhost_work	work;
>  	__poll_t		mask;
>  	struct vhost_dev	*dev;
> +	struct vhost_virtqueue	*vq;
>  };
>  
>  void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
>  void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
>  
>  void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
> -		     __poll_t mask, struct vhost_dev *dev);
> +		     __poll_t mask, struct vhost_dev *dev,
> +		     struct vhost_virtqueue *vq);
>  int vhost_poll_start(struct vhost_poll *poll, struct file *file);
>  void vhost_poll_stop(struct vhost_poll *poll);
>  void vhost_poll_queue(struct vhost_poll *poll);
> -- 
> 2.25.1

On Mon, Mar 27, 2023 at 09:17:12PM -0500, Mike Christie wrote:
> Convert from vhost_work_queue to vhost_vq_work_queue.
> 
> Signed-off-by: Mike Christie <michael.christie@xxxxxxxxxx>
> ---
>  drivers/vhost/vsock.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> index c8e6087769a1..1dcbc8669f95 100644
> --- a/drivers/vhost/vsock.c
> +++ b/drivers/vhost/vsock.c
> @@ -285,7 +285,7 @@ vhost_transport_send_pkt(struct sk_buff *skb)
>  		atomic_inc(&vsock->queued_replies);
>  
>  	virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb);
> -	vhost_work_queue(&vsock->dev, &vsock->send_pkt_work);
> +	vhost_vq_work_queue(&vsock->vqs[VSOCK_VQ_RX], &vsock->send_pkt_work);
>  
>  	rcu_read_unlock();
>  	return len;
> @@ -582,7 +582,7 @@ static int vhost_vsock_start(struct vhost_vsock *vsock)
>  	/* Some packets may have been queued before the device was started,
>  	 * let's kick the send worker to send them.
>  	 */
> -	vhost_work_queue(&vsock->dev, &vsock->send_pkt_work);
> +	vhost_vq_work_queue(&vsock->vqs[VSOCK_VQ_RX], &vsock->send_pkt_work);
>  
>  	mutex_unlock(&vsock->dev.mutex);
>  	return 0;
> -- 
> 2.25.1

On Mon, Mar 27, 2023 at 09:17:13PM -0500, Mike Christie wrote:
> This patch separates the scsi cmd completion code paths so we can complete
> cmds based on their vq instead of having all cmds complete on the same
> worker/CPU. This will be useful with the next patches that allow us to
> create multiple worker threads and bind them to different vqs, so we can
> have completions running on different threads/CPUs.
> 
> Signed-off-by: Mike Christie <michael.christie@xxxxxxxxxx>
> Reviewed-by: Stefan Hajnoczi <stefanha@xxxxxxxxxx>
> ---
>  drivers/vhost/scsi.c | 56 ++++++++++++++++++++------------------------
>  1 file changed, 26 insertions(+), 30 deletions(-)
> 
> diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
> index 3b0b556c57ef..ecb5cd7450b8 100644
> --- a/drivers/vhost/scsi.c
> +++ b/drivers/vhost/scsi.c
> @@ -167,6 +167,7 @@ MODULE_PARM_DESC(max_io_vqs, "Set the max number of IO virtqueues a vhost scsi d
>  
>  struct vhost_scsi_virtqueue {
>  	struct vhost_virtqueue vq;
> +	struct vhost_scsi *vs;
>  	/*
>  	 * Reference counting for inflight reqs, used for flush operation. At
>  	 * each time, one reference tracks new commands submitted, while we
> @@ -181,6 +182,9 @@ struct vhost_scsi_virtqueue {
>  	struct vhost_scsi_cmd *scsi_cmds;
>  	struct sbitmap scsi_tags;
>  	int max_cmds;
> +
> +	struct vhost_work completion_work;
> +	struct llist_head completion_list;
>  };
>  
>  struct vhost_scsi {
> @@ -190,12 +194,8 @@ struct vhost_scsi {
>  
>  	struct vhost_dev dev;
>  	struct vhost_scsi_virtqueue *vqs;
> -	unsigned long *compl_bitmap;
>  	struct vhost_scsi_inflight **old_inflight;
>  
> -	struct vhost_work vs_completion_work; /* cmd completion work item */
> -	struct llist_head vs_completion_list; /* cmd completion queue */
> -
>  	struct vhost_work vs_event_work; /* evt injection work item */
>  	struct llist_head vs_event_list; /* evt injection queue */
>  
> @@ -368,10 +368,11 @@ static void vhost_scsi_release_cmd(struct se_cmd *se_cmd)
>  	} else {
>  		struct vhost_scsi_cmd *cmd = container_of(se_cmd,
>  					struct vhost_scsi_cmd, tvc_se_cmd);
> -		struct vhost_scsi *vs = cmd->tvc_vhost;
> +		struct vhost_scsi_virtqueue *svq = container_of(cmd->tvc_vq,
> +					struct vhost_scsi_virtqueue, vq);
>  
> -		llist_add(&cmd->tvc_completion_list, &vs->vs_completion_list);
> -		vhost_work_queue(&vs->dev, &vs->vs_completion_work);
> +		llist_add(&cmd->tvc_completion_list, &svq->completion_list);
> +		vhost_vq_work_queue(&svq->vq, &svq->completion_work);
>  	}
>  }
>  
> @@ -534,17 +535,17 @@ static void vhost_scsi_evt_work(struct vhost_work *work)
>   */
>  static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
>  {
> -	struct vhost_scsi *vs = container_of(work, struct vhost_scsi,
> -					vs_completion_work);
> +	struct vhost_scsi_virtqueue *svq = container_of(work,
> +				struct vhost_scsi_virtqueue, completion_work);
>  	struct virtio_scsi_cmd_resp v_rsp;
>  	struct vhost_scsi_cmd *cmd, *t;
>  	struct llist_node *llnode;
>  	struct se_cmd *se_cmd;
>  	struct iov_iter iov_iter;
> -	int ret, vq;
> +	bool signal = false;
> +	int ret;
>  
> -	bitmap_zero(vs->compl_bitmap, vs->dev.nvqs);
> -	llnode = llist_del_all(&vs->vs_completion_list);
> +	llnode = llist_del_all(&svq->completion_list);
>  	llist_for_each_entry_safe(cmd, t, llnode, tvc_completion_list) {
>  		se_cmd = &cmd->tvc_se_cmd;
>  
> @@ -564,21 +565,17 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
>  			      cmd->tvc_in_iovs, sizeof(v_rsp));
>  		ret = copy_to_iter(&v_rsp, sizeof(v_rsp), &iov_iter);
>  		if (likely(ret == sizeof(v_rsp))) {
> -			struct vhost_scsi_virtqueue *q;
> +			signal = true;
> +
>  			vhost_add_used(cmd->tvc_vq, cmd->tvc_vq_desc, 0);
> -			q = container_of(cmd->tvc_vq, struct vhost_scsi_virtqueue, vq);
> -			vq = q - vs->vqs;
> -			__set_bit(vq, vs->compl_bitmap);
>  		} else
>  			pr_err("Faulted on virtio_scsi_cmd_resp\n");
>  
>  		vhost_scsi_release_cmd_res(se_cmd);
>  	}
>  
> -	vq = -1;
> -	while ((vq = find_next_bit(vs->compl_bitmap, vs->dev.nvqs, vq + 1))
> -		< vs->dev.nvqs)
> -		vhost_signal(&vs->dev, &vs->vqs[vq].vq);
> +	if (signal)
> +		vhost_signal(&svq->vs->dev, &svq->vq);
>  }
>  
>  static struct vhost_scsi_cmd *
> @@ -1795,6 +1792,7 @@ static int vhost_scsi_set_features(struct vhost_scsi *vs, u64 features)
>  
>  static int vhost_scsi_open(struct inode *inode, struct file *f)
>  {
> +	struct vhost_scsi_virtqueue *svq;
>  	struct vhost_scsi *vs;
>  	struct vhost_virtqueue **vqs;
>  	int r = -ENOMEM, i, nvqs = vhost_scsi_max_io_vqs;
> @@ -1813,10 +1811,6 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
>  	}
>  	nvqs += VHOST_SCSI_VQ_IO;
>  
> -	vs->compl_bitmap = bitmap_alloc(nvqs, GFP_KERNEL);
> -	if (!vs->compl_bitmap)
> -		goto err_compl_bitmap;
> -
>  	vs->old_inflight = kmalloc_array(nvqs, sizeof(*vs->old_inflight),
>  					 GFP_KERNEL | __GFP_ZERO);
>  	if (!vs->old_inflight)
> @@ -1831,7 +1825,6 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
>  	if (!vqs)
>  		goto err_local_vqs;
>  
> -	vhost_work_init(&vs->vs_completion_work, vhost_scsi_complete_cmd_work);
>  	vhost_work_init(&vs->vs_event_work, vhost_scsi_evt_work);
>  
>  	vs->vs_events_nr = 0;
> @@ -1842,8 +1835,14 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
>  	vs->vqs[VHOST_SCSI_VQ_CTL].vq.handle_kick = vhost_scsi_ctl_handle_kick;
>  	vs->vqs[VHOST_SCSI_VQ_EVT].vq.handle_kick = vhost_scsi_evt_handle_kick;
>  	for (i = VHOST_SCSI_VQ_IO; i < nvqs; i++) {
> -		vqs[i] = &vs->vqs[i].vq;
> -		vs->vqs[i].vq.handle_kick = vhost_scsi_handle_kick;
> +		svq = &vs->vqs[i];
> +
> +		vqs[i] = &svq->vq;
> +		svq->vs = vs;
> +		init_llist_head(&svq->completion_list);
> +		vhost_work_init(&svq->completion_work,
> +				vhost_scsi_complete_cmd_work);
> +		svq->vq.handle_kick = vhost_scsi_handle_kick;
>  	}
>  	vhost_dev_init(&vs->dev, vqs, nvqs, UIO_MAXIOV,
>  		       VHOST_SCSI_WEIGHT, 0, true, NULL);
> @@ -1858,8 +1857,6 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
>  err_vqs:
>  	kfree(vs->old_inflight);
>  err_inflight:
> -	bitmap_free(vs->compl_bitmap);
> -err_compl_bitmap:
>  	kvfree(vs);
>  err_vs:
>  	return r;
> @@ -1879,7 +1876,6 @@ static int vhost_scsi_release(struct inode *inode, struct file *f)
>  	kfree(vs->dev.vqs);
>  	kfree(vs->vqs);
>  	kfree(vs->old_inflight);
> -	bitmap_free(vs->compl_bitmap);
>  	kvfree(vs);
>  	return 0;
>  }
> -- 
> 2.25.1

On Mon, Mar 27, 2023 at 09:17:14PM -0500, Mike Christie wrote:
> Convert from vhost_work_queue to vhost_vq_work_queue.
> 
> Signed-off-by: Mike Christie <michael.christie@xxxxxxxxxx>
> ---
>  drivers/vhost/scsi.c | 18 +++++++++---------
>  1 file changed, 9 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
> index ecb5cd7450b8..3e86b5fbeca6 100644
> --- a/drivers/vhost/scsi.c
> +++ b/drivers/vhost/scsi.c
> @@ -363,8 +363,9 @@ static void vhost_scsi_release_cmd(struct se_cmd *se_cmd)
>  	if (se_cmd->se_cmd_flags & SCF_SCSI_TMR_CDB) {
>  		struct vhost_scsi_tmf *tmf = container_of(se_cmd,
>  					struct vhost_scsi_tmf, se_cmd);
> +		struct vhost_virtqueue *vq = &tmf->svq->vq;
>  
> -		vhost_work_queue(&tmf->vhost->dev, &tmf->vwork);
> +		vhost_vq_work_queue(vq, &tmf->vwork);
>  	} else {
>  		struct vhost_scsi_cmd *cmd = container_of(se_cmd,
>  					struct vhost_scsi_cmd, tvc_se_cmd);
> @@ -1357,11 +1358,9 @@ static void vhost_scsi_ctl_handle_kick(struct vhost_work *work)
>  }
>  
>  static void
> -vhost_scsi_send_evt(struct vhost_scsi *vs,
> -		   struct vhost_scsi_tpg *tpg,
> -		   struct se_lun *lun,
> -		   u32 event,
> -		   u32 reason)
> +vhost_scsi_send_evt(struct vhost_scsi *vs, struct vhost_virtqueue *vq,
> +		    struct vhost_scsi_tpg *tpg, struct se_lun *lun,
> +		    u32 event, u32 reason)
>  {
>  	struct vhost_scsi_evt *evt;
>  
> @@ -1383,7 +1382,7 @@ vhost_scsi_send_evt(struct vhost_scsi *vs,
>  	}
>  
>  	llist_add(&evt->list, &vs->vs_event_list);
> -	vhost_work_queue(&vs->dev, &vs->vs_event_work);
> +	vhost_vq_work_queue(vq, &vs->vs_event_work);
>  }
>  
>  static void vhost_scsi_evt_handle_kick(struct vhost_work *work)
> @@ -1397,7 +1396,8 @@ static void vhost_scsi_evt_handle_kick(struct vhost_work *work)
>  		goto out;
>  
>  	if (vs->vs_events_missed)
> -		vhost_scsi_send_evt(vs, NULL, NULL, VIRTIO_SCSI_T_NO_EVENT, 0);
> +		vhost_scsi_send_evt(vs, vq, NULL, NULL, VIRTIO_SCSI_T_NO_EVENT,
> +				    0);
>  out:
>  	mutex_unlock(&vq->mutex);
>  }
> @@ -2016,7 +2016,7 @@ vhost_scsi_do_plug(struct vhost_scsi_tpg *tpg,
>  		goto unlock;
>  
>  	if (vhost_has_feature(vq, VIRTIO_SCSI_F_HOTPLUG))
> -		vhost_scsi_send_evt(vs, tpg, lun,
> +		vhost_scsi_send_evt(vs, vq, tpg, lun,
>  				   VIRTIO_SCSI_T_TRANSPORT_RESET, reason);
>  unlock:
>  	mutex_unlock(&vq->mutex);
> -- 
> 2.25.1

On Mon, Mar 27, 2023 at 09:17:15PM -0500, Mike Christie wrote:
> vhost_work_queue is no longer used. Each driver is using the poll or vq
> based queueing, so remove vhost_work_queue.
> 
> Signed-off-by: Mike Christie <michael.christie@xxxxxxxxxx>
> ---
>  drivers/vhost/vhost.c | 6 ------
>  drivers/vhost/vhost.h | 1 -
>  2 files changed, 7 deletions(-)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 6968f8fc17e8..f812daf25648 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -263,12 +263,6 @@ static void vhost_work_flush_on(struct vhost_worker *worker)
>  	wait_for_completion(&flush.wait_event);
>  }
>  
> -void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
> -{
> -	vhost_work_queue_on(dev->worker, work);
> -}
> -EXPORT_SYMBOL_GPL(vhost_work_queue);
> -
>  void vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work)
>  {
>  	vhost_work_queue_on(vq->worker, work);
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index d9b8abbe3a26..ef55fae2517c 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -45,7 +45,6 @@ struct vhost_poll {
>  };
>  
>  void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
> -void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
>  
>  void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
>  		     __poll_t mask, struct vhost_dev *dev,
> -- 
> 2.25.1

On Mon, Mar 27, 2023 at 09:17:16PM -0500, Mike Christie wrote:
> With one worker we will always send the scsi cmd responses then send the
> TMF rsp, because LIO will always complete the scsi cmds first then call
> into us to send the TMF response.
> 
> With multiple workers, the IO vq workers could be running while the
> TMF/ctl vq worker is, so this patch has us do a flush before completing
> the TMF to make sure the cmds have completed by the time its work is
> queued and run.
> 
> Signed-off-by: Mike Christie <michael.christie@xxxxxxxxxx>
> ---
>  drivers/vhost/scsi.c  | 22 +++++++++++++++++++---
>  drivers/vhost/vhost.c |  6 ++++++
>  drivers/vhost/vhost.h |  1 +
>  3 files changed, 26 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
> index 3e86b5fbeca6..48dba4fe2dac 100644
> --- a/drivers/vhost/scsi.c
> +++ b/drivers/vhost/scsi.c
> @@ -1158,12 +1158,28 @@ static void vhost_scsi_tmf_resp_work(struct vhost_work *work)
>  {
>  	struct vhost_scsi_tmf *tmf = container_of(work, struct vhost_scsi_tmf,
>  						  vwork);
> -	int resp_code;
> +	struct vhost_virtqueue *ctl_vq, *vq;
> +	int resp_code, i;
> +
> +	if (tmf->scsi_resp == TMR_FUNCTION_COMPLETE) {
> +		/*
> +		 * Flush IO vqs that don't share a worker with the ctl to make
> +		 * sure they have sent their responses before us.
> +		 */
> +		ctl_vq = &tmf->vhost->vqs[VHOST_SCSI_VQ_CTL].vq;
> +		for (i = VHOST_SCSI_VQ_IO; i < tmf->vhost->dev.nvqs; i++) {
> +			vq = &tmf->vhost->vqs[i].vq;
> +
> +			if (vhost_vq_is_setup(vq) &&
> +			    vq->worker != ctl_vq->worker) {
> +				vhost_vq_flush(vq);
> +			}
> +		}
>  
> -	if (tmf->scsi_resp == TMR_FUNCTION_COMPLETE)
>  		resp_code = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED;
> -	else
> +	} else {
>  		resp_code = VIRTIO_SCSI_S_FUNCTION_REJECTED;
> +	}
>  
>  	vhost_scsi_send_tmf_resp(tmf->vhost, &tmf->svq->vq, tmf->in_iovs,
>  				 tmf->vq_desc, &tmf->resp_iov, resp_code);
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index f812daf25648..1fa5e9a49092 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -275,6 +275,12 @@ void vhost_dev_flush(struct vhost_dev *dev)
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_flush);
>  
> +void vhost_vq_flush(struct vhost_virtqueue *vq)
> +{
> +	vhost_work_flush_on(vq->worker);
> +}
> +EXPORT_SYMBOL_GPL(vhost_vq_flush);
> +
>  /* A lockless hint for busy polling code to exit the loop */
>  bool vhost_vq_has_work(struct vhost_virtqueue *vq)
>  {
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index ef55fae2517c..395707c680e5 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -53,6 +53,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct file *file);
>  void vhost_poll_stop(struct vhost_poll *poll);
>  void vhost_poll_queue(struct vhost_poll *poll);
>  void vhost_dev_flush(struct vhost_dev *dev);
> +void vhost_vq_flush(struct vhost_virtqueue *vq);
>  
>  struct vhost_log {
>  	u64 addr;
> -- 
> 2.25.1

On Mon, Mar 27, 2023 at 09:17:17PM -0500, Mike Christie wrote:
> For vhost-scsi with 3 vqs and a workload that tries to use those vqs
> like:
> 
> fio --filename=/dev/sdb  --direct=1 --rw=randrw --bs=4k \
> --ioengine=libaio --iodepth=128  --numjobs=3
> 
> the single vhost worker thread will become a bottleneck and we are stuck
> at around 500K IOPs no matter how many jobs, virtqueues, and CPUs are
> used.
> 
> To better utilize virtqueues and available CPUs, this patch allows
> userspace to create workers and bind them to vqs. You can have N workers
> per dev, and N workers can be shared among M vqs.
> 
> With the patches and doing a worker per vq, we can scale to at least
> 16 vCPUs/vqs (that's my system limit) with the same fio command as above
> with numjobs=16:
> 
> fio --filename=/dev/sdb  --direct=1 --rw=randrw --bs=4k \
> --ioengine=libaio --iodepth=64  --numjobs=16
> 
> which gives around 2326K IOPs.
> 
> Note that for testing I dropped depth to 64 above because the vhost/virt
> layer supports only 1024 total commands per device. And the only tuning I
> did was set LIO's emulate_pr to 0 to avoid LIO's PR lock in the main IO
> path, which becomes an issue at around 12 jobs/virtqueues.
> 
> Signed-off-by: Mike Christie <michael.christie@xxxxxxxxxx>
> ---
>  drivers/vhost/vhost.c            | 177 ++++++++++++++++++++++++++++---
>  drivers/vhost/vhost.h            |   4 +-
>  include/uapi/linux/vhost.h       |  22 ++++
>  include/uapi/linux/vhost_types.h |  15 +++
>  4 files changed, 204 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 1fa5e9a49092..e40699e83c6d 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -271,7 +271,11 @@ EXPORT_SYMBOL_GPL(vhost_vq_work_queue);
>  
>  void vhost_dev_flush(struct vhost_dev *dev)
>  {
> -	vhost_work_flush_on(dev->worker);
> +	struct vhost_worker *worker;
> +	unsigned long i;
> +
> +	xa_for_each(&dev->worker_xa, i, worker)
> +		vhost_work_flush_on(worker);
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_flush);
>  
> @@ -489,7 +493,6 @@ void vhost_dev_init(struct vhost_dev *dev,
>  	dev->umem = NULL;
>  	dev->iotlb = NULL;
>  	dev->mm = NULL;
> -	dev->worker = NULL;
>  	dev->iov_limit = iov_limit;
>  	dev->weight = weight;
>  	dev->byte_weight = byte_weight;
> @@ -499,7 +502,7 @@ void vhost_dev_init(struct vhost_dev *dev,
>  	INIT_LIST_HEAD(&dev->read_list);
>  	INIT_LIST_HEAD(&dev->pending_list);
>  	spin_lock_init(&dev->iotlb_lock);
> -
> +	xa_init_flags(&dev->worker_xa, XA_FLAGS_ALLOC);
>  
>  	for (i = 0; i < dev->nvqs; ++i) {
>  		vq = dev->vqs[i];
> @@ -562,32 +565,67 @@ static void vhost_detach_mm(struct vhost_dev *dev)
>  	dev->mm = NULL;
>  }
>  
> -static void vhost_worker_free(struct vhost_dev *dev)
> +static void vhost_worker_put(struct vhost_dev *dev, struct vhost_worker *worker)
>  {
> -	struct vhost_worker *worker = dev->worker;
> -
>  	if (!worker)
>  		return;
>  
> -	dev->worker = NULL;
> +	if (!refcount_dec_and_test(&worker->refcount))
> +		return;
> +
>  	WARN_ON(!llist_empty(&worker->work_list));
>  	vhost_task_stop(worker->vtsk);
>  	kfree(worker);
>  }
>  
> +static void vhost_vq_detach_worker(struct vhost_virtqueue *vq)
> +{
> +	if (vq->worker)
> +		vhost_worker_put(vq->dev, vq->worker);
> +	vq->worker = NULL;
> +}
> +
> +static void vhost_workers_free(struct vhost_dev *dev)
> +{
> +	struct vhost_worker *worker;
> +	unsigned long i;
> +
> +	if (!dev->use_worker)
> +		return;
> +
> +	for (i = 0; i < dev->nvqs; i++)
> +		vhost_vq_detach_worker(dev->vqs[i]);
> +	/*
> +	 * Drop the refcount taken during allocation, and handle the default
> +	 * worker and the cases where userspace might have crashed or was lazy
> +	 * and did a VHOST_NEW_WORKER but not a VHOST_FREE_WORKER.
> +	 */
> +	xa_for_each(&dev->worker_xa, i, worker) {
> +		xa_erase(&dev->worker_xa, worker->id);
> +		vhost_worker_put(dev, worker);
> +	}
> +	xa_destroy(&dev->worker_xa);
> +}
> +
>  static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
>  {
>  	struct vhost_worker *worker;
>  	struct vhost_task *vtsk;
>  	char name[TASK_COMM_LEN];
> +	int ret;
> +	u32 id;
>  
>  	worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
>  	if (!worker)
>  		return NULL;
>  
> -	dev->worker = worker;
>  	worker->kcov_handle = kcov_common_handle();
>  	init_llist_head(&worker->work_list);
> +	/*
> +	 * We increase the refcount for the initial creation and then
> +	 * later each time it's attached to a virtqueue.
> +	 */
> +	refcount_set(&worker->refcount, 1);
>  	snprintf(name, sizeof(name), "vhost-%d", current->pid);
>  
>  	vtsk = vhost_task_create(vhost_worker, worker, name);
> @@ -596,14 +634,104 @@ static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
>  
>  	worker->vtsk = vtsk;
>  	vhost_task_start(vtsk);
> +
> +	ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
> +	if (ret < 0)
> +		goto stop_worker;
> +	worker->id = id;
> +
>  	return worker;
>  
> +stop_worker:
> +	vhost_task_stop(vtsk);
>  free_worker:
>  	kfree(worker);
> -	dev->worker = NULL;
>  	return NULL;
>  }
>  
> +/* Caller must have device and virtqueue mutex */
> +static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq,
> +				     struct vhost_worker *worker)
> +{
> +	refcount_inc(&worker->refcount);
> +	vhost_vq_detach_worker(vq);
> +	vq->worker = worker;
> +}
> +
> +/* Caller must have device and virtqueue mutex */
> +static int vhost_vq_attach_worker(struct vhost_virtqueue *vq,
> +				  struct vhost_vring_worker *info)
> +{
> +	unsigned long index = info->worker_id;
> +	struct vhost_dev *dev = vq->dev;
> +	struct vhost_worker *worker;
> +
> +	if (!dev->use_worker)
> +		return -EINVAL;
> +
> +	/*
> +	 * We don't support setting a worker on an active vq to make flushing
> +	 * and removal simple.
> +	 */
> +	if (vhost_vq_get_backend(vq))
> +		return -EBUSY;
> +
> +	worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT);
> +	if (!worker || worker->id != info->worker_id)
> +		return -ENODEV;
> +
> +	__vhost_vq_attach_worker(vq, worker);
> +	return 0;
> +}
> +
> +/* Caller must have device mutex */
> +static int vhost_new_worker(struct vhost_dev *dev,
> +			    struct vhost_worker_state *info)
> +{
> +	struct vhost_worker *worker;
> +
> +	if (!dev->use_worker)
> +		return -EINVAL;
> +
> +	worker = vhost_worker_create(dev);
> +	if (!worker)
> +		return -ENOMEM;
> +
> +	info->worker_id = worker->id;
> +	return 0;
> +}
> +
> +/* Caller must have device mutex */
> +static int vhost_free_worker(struct vhost_dev *dev,
> +			     struct vhost_worker_state *info)
> +{
> +	unsigned long index = info->worker_id;
> +	struct vhost_worker *worker;
> +
> +	if (!dev->use_worker)
> +		return -EINVAL;
> +
> +	worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT);
> +	if (!worker || worker->id != info->worker_id)
> +		return -ENODEV;
> +
> +	/*
> +	 * We can free the worker if it's not attached to any virtqueues.
> +	 */
> +	if (refcount_read(&worker->refcount) != 1)
> +		return -EBUSY;
> +
> +	xa_erase(&dev->worker_xa, worker->id);
> +	/*
> +	 * Make sure if there was a flush that saw the worker in the XA that
> +	 * it has completed.
> +	 */
> +	vhost_work_flush_on(worker);
> +
> +	vhost_worker_put(dev, worker);
> +	return 0;
> +}
> +
>  /* Caller should have device mutex */
>  long vhost_dev_set_owner(struct vhost_dev *dev)
>  {
> @@ -624,7 +752,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
>  			goto err_worker;
>  
>  		for (i = 0; i < dev->nvqs; i++)
> -			dev->vqs[i]->worker = worker;
> +			__vhost_vq_attach_worker(dev->vqs[i], worker);
>  	}
>  
>  	err = vhost_dev_alloc_iovecs(dev);
> @@ -633,7 +761,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
>  
>  	return 0;
>  err_iovecs:
> -	vhost_worker_free(dev);
> +	vhost_workers_free(dev);
>  err_worker:
>  	vhost_detach_mm(dev);
>  err_mm:
> @@ -726,7 +854,7 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
>  	dev->iotlb = NULL;
>  	vhost_clear_msg(dev);
>  	wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
> -	vhost_worker_free(dev);
> +	vhost_workers_free(dev);
>  	vhost_detach_mm(dev);
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
> @@ -1616,6 +1744,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
>  	struct eventfd_ctx *ctx = NULL;
>  	u32 __user *idxp = argp;
>  	struct vhost_virtqueue *vq;
> +	struct vhost_vring_worker w;
>  	struct vhost_vring_state s;
>  	struct vhost_vring_file f;
>  	u32 idx;
> @@ -1723,7 +1852,16 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
>  		if (copy_to_user(argp, &s, sizeof(s)))
>  			r = -EFAULT;
>  		break;
> -	default:
> +	case VHOST_ATTACH_VRING_WORKER:
> +		if (copy_from_user(&w, argp, sizeof(w))) {
> +			r = -EFAULT;
> +			break;
> +		}
> +		r = vhost_vq_attach_worker(vq, &w);
> +		if (!r && copy_to_user(argp, &w, sizeof(w)))
> +			r = -EFAULT;
> +		break;
> +	default:
>  		r = -ENOIOCTLCMD;
>  	}
>  
> @@ -1776,6 +1914,7 @@ EXPORT_SYMBOL_GPL(vhost_init_device_iotlb);
>  /* Caller must have device mutex */
>  long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
>  {
> +	struct vhost_worker_state w;
>  	struct eventfd_ctx *ctx;
>  	u64 p;
>  	long r;
> @@ -1836,6 +1975,18 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
>  		if (ctx)
>  			eventfd_ctx_put(ctx);
>  		break;
> +	case VHOST_NEW_WORKER:
> +		r = vhost_new_worker(d, &w);
> +		if (!r && copy_to_user(argp, &w, sizeof(w)))
> +			r = -EFAULT;
> +		break;
> +	case VHOST_FREE_WORKER:
> +		if (copy_from_user(&w, argp, sizeof(w))) {
> +			r = -EFAULT;
> +			break;
> +		}
> +		r = vhost_free_worker(d, &w);
> +		break;
>  	default:
>  		r = -ENOIOCTLCMD;
>  		break;
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 395707c680e5..a67ae8293c38 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -30,6 +30,8 @@ struct vhost_worker {
>  	struct vhost_task	*vtsk;
>  	struct llist_head	work_list;
>  	u64			kcov_handle;
> +	refcount_t		refcount;
> +	u32			id;
>  };
>  
>  /* Poll a file (eventfd or socket) */
> @@ -156,7 +158,6 @@ struct vhost_dev {
>  	struct vhost_virtqueue **vqs;
>  	int nvqs;
>  	struct eventfd_ctx *log_ctx;
> -	struct vhost_worker *worker;
>  	struct vhost_iotlb *umem;
>  	struct vhost_iotlb *iotlb;
>  	spinlock_t iotlb_lock;
> @@ -166,6 +167,7 @@ struct vhost_dev {
>  	int iov_limit;
>  	int weight;
>  	int byte_weight;
> +	struct xarray worker_xa;
>  	bool use_worker;
>  	int (*msg_handler)(struct vhost_dev *dev, u32 asid,
>  			   struct vhost_iotlb_msg *msg);
> diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
> index 92e1b700b51c..7329e7f349dd 100644
> --- a/include/uapi/linux/vhost.h
> +++ b/include/uapi/linux/vhost.h
> @@ -45,6 +45,23 @@
>  #define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64)
>  /* Specify an eventfd file descriptor to signal on log write. */
>  #define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int)
> +/* By default, a device gets one vhost_worker that its virtqueues share. This
> + * command allows the owner of the device to create an additional vhost_worker
> + * for the device. It can later be bound to 1 or more of its virtqueues using
> + * the VHOST_ATTACH_VRING_WORKER command.
> + *
> + * This must be called after VHOST_SET_OWNER and the caller must be the owner
> + * of the device. The new thread will inherit the caller's cgroups and
> + * namespaces, and will share the caller's memory space. The new thread
> + * will also be counted against the caller's RLIMIT_NPROC value.
> + */
> +#define VHOST_NEW_WORKER _IOW(VHOST_VIRTIO, 0x8, struct vhost_worker_state)
> +/* Free a worker created with VHOST_NEW_WORKER if it's not attached to any
> + * virtqueue. If userspace is not able to call this for workers it has created,
> + * the kernel will free all the device's workers when the device is closed and
> + * the last reference to the device has been released.
> + */
> +#define VHOST_FREE_WORKER _IOR(VHOST_VIRTIO, 0x9, struct vhost_worker_state)
>  
>  /* Ring setup. */
>  /* Set number of descriptors in ring. This parameter can not
> @@ -70,6 +87,11 @@
>  #define VHOST_VRING_BIG_ENDIAN 1
>  #define VHOST_SET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_state)
>  #define VHOST_GET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state)
> +/* Attach a vhost_worker created with VHOST_NEW_WORKER to one of the device's
> + * virtqueues. This must be done before the virtqueue is active.
> + */
> +#define VHOST_ATTACH_VRING_WORKER _IOR(VHOST_VIRTIO, 0x15,		\
> +				       struct vhost_vring_worker)
>  
>  /* The following ioctls use eventfd file descriptors to signal and poll
>   * for events. */
> diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h
> index c5690a8992d8..ad0fe2e721be 100644
> --- a/include/uapi/linux/vhost_types.h
> +++ b/include/uapi/linux/vhost_types.h
> @@ -47,6 +47,21 @@ struct vhost_vring_addr {
>  	__u64 log_guest_addr;
>  };
>  
> +struct vhost_worker_state {
> +	/*
> +	 * For VHOST_NEW_WORKER the kernel will return the new vhost_worker id.
> +	 * For VHOST_FREE_WORKER this must be set to the id of the vhost_worker
> +	 * to free.
> +	 */
> +	int worker_id;
> +};
> +
> +struct vhost_vring_worker {
> +	unsigned int index;
> +	/* The id of the vhost_worker returned from VHOST_NEW_WORKER */
> +	int worker_id;
> +};
> +
>  /* no alignment requirement */
>  struct vhost_iotlb_msg {
>  	__u64 iova;
> -- 
> 2.25.1
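
To make the new uapi concrete, here is a rough sketch of how userspace could
drive it, written against the structs above and untested (vhost_fd is assumed
to be a vhost device fd on which VHOST_SET_OWNER has already been done):

	#include <errno.h>
	#include <sys/ioctl.h>
	#include <linux/vhost.h>

	/* Create a new worker and attach it to the virtqueue at @index.
	 * Per the patch, this must be done before the vq has a backend. */
	static int attach_new_worker(int vhost_fd, unsigned int index)
	{
		struct vhost_worker_state state = {};
		struct vhost_vring_worker w = {};

		if (ioctl(vhost_fd, VHOST_NEW_WORKER, &state) < 0)
			return -errno;

		w.index = index;
		w.worker_id = state.worker_id;
		if (ioctl(vhost_fd, VHOST_ATTACH_VRING_WORKER, &w) < 0) {
			/* not attached to any vq yet, so it can be freed */
			ioctl(vhost_fd, VHOST_FREE_WORKER, &state);
			return -errno;
		}
		return 0;
	}

Calling this once per IO vq gives the worker-per-vq setup used for the fio
numbers above; workers still attached at teardown are freed by the kernel
when the device is released.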


_______________________________________________
Virtualization mailing list
Virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


