On Tue, Mar 28, 2023 at 10:17 AM Mike Christie <michael.christie@xxxxxxxxxx> wrote:
>
> For vhost-scsi with 3 vqs and a workload that tries to use those vqs
> like:
>
> fio --filename=/dev/sdb --direct=1 --rw=randrw --bs=4k \
> --ioengine=libaio --iodepth=128 --numjobs=3
>
> the single vhost worker thread will become a bottleneck and we are stuck
> at around 500K IOPs no matter how many jobs, virtqueues, and CPUs are
> used.
>
> To better utilize virtqueues and available CPUs, this patch allows
> userspace to create workers and bind them to vqs. You can have N workers
> per dev and also share N workers among M vqs.
>
> With the patches and doing a worker per vq, we can scale to at least
> 16 vCPUs/vqs (that's my system limit) with the same fio command as
> above but with numjobs=16:
>
> fio --filename=/dev/sdb --direct=1 --rw=randrw --bs=4k \
> --ioengine=libaio --iodepth=64 --numjobs=16
>
> which gives around 2326K IOPs.
>
> Note that for testing I dropped the iodepth to 64 above because the
> vhost/virt layer supports only 1024 total commands per device. And the
> only tuning I did was set LIO's emulate_pr to 0 to avoid LIO's PR lock
> in the main IO path, which becomes an issue at around 12
> jobs/virtqueues.
>
> Signed-off-by: Mike Christie <michael.christie@xxxxxxxxxx>
> ---
>  drivers/vhost/vhost.c            | 177 ++++++++++++++++++++++++++++---
>  drivers/vhost/vhost.h            |   4 +-
>  include/uapi/linux/vhost.h       |  22 ++++
>  include/uapi/linux/vhost_types.h |  15 +++
>  4 files changed, 204 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 1fa5e9a49092..e40699e83c6d 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -271,7 +271,11 @@ EXPORT_SYMBOL_GPL(vhost_vq_work_queue);
>
>  void vhost_dev_flush(struct vhost_dev *dev)
>  {
> -        vhost_work_flush_on(dev->worker);
> +        struct vhost_worker *worker;
> +        unsigned long i;
> +
> +        xa_for_each(&dev->worker_xa, i, worker)
> +                vhost_work_flush_on(worker);
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_flush);
>
> @@ -489,7 +493,6 @@ void vhost_dev_init(struct vhost_dev *dev,
>          dev->umem = NULL;
>          dev->iotlb = NULL;
>          dev->mm = NULL;
> -        dev->worker = NULL;
>          dev->iov_limit = iov_limit;
>          dev->weight = weight;
>          dev->byte_weight = byte_weight;
> @@ -499,7 +502,7 @@ void vhost_dev_init(struct vhost_dev *dev,
>          INIT_LIST_HEAD(&dev->read_list);
>          INIT_LIST_HEAD(&dev->pending_list);
>          spin_lock_init(&dev->iotlb_lock);
> -
> +        xa_init_flags(&dev->worker_xa, XA_FLAGS_ALLOC);
>
>          for (i = 0; i < dev->nvqs; ++i) {
>                  vq = dev->vqs[i];
> @@ -562,32 +565,67 @@ static void vhost_detach_mm(struct vhost_dev *dev)
>          dev->mm = NULL;
>  }
>
> -static void vhost_worker_free(struct vhost_dev *dev)
> +static void vhost_worker_put(struct vhost_dev *dev, struct vhost_worker *worker)
>  {
> -        struct vhost_worker *worker = dev->worker;
> -
>          if (!worker)
>                  return;
>
> -        dev->worker = NULL;
> +        if (!refcount_dec_and_test(&worker->refcount))
> +                return;
> +
>          WARN_ON(!llist_empty(&worker->work_list));
>          vhost_task_stop(worker->vtsk);
>          kfree(worker);
>  }
>
> +static void vhost_vq_detach_worker(struct vhost_virtqueue *vq)
> +{
> +        if (vq->worker)

What happens to the pending work that was queued for the old worker?
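E.g. does the detach path need to drain the old worker first? A rough
sketch of what I mean, just to illustrate (untested, reusing
vhost_work_flush_on() and vhost_worker_put() from this patch):

static void vhost_vq_detach_worker(struct vhost_virtqueue *vq)
{
        if (!vq->worker)
                return;

        /* Drain work still queued on the old worker before dropping the ref. */
        vhost_work_flush_on(vq->worker);
        vhost_worker_put(vq->dev, vq->worker);
        vq->worker = NULL;
}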
> +                vhost_worker_put(vq->dev, vq->worker);
> +        vq->worker = NULL;
> +}
> +
> +static void vhost_workers_free(struct vhost_dev *dev)
> +{
> +        struct vhost_worker *worker;
> +        unsigned long i;
> +
> +        if (!dev->use_worker)
> +                return;
> +
> +        for (i = 0; i < dev->nvqs; i++)
> +                vhost_vq_detach_worker(dev->vqs[i]);
> +        /*
> +         * Drop the refcount taken during allocation, and handle the default
> +         * worker and the cases where userspace might have crashed or was lazy
> +         * and did a VHOST_NEW_WORKER but not a VHOST_FREE_WORKER.
> +         */
> +        xa_for_each(&dev->worker_xa, i, worker) {
> +                xa_erase(&dev->worker_xa, worker->id);
> +                vhost_worker_put(dev, worker);
> +        }
> +        xa_destroy(&dev->worker_xa);
> +}
> +
>  static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
>  {
>          struct vhost_worker *worker;
>          struct vhost_task *vtsk;
>          char name[TASK_COMM_LEN];
> +        int ret;
> +        u32 id;
>
>          worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
>          if (!worker)
>                  return NULL;
>
> -        dev->worker = worker;
>          worker->kcov_handle = kcov_common_handle();
>          init_llist_head(&worker->work_list);
> +        /*
> +         * We increase the refcount for the initial creation and then
> +         * later each time it's attached to a virtqueue.
> +         */
> +        refcount_set(&worker->refcount, 1);
>          snprintf(name, sizeof(name), "vhost-%d", current->pid);
>
>          vtsk = vhost_task_create(vhost_worker, worker, name);
> @@ -596,14 +634,104 @@ static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
>
>          worker->vtsk = vtsk;
>          vhost_task_start(vtsk);
> +
> +        ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
> +        if (ret < 0)
> +                goto stop_worker;
> +        worker->id = id;
> +
>          return worker;
>
> +stop_worker:
> +        vhost_task_stop(vtsk);
>  free_worker:
>          kfree(worker);
> -        dev->worker = NULL;
>          return NULL;
>  }
>
> +/* Caller must have device and virtqueue mutex */
> +static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq,
> +                                     struct vhost_worker *worker)
> +{
> +        refcount_inc(&worker->refcount);
> +        vhost_vq_detach_worker(vq);
> +        vq->worker = worker;

What happens if there's a pending flush in a specific device (e.g. in
vhost_scsi_tmf_resp_work())? Does this mean we need to hold the vq mutex
when doing the flush? (That does not seem to be the case in
vhost_scsi_tmf_resp_work().)

> +}
> +
> +/* Caller must have device and virtqueue mutex */
> +static int vhost_vq_attach_worker(struct vhost_virtqueue *vq,
> +                                  struct vhost_vring_worker *info)
> +{
> +        unsigned long index = info->worker_id;
> +        struct vhost_dev *dev = vq->dev;
> +        struct vhost_worker *worker;
> +
> +        if (!dev->use_worker)
> +                return -EINVAL;
> +
> +        /*
> +         * We don't support setting a worker on an active vq to make flushing
> +         * and removal simple.
> +         */
> +        if (vhost_vq_get_backend(vq))
> +                return -EBUSY;

This assumes the worker won't be started until the backend is set, which
is not true.
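And since checking the backend does not guarantee the vq is quiesced, a
window like the following looks possible (hypothetical interleaving,
using the helpers from this patch):

        CPU0 (flush path)                CPU1 (VHOST_ATTACH_VRING_WORKER)
        vhost_vq_work_queue(vq, work)
          /* work lands on the old
             worker's work_list */
                                         __vhost_vq_attach_worker()
                                           vq->worker = new worker
        vhost_work_flush_on(vq->worker)
          /* flushes the new worker; the
             work pending on the old
             worker is never waited on */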
> +
> +        worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT);
> +        if (!worker || worker->id != info->worker_id)
> +                return -ENODEV;
> +
> +        __vhost_vq_attach_worker(vq, worker);
> +        return 0;
> +}
> +
> +/* Caller must have device mutex */
> +static int vhost_new_worker(struct vhost_dev *dev,
> +                            struct vhost_worker_state *info)
> +{
> +        struct vhost_worker *worker;
> +
> +        if (!dev->use_worker)
> +                return -EINVAL;
> +
> +        worker = vhost_worker_create(dev);
> +        if (!worker)
> +                return -ENOMEM;
> +
> +        info->worker_id = worker->id;
> +        return 0;
> +}
> +
> +/* Caller must have device mutex */
> +static int vhost_free_worker(struct vhost_dev *dev,
> +                             struct vhost_worker_state *info)
> +{
> +        unsigned long index = info->worker_id;
> +        struct vhost_worker *worker;
> +
> +        if (!dev->use_worker)
> +                return -EINVAL;
> +
> +        worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT);

So we use a signed int for worker_id, which conflicts with the UINT_MAX
limit here?

struct vhost_worker_state {
        /*
         * For VHOST_NEW_WORKER the kernel will return the new vhost_worker id.
         * For VHOST_FREE_WORKER this must be set to the id of the vhost_worker
         * to free.
         */
        int worker_id;
};

A side effect of using the xa_index directly is that userspace can guess
the xa_index of the default worker and free it here. Is this intended?
Should we hide the default worker from the xarray?

> +        if (!worker || worker->id != info->worker_id)
> +                return -ENODEV;
> +
> +        /*
> +         * We can free the worker if it's not attached to any virtqueues.
> +         */
> +        if (refcount_read(&worker->refcount) != 1)
> +                return -EBUSY;
> +
> +        xa_erase(&dev->worker_xa, worker->id);
> +        /*
> +         * Make sure if there was a flush that saw the worker in the XA that
> +         * it has completed.
> +         */
> +        vhost_work_flush_on(worker);
> +
> +        vhost_worker_put(dev, worker);
> +        return 0;
> +}
> +
>  /* Caller should have device mutex */
>  long vhost_dev_set_owner(struct vhost_dev *dev)
>  {
> @@ -624,7 +752,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
>                          goto err_worker;
>
>                  for (i = 0; i < dev->nvqs; i++)
> -                        dev->vqs[i]->worker = worker;
> +                        __vhost_vq_attach_worker(dev->vqs[i], worker);
>          }
>
>          err = vhost_dev_alloc_iovecs(dev);
> @@ -633,7 +761,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
>
>          return 0;
>  err_iovecs:
> -        vhost_worker_free(dev);
> +        vhost_workers_free(dev);
>  err_worker:
>          vhost_detach_mm(dev);
>  err_mm:
> @@ -726,7 +854,7 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
>          dev->iotlb = NULL;
>          vhost_clear_msg(dev);
>          wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
> -        vhost_worker_free(dev);
> +        vhost_workers_free(dev);
>          vhost_detach_mm(dev);
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
> @@ -1616,6 +1744,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
>          struct eventfd_ctx *ctx = NULL;
>          u32 __user *idxp = argp;
>          struct vhost_virtqueue *vq;
> +        struct vhost_vring_worker w;
>          struct vhost_vring_state s;
>          struct vhost_vring_file f;
>          u32 idx;
> @@ -1723,7 +1852,16 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
>                  if (copy_to_user(argp, &s, sizeof(s)))
>                          r = -EFAULT;
>                  break;
> -        default:
> +        case VHOST_ATTACH_VRING_WORKER:
> +                if (copy_from_user(&w, argp, sizeof(w))) {
> +                        r = -EFAULT;
> +                        break;
> +                }
> +                r = vhost_vq_attach_worker(vq, &w);
> +                if (!r && copy_to_user(argp, &w, sizeof(w)))
> +                        r = -EFAULT;
> +                break;

It's a little odd that we have "new" and "attach" but only a "free", with
no "detach" counterpart.
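For reference, my understanding of the intended userspace flow is roughly
the following (sketch only, error handling omitted; dev_fd is a
hypothetical already-open vhost device fd):

        struct vhost_worker_state state = { 0 };
        struct vhost_vring_worker w = { 0 };

        ioctl(dev_fd, VHOST_SET_OWNER);          /* creates the default worker */
        ioctl(dev_fd, VHOST_NEW_WORKER, &state); /* kernel fills state.worker_id */

        w.index = 0;                             /* vq to bind, before it is active */
        w.worker_id = state.worker_id;
        ioctl(dev_fd, VHOST_ATTACH_VRING_WORKER, &w);

        /*
         * With no detach ioctl, this fails with -EBUSY while the worker is
         * still attached; otherwise workers are reaped on device release.
         */
        ioctl(dev_fd, VHOST_FREE_WORKER, &state);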
Thanks

> +        default:
>                  r = -ENOIOCTLCMD;
>          }
>
> @@ -1776,6 +1914,7 @@ EXPORT_SYMBOL_GPL(vhost_init_device_iotlb);
>  /* Caller must have device mutex */
>  long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
>  {
> +        struct vhost_worker_state w;
>          struct eventfd_ctx *ctx;
>          u64 p;
>          long r;
> @@ -1836,6 +1975,18 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
>                  if (ctx)
>                          eventfd_ctx_put(ctx);
>                  break;
> +        case VHOST_NEW_WORKER:
> +                r = vhost_new_worker(d, &w);
> +                if (!r && copy_to_user(argp, &w, sizeof(w)))
> +                        r = -EFAULT;
> +                break;
> +        case VHOST_FREE_WORKER:
> +                if (copy_from_user(&w, argp, sizeof(w))) {
> +                        r = -EFAULT;
> +                        break;
> +                }
> +                r = vhost_free_worker(d, &w);
> +                break;
>          default:
>                  r = -ENOIOCTLCMD;
>                  break;
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 395707c680e5..a67ae8293c38 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -30,6 +30,8 @@ struct vhost_worker {
>          struct vhost_task *vtsk;
>          struct llist_head work_list;
>          u64 kcov_handle;
> +        refcount_t refcount;
> +        u32 id;
>  };
>
>  /* Poll a file (eventfd or socket) */
> @@ -156,7 +158,6 @@ struct vhost_dev {
>          struct vhost_virtqueue **vqs;
>          int nvqs;
>          struct eventfd_ctx *log_ctx;
> -        struct vhost_worker *worker;
>          struct vhost_iotlb *umem;
>          struct vhost_iotlb *iotlb;
>          spinlock_t iotlb_lock;
> @@ -166,6 +167,7 @@ struct vhost_dev {
>          int iov_limit;
>          int weight;
>          int byte_weight;
> +        struct xarray worker_xa;
>          bool use_worker;
>          int (*msg_handler)(struct vhost_dev *dev, u32 asid,
>                             struct vhost_iotlb_msg *msg);
> diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
> index 92e1b700b51c..7329e7f349dd 100644
> --- a/include/uapi/linux/vhost.h
> +++ b/include/uapi/linux/vhost.h
> @@ -45,6 +45,23 @@
>  #define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64)
>  /* Specify an eventfd file descriptor to signal on log write. */
>  #define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int)
> +/* By default, a device gets one vhost_worker that its virtqueues share. This
> + * command allows the owner of the device to create an additional vhost_worker
> + * for the device. It can later be bound to 1 or more of its virtqueues using
> + * the VHOST_ATTACH_VRING_WORKER command.
> + *
> + * This must be called after VHOST_SET_OWNER and the caller must be the owner
> + * of the device. The new thread will inherit the caller's cgroups and
> + * namespaces, and will share the caller's memory space. The new thread will
> + * also be counted against the caller's RLIMIT_NPROC value.
> + */
> +#define VHOST_NEW_WORKER _IOR(VHOST_VIRTIO, 0x8, struct vhost_worker_state)
> +/* Free a worker created with VHOST_NEW_WORKER if it's not attached to any
> + * virtqueue. If userspace is not able to call this for workers it has
> + * created, the kernel will free all the device's workers when the device is
> + * closed and the last reference to the device has been released.
> + */
> +#define VHOST_FREE_WORKER _IOW(VHOST_VIRTIO, 0x9, struct vhost_worker_state)
>
>  /* Ring setup. */
>  /* Set number of descriptors in ring. This parameter can not
> @@ -70,6 +87,11 @@
>  #define VHOST_VRING_BIG_ENDIAN 1
>  #define VHOST_SET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_state)
>  #define VHOST_GET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state)
> +/* Attach a vhost_worker created with VHOST_NEW_WORKER to one of the device's
> + * virtqueues. This must be done before the virtqueue is active.
> + */
> +#define VHOST_ATTACH_VRING_WORKER _IOW(VHOST_VIRTIO, 0x15, \
> +                                       struct vhost_vring_worker)
>
>  /* The following ioctls use eventfd file descriptors to signal and poll
>   * for events. */
> diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h
> index c5690a8992d8..ad0fe2e721be 100644
> --- a/include/uapi/linux/vhost_types.h
> +++ b/include/uapi/linux/vhost_types.h
> @@ -47,6 +47,21 @@ struct vhost_vring_addr {
>          __u64 log_guest_addr;
>  };
>
> +struct vhost_worker_state {
> +        /*
> +         * For VHOST_NEW_WORKER the kernel will return the new vhost_worker id.
> +         * For VHOST_FREE_WORKER this must be set to the id of the vhost_worker
> +         * to free.
> +         */
> +        int worker_id;
> +};
> +
> +struct vhost_vring_worker {
> +        unsigned int index;
> +        /* The id of the vhost_worker returned from VHOST_NEW_WORKER */
> +        int worker_id;
> +};
> +
>  /* no alignment requirement */
>  struct vhost_iotlb_msg {
>          __u64 iova;
> --
> 2.25.1