Re: [PATCH 13/14] vhost: allow userspace to create workers

On Sat, Apr 29, 2023 at 12:32 AM Mike Christie
<michael.christie@xxxxxxxxxx> wrote:
>
> For vhost-scsi with 3 vqs or more and a workload that tries to use
> them in parallel like:
>
> fio --filename=/dev/sdb  --direct=1 --rw=randrw --bs=4k \
> --ioengine=libaio --iodepth=128  --numjobs=3
>
> the single vhost worker thread will become a bottleneck and we are stuck
> at around 500K IOPS no matter how many jobs, virtqueues, and CPUs are
> used.
>
> To better utilize virtqueues and available CPUs, this patch allows
> userspace to create workers and bind them to vqs. You can have N workers
> per dev and also share N workers with M vqs on that dev.
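
To make sure I understand the intended flow: userspace would do
something like the following, right? (A rough sketch; error handling
is omitted, VHOST_SET_OWNER and the usual vring setup are assumed to
have already been done, and vhost_fd is the open vhost device fd.)

	#include <sys/ioctl.h>
	#include <linux/vhost.h>

	static void setup_worker(int vhost_fd)
	{
		struct vhost_worker_state state = {};
		struct vhost_vring_worker ring_worker = {};

		/* Create an extra worker thread for this device. */
		ioctl(vhost_fd, VHOST_NEW_WORKER, &state);

		/* Bind virtqueue 2 to the new worker. */
		ring_worker.index = 2;
		ring_worker.worker_id = state.worker_id;
		ioctl(vhost_fd, VHOST_ATTACH_VRING_WORKER, &ring_worker);
	}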
>
> This patch adds the interface related code and the next patch will hook
> vhost-scsi into it. The patches do not try to hook net and vsock into
> the interface because:
>
> 1. multiple workers don't seem to help vsock. The problem is that with
> only 2 virtqueues we never fully use the existing worker when doing
> bidirectional tests. This seems to match vhost-scsi where we don't see
> the worker as a bottleneck until 3 virtqueues are used.
>
> 2. net already has a way to use multiple workers.
>
> Signed-off-by: Mike Christie <michael.christie@xxxxxxxxxx>
> ---
>  drivers/vhost/vhost.c            | 145 ++++++++++++++++++++++++++++++-
>  drivers/vhost/vhost.h            |   3 +
>  include/uapi/linux/vhost.h       |  33 +++++++
>  include/uapi/linux/vhost_types.h |  16 ++++
>  4 files changed, 196 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 4b0b82292379..e8f829f35814 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -630,6 +630,80 @@ static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
>         return NULL;
>  }
>
> +/* Caller must have device mutex */
> +static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq,
> +                                    struct vhost_worker *worker)
> +{
> +       if (vq->worker)
> +               vq->worker->attachment_cnt--;
> +       worker->attachment_cnt++;
> +       vq->worker = worker;
> +}
> +
> +/**
> + * vhost_vq_attach_worker - set a virtqueue's worker from an ioctl command
> + * @vq: the virtqueue we will set the worker for
> + * @info: the worker userspace has requested us to use
> + *
> + * We only allow userspace to set a virtqueue's worker if it's not active and
> + * polling is not enabled.

I wonder if we can enforce this in the code, e.g. by checking the vq
backend in vhost_vq_work_queue().
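
Something like the following, perhaps (just a rough sketch against
this series; vhost_worker_queue() stands in for whatever internal
helper actually queues work on vq->worker, and locking of the backend
pointer is hand-waved):

	bool vhost_vq_work_queue(struct vhost_virtqueue *vq,
				 struct vhost_work *work)
	{
		/* Refuse to queue work for a vq whose backend has not
		 * been set yet, so VHOST_ATTACH_VRING_WORKER cannot
		 * race with queued work.
		 */
		if (!vhost_vq_get_backend(vq))
			return false;

		return vhost_worker_queue(vq->worker, work);
	}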

> + * We also assume drivers supporting this will not be
> + * internally queueing works directly or via calls like vhost_dev_flush at
> + * this time.
> + *
> + * Caller must have device and virtqueue mutex.
> + */
> +static int vhost_vq_attach_worker(struct vhost_virtqueue *vq,
> +                                 struct vhost_vring_worker *info)
> +{
> +       unsigned long index = info->worker_id;
> +       struct vhost_dev *dev = vq->dev;
> +       struct vhost_worker *worker;
> +
> +       if (!dev->use_worker)
> +               return -EINVAL;
> +
> +       if (vhost_vq_get_backend(vq) || vq->kick)

It might be worthwhile to have a comment to explain why we need to
check vq->kick here.
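
Something like this, maybe (my guess at the rationale; please correct
me if I got it wrong):

	/*
	 * Once VHOST_SET_VRING_KICK has installed vq->kick, the kick
	 * handler may already be queueing work on the current worker,
	 * so it is too late to swap workers without flushing/racing.
	 */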

This also means the device should not queue work when the backend is NULL.

But I found this is probably not the case for vsock: it calls
vhost_poll_queue() in vhost_transport_cancel_pkt(), but
vhost_vsock_stop() doesn't wait before doing
vhost_vq_set_backend(vq, NULL);

Net seems to be fine since it waits for ubufs to be completed in
vhost_net_set_backend().

Can we make things easier by migrating the work_list? I also worry
that there are other corner cases, which makes me wonder how hard it
would be to just support these ioctls after the backend is set.
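
E.g. something like this in __vhost_vq_attach_worker() (an untested
sketch; note it naively moves everything on the old worker's list,
including work queued for other vqs sharing that worker, and it
ignores the ordering the worker normally restores with
llist_reverse_order()):

	if (vq->worker) {
		struct llist_node *node;

		vq->worker->attachment_cnt--;
		/* Hand queued work over to the new worker instead of
		 * requiring a flush before attaching.
		 */
		node = llist_del_all(&vq->worker->work_list);
		while (node) {
			struct llist_node *next = node->next;

			llist_add(node, &worker->work_list);
			node = next;
		}
	}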


> +               return -EBUSY;
> +
> +       worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT);
> +       if (!worker || worker->id != info->worker_id)
> +               return -ENODEV;
> +
> +       __vhost_vq_attach_worker(vq, worker);
> +       return 0;
> +}
> +
> +/* Caller must have device mutex */
> +static int vhost_new_worker(struct vhost_dev *dev,
> +                           struct vhost_worker_state *info)
> +{
> +       struct vhost_worker *worker;
> +
> +       worker = vhost_worker_create(dev);
> +       if (!worker)
> +               return -ENOMEM;
> +
> +       info->worker_id = worker->id;
> +       return 0;
> +}
> +
> +static int vhost_free_worker(struct vhost_dev *dev,
> +                            struct vhost_worker_state *info)
> +{
> +       unsigned long index = info->worker_id;
> +       struct vhost_worker *worker;
> +
> +       worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT);
> +       if (!worker || worker->id != info->worker_id)
> +               return -ENODEV;
> +
> +       if (worker->attachment_cnt)
> +               return -EBUSY;
> +
> +       vhost_worker_destroy(dev, worker);
> +       return 0;
> +}
> +
>  static int vhost_get_vq_from_user(struct vhost_dev *dev, void __user *argp,
>                                   struct vhost_virtqueue **vq, u32 *id)
>  {
> @@ -651,6 +725,75 @@ static int vhost_get_vq_from_user(struct vhost_dev *dev, void __user *argp,
>         return 0;
>  }
>
> +/* Caller must have device mutex */
> +long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl,
> +                       void __user *argp)
> +{
> +       struct vhost_vring_worker ring_worker;
> +       struct vhost_worker_state state;
> +       struct vhost_virtqueue *vq;
> +       long ret;
> +       u32 idx;
> +
> +       if (!dev->use_worker)
> +               return -EINVAL;
> +
> +       if (!vhost_dev_has_owner(dev))
> +               return -EINVAL;
> +
> +       switch (ioctl) {
> +       /* dev worker ioctls */
> +       case VHOST_NEW_WORKER:
> +               ret = vhost_new_worker(dev, &state);
> +               if (!ret && copy_to_user(argp, &state, sizeof(state)))
> +                       ret = -EFAULT;
> +               return ret;
> +       case VHOST_FREE_WORKER:
> +               if (copy_from_user(&state, argp, sizeof(state)))
> +                       return -EFAULT;
> +               return vhost_free_worker(dev, &state);
> +       /* vring worker ioctls */
> +       case VHOST_ATTACH_VRING_WORKER:
> +       case VHOST_GET_VRING_WORKER:
> +               break;
> +       default:
> +               return -ENOIOCTLCMD;
> +       }
> +
> +       ret = vhost_get_vq_from_user(dev, argp, &vq, &idx);
> +       if (ret)
> +               return ret;
> +
> +       mutex_lock(&vq->mutex);
> +       switch (ioctl) {
> +       case VHOST_ATTACH_VRING_WORKER:
> +               if (copy_from_user(&ring_worker, argp, sizeof(ring_worker))) {
> +                       ret = -EFAULT;
> +                       break;
> +               }
> +
> +               ret = vhost_vq_attach_worker(vq, &ring_worker);
> +               if (!ret && copy_to_user(argp, &ring_worker,
> +                                        sizeof(ring_worker)))
> +                       ret = -EFAULT;
> +               break;
> +       case VHOST_GET_VRING_WORKER:
> +               ring_worker.index = idx;
> +               ring_worker.worker_id = vq->worker->id;
> +
> +               if (copy_to_user(argp, &ring_worker, sizeof(ring_worker)))
> +                       ret = -EFAULT;
> +               break;
> +       default:
> +               ret = -ENOIOCTLCMD;
> +               break;
> +       }
> +
> +       mutex_unlock(&vq->mutex);
> +       return ret;
> +}
> +EXPORT_SYMBOL_GPL(vhost_worker_ioctl);
> +
>  /* Caller should have device mutex */
>  long vhost_dev_set_owner(struct vhost_dev *dev)
>  {
> @@ -671,7 +814,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
>                         goto err_worker;
>
>                 for (i = 0; i < dev->nvqs; i++)
> -                       dev->vqs[i]->worker = worker;
> +                       __vhost_vq_attach_worker(dev->vqs[i], worker);
>         }
>
>         err = vhost_dev_alloc_iovecs(dev);
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 2eea20d54134..bcb33a2f64f0 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -31,6 +31,7 @@ struct vhost_worker {
>         struct llist_head       work_list;
>         u64                     kcov_handle;
>         u32                     id;
> +       int                     attachment_cnt;
>  };
>
>  /* Poll a file (eventfd or socket) */
> @@ -187,6 +188,8 @@ void vhost_dev_cleanup(struct vhost_dev *);
>  void vhost_dev_stop(struct vhost_dev *);
>  long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp);
>  long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp);
> +long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl,
> +                       void __user *argp);
>  bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
>  bool vhost_log_access_ok(struct vhost_dev *);
>  void vhost_clear_msg(struct vhost_dev *dev);
> diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
> index 92e1b700b51c..155710585979 100644
> --- a/include/uapi/linux/vhost.h
> +++ b/include/uapi/linux/vhost.h
> @@ -45,6 +45,25 @@
>  #define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64)
>  /* Specify an eventfd file descriptor to signal on log write. */
>  #define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int)
> +/* By default, a device gets one vhost_worker that its virtqueues share. This
> + * command allows the owner of the device to create an additional vhost_worker
> + * for the device. It can later be bound to 1 or more of its virtqueues using
> + * the VHOST_ATTACH_VRING_WORKER command.
> + *
> + * This must be called after VHOST_SET_OWNER and the caller must be the owner
> + * of the device. The new thread will inherit caller's cgroups and namespaces,
> + * and will share the caller's memory space. The new thread will also be
> + * counted against the caller's RLIMIT_NPROC value.

This makes me wonder: should we destroy and re-create those workers
after VHOST_RESET_OWNER?

Thanks

> + *
> + * The worker's ID used in other commands will be returned in
> + * vhost_worker_state.
> + */
> +#define VHOST_NEW_WORKER _IOR(VHOST_VIRTIO, 0x8, struct vhost_worker_state)
> +/* Free a worker created with VHOST_NEW_WORKER if it's not attached to any
> + * virtqueue. If userspace is not able to call this for workers it has created,
> + * the kernel will free all the device's workers when the device is closed.
> + */
> +#define VHOST_FREE_WORKER _IOW(VHOST_VIRTIO, 0x9, struct vhost_worker_state)
>
>  /* Ring setup. */
>  /* Set number of descriptors in ring. This parameter can not
> @@ -70,6 +89,20 @@
>  #define VHOST_VRING_BIG_ENDIAN 1
>  #define VHOST_SET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_state)
>  #define VHOST_GET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state)
> +/* Attach a vhost_worker created with VHOST_NEW_WORKER to one of the device's
> + * virtqueues. This must be done before VHOST_SET_VRING_KICK and before the
> + * driver specific ioctl to activate the virtqueue (VHOST_SCSI_SET_ENDPOINT,
> + * VHOST_NET_SET_BACKEND, VHOST_VSOCK_SET_RUNNING) is run.
> + *
> + * This will replace the virtqueue's existing worker. If the replaced worker
> + * is no longer attached to any virtqueues, it can be freed with
> + * VHOST_FREE_WORKER.
> + */
> +#define VHOST_ATTACH_VRING_WORKER _IOWR(VHOST_VIRTIO, 0x15,            \
> +                                       struct vhost_vring_worker)
> +/* Return the vring worker's ID */
> +#define VHOST_GET_VRING_WORKER _IOWR(VHOST_VIRTIO, 0x16,               \
> +                                    struct vhost_vring_worker)
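
And querying a vring's current binding would then be something like
(same assumptions as the sketch above):

	struct vhost_vring_worker w = { .index = 2 };

	ioctl(vhost_fd, VHOST_GET_VRING_WORKER, &w);
	/* w.worker_id now holds the id of the worker serving vq 2. */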
>
>  /* The following ioctls use eventfd file descriptors to signal and poll
>   * for events. */
> diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h
> index c5690a8992d8..d3aad12ad1fa 100644
> --- a/include/uapi/linux/vhost_types.h
> +++ b/include/uapi/linux/vhost_types.h
> @@ -47,6 +47,22 @@ struct vhost_vring_addr {
>         __u64 log_guest_addr;
>  };
>
> +struct vhost_worker_state {
> +       /*
> +        * For VHOST_NEW_WORKER the kernel will return the new vhost_worker id.
> +        * For VHOST_FREE_WORKER this must be set to the id of the vhost_worker
> +        * to free.
> +        */
> +       unsigned int worker_id;
> +};
> +
> +struct vhost_vring_worker {
> +       /* vring index */
> +       unsigned int index;
> +       /* The id of the vhost_worker returned from VHOST_NEW_WORKER */
> +       unsigned int worker_id;
> +};
> +
>  /* no alignment requirement */
>  struct vhost_iotlb_msg {
>         __u64 iova;
> --
> 2.25.1
>
