On Sat, Apr 29, 2023 at 12:32 AM Mike Christie
<michael.christie@xxxxxxxxxx> wrote:
>
> For vhost-scsi with 3 vqs or more and a workload that tries to use
> them in parallel like:
>
> fio --filename=/dev/sdb --direct=1 --rw=randrw --bs=4k \
> --ioengine=libaio --iodepth=128 --numjobs=3
>
> the single vhost worker thread will become a bottleneck and we are stuck
> at around 500K IOPs no matter how many jobs, virtqueues, and CPUs are
> used.
>
> To better utilize virtqueues and available CPUs, this patch allows
> userspace to create workers and bind them to vqs. You can have N workers
> per dev and also share N workers with M vqs on that dev.
>
> This patch adds the interface related code and the next patch will hook
> vhost-scsi into it. The patches do not try to hook net and vsock into
> the interface because:
>
> 1. multiple workers don't seem to help vsock. The problem is that with
> only 2 virtqueues we never fully use the existing worker when doing
> bidirectional tests. This seems to match vhost-scsi where we don't see
> the worker as a bottleneck until 3 virtqueues are used.
>
> 2. net already has a way to use multiple workers.
>
> Signed-off-by: Mike Christie <michael.christie@xxxxxxxxxx>
> ---
>  drivers/vhost/vhost.c            | 145 ++++++++++++++++++++++++++++++-
>  drivers/vhost/vhost.h            |   3 +
>  include/uapi/linux/vhost.h       |  33 +++++++
>  include/uapi/linux/vhost_types.h |  16 ++++
>  4 files changed, 196 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 4b0b82292379..e8f829f35814 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -630,6 +630,80 @@ static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
>  	return NULL;
>  }
>
> +/* Caller must have device mutex */
> +static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq,
> +				     struct vhost_worker *worker)
> +{
> +	if (vq->worker)
> +		vq->worker->attachment_cnt--;
> +	worker->attachment_cnt++;
> +	vq->worker = worker;
> +}
> +
> +/**
> + * vhost_vq_attach_worker - set a virtqueue's worker from an ioctl command
> + * @vq: the virtqueue we will set the worker for
> + * @info: the worker userspace has requested us to use
> + *
> + * We only allow userspace to set a virtqueue's worker if it's not active and
> + * polling is not enabled.

I wonder if we can mandate this in the code, e.g. by checking the vq backend
in vhost_vq_work_queue().

> + * We also assume drivers supporting this will not be
> + * internally queueing works directly or via calls like vhost_dev_flush at
> + * this time.
> + *
> + * Caller must have device and virtqueue mutex.
> + */
> +static int vhost_vq_attach_worker(struct vhost_virtqueue *vq,
> +				  struct vhost_vring_worker *info)
> +{
> +	unsigned long index = info->worker_id;
> +	struct vhost_dev *dev = vq->dev;
> +	struct vhost_worker *worker;
> +
> +	if (!dev->use_worker)
> +		return -EINVAL;
> +
> +	if (vhost_vq_get_backend(vq) || vq->kick)

It might be worthwhile to have a comment explaining why we need to check
vq->kick here.

This also means the device should not queue work when the backend is NULL.
But I found that is probably not the case for vsock: it calls
vhost_poll_queue() in vhost_transport_cancel_pkt(), but vhost_vsock_stop()
doesn't wait before doing vhost_vq_set_backend(vq, NULL). Net seems to be
fine since it waits for ubufs to be completed in vhost_net_set_backend().

Can we make things easier by migrating the work_list?
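Something like this completely untested sketch is roughly what I have in
mind (vhost_worker_migrate_work() is a made-up name, and it assumes both
workers are quiesced, e.g. device mutex held and the vq not yet running, so
the real locking and wake-up story would need more thought):

static void vhost_worker_migrate_work(struct vhost_worker *src,
				      struct vhost_worker *dst)
{
	struct llist_node *node = llist_del_all(&src->work_list);
	struct vhost_work *work, *tmp;

	/*
	 * Requeue anything still pending on the old worker onto the new
	 * one so nothing queued before the attach is lost. Note this
	 * naive loop does not preserve the original queueing order.
	 */
	llist_for_each_entry_safe(work, tmp, node, node)
		llist_add(&work->node, &dst->work_list);
}

__vhost_vq_attach_worker() could then call something like this (and wake up
the new worker) instead of relying on the vq being completely idle.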
I also worry that there are other corner cases, which makes me wonder how
hard it would be if we just supported those ioctls after the backend is set?

> +		return -EBUSY;
> +
> +	worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT);
> +	if (!worker || worker->id != info->worker_id)
> +		return -ENODEV;
> +
> +	__vhost_vq_attach_worker(vq, worker);
> +	return 0;
> +}
> +
> +/* Caller must have device mutex */
> +static int vhost_new_worker(struct vhost_dev *dev,
> +			    struct vhost_worker_state *info)
> +{
> +	struct vhost_worker *worker;
> +
> +	worker = vhost_worker_create(dev);
> +	if (!worker)
> +		return -ENOMEM;
> +
> +	info->worker_id = worker->id;
> +	return 0;
> +}
> +
> +static int vhost_free_worker(struct vhost_dev *dev,
> +			     struct vhost_worker_state *info)
> +{
> +	unsigned long index = info->worker_id;
> +	struct vhost_worker *worker;
> +
> +	worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT);
> +	if (!worker || worker->id != info->worker_id)
> +		return -ENODEV;
> +
> +	if (worker->attachment_cnt)
> +		return -EBUSY;
> +
> +	vhost_worker_destroy(dev, worker);
> +	return 0;
> +}
> +
>  static int vhost_get_vq_from_user(struct vhost_dev *dev, void __user *argp,
>  				  struct vhost_virtqueue **vq, u32 *id)
>  {
> @@ -651,6 +725,75 @@ static int vhost_get_vq_from_user(struct vhost_dev *dev, void __user *argp,
>  	return 0;
>  }
>
> +/* Caller must have device mutex */
> +long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl,
> +			void __user *argp)
> +{
> +	struct vhost_vring_worker ring_worker;
> +	struct vhost_worker_state state;
> +	struct vhost_virtqueue *vq;
> +	long ret;
> +	u32 idx;
> +
> +	if (!dev->use_worker)
> +		return -EINVAL;
> +
> +	if (!vhost_dev_has_owner(dev))
> +		return -EINVAL;
> +
> +	switch (ioctl) {
> +	/* dev worker ioctls */
> +	case VHOST_NEW_WORKER:
> +		ret = vhost_new_worker(dev, &state);
> +		if (!ret && copy_to_user(argp, &state, sizeof(state)))
> +			ret = -EFAULT;
> +		return ret;
> +	case VHOST_FREE_WORKER:
> +		if (copy_from_user(&state, argp, sizeof(state)))
> +			return -EFAULT;
> +		return vhost_free_worker(dev, &state);
> +	/* vring worker ioctls */
> +	case VHOST_ATTACH_VRING_WORKER:
> +	case VHOST_GET_VRING_WORKER:
> +		break;
> +	default:
> +		return -ENOIOCTLCMD;
> +	}
> +
> +	ret = vhost_get_vq_from_user(dev, argp, &vq, &idx);
> +	if (ret)
> +		return ret;
> +
> +	mutex_lock(&vq->mutex);
> +	switch (ioctl) {
> +	case VHOST_ATTACH_VRING_WORKER:
> +		if (copy_from_user(&ring_worker, argp, sizeof(ring_worker))) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +
> +		ret = vhost_vq_attach_worker(vq, &ring_worker);
> +		if (!ret && copy_to_user(argp, &ring_worker,
> +					 sizeof(ring_worker)))
> +			ret = -EFAULT;
> +		break;
> +	case VHOST_GET_VRING_WORKER:
> +		ring_worker.index = idx;
> +		ring_worker.worker_id = vq->worker->id;
> +
> +		if (copy_to_user(argp, &ring_worker, sizeof(ring_worker)))
> +			ret = -EFAULT;
> +		break;
> +	default:
> +		ret = -ENOIOCTLCMD;
> +		break;
> +	}
> +
> +	mutex_unlock(&vq->mutex);
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(vhost_worker_ioctl);
> +
>  /* Caller should have device mutex */
>  long vhost_dev_set_owner(struct vhost_dev *dev)
>  {
> @@ -671,7 +814,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
>  			goto err_worker;
>
>  		for (i = 0; i < dev->nvqs; i++)
> -			dev->vqs[i]->worker = worker;
> +			__vhost_vq_attach_worker(dev->vqs[i], worker);
>  	}
>
>  	err = vhost_dev_alloc_iovecs(dev);
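Coming back to my earlier point about mandating the "no queueing before the
backend is set" rule in code: I was imagining an untested sketch like the
one below in the vq based queueing path. The helper name is made up here
(vhost_work_queue() on the dev is what exists today), so treat it purely as
an illustration of the idea rather than a concrete request:

static void vhost_vq_work_queue_checked(struct vhost_virtqueue *vq,
					struct vhost_work *work)
{
	/*
	 * Catch drivers that queue work before a backend is attached,
	 * since vhost_vq_attach_worker() relies on that never happening.
	 */
	WARN_ON_ONCE(!vhost_vq_get_backend(vq));
	vhost_work_queue(vq->dev, work);
}

That would at least make it obvious when a driver (e.g. the vsock cancel
path mentioned above) breaks the assumption.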
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 2eea20d54134..bcb33a2f64f0 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -31,6 +31,7 @@ struct vhost_worker {
>  	struct llist_head work_list;
>  	u64 kcov_handle;
>  	u32 id;
> +	int attachment_cnt;
>  };
>
>  /* Poll a file (eventfd or socket) */
> @@ -187,6 +188,8 @@ void vhost_dev_cleanup(struct vhost_dev *);
>  void vhost_dev_stop(struct vhost_dev *);
>  long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp);
>  long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp);
> +long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl,
> +			void __user *argp);
>  bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
>  bool vhost_log_access_ok(struct vhost_dev *);
>  void vhost_clear_msg(struct vhost_dev *dev);
> diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
> index 92e1b700b51c..155710585979 100644
> --- a/include/uapi/linux/vhost.h
> +++ b/include/uapi/linux/vhost.h
> @@ -45,6 +45,25 @@
>  #define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64)
>  /* Specify an eventfd file descriptor to signal on log write. */
>  #define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int)
> +/* By default, a device gets one vhost_worker that its virtqueues share. This
> + * command allows the owner of the device to create an additional vhost_worker
> + * for the device. It can later be bound to 1 or more of its virtqueues using
> + * the VHOST_ATTACH_VRING_WORKER command.
> + *
> + * This must be called after VHOST_SET_OWNER and the caller must be the owner
> + * of the device. The new thread will inherit caller's cgroups and namespaces,
> + * and will share the caller's memory space. The new thread will also be
> + * counted against the caller's RLIMIT_NPROC value.

This makes me wonder whether we should destroy and re-create those workers
after VHOST_RESET_OWNER?

Thanks

> + *
> + * The worker's ID used in other commands will be returned in
> + * vhost_worker_state.
> + */
> +#define VHOST_NEW_WORKER _IOR(VHOST_VIRTIO, 0x8, struct vhost_worker_state)
> +/* Free a worker created with VHOST_NEW_WORKER if it's not attached to any
> + * virtqueue. If userspace is not able to call this for workers it has created,
> + * the kernel will free all the device's workers when the device is closed.
> + */
> +#define VHOST_FREE_WORKER _IOW(VHOST_VIRTIO, 0x9, struct vhost_worker_state)
>
>  /* Ring setup. */
>  /* Set number of descriptors in ring. This parameter can not
> @@ -70,6 +89,20 @@
>  #define VHOST_VRING_BIG_ENDIAN 1
>  #define VHOST_SET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_state)
>  #define VHOST_GET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state)
> +/* Attach a vhost_worker created with VHOST_NEW_WORKER to one of the device's
> + * virtqueues. This must be done before VHOST_SET_VRING_KICK and the driver
> + * specific ioctl to activate the virtqueue (VHOST_SCSI_SET_ENDPOINT,
> + * VHOST_NET_SET_BACKEND, VHOST_VSOCK_SET_RUNNING) has been run.
> + *
> + * This will replace the virtqueue's existing worker. If the replaced worker
> + * is no longer attached to any virtqueues, it can be freed with
> + * VHOST_FREE_WORKER.
> + */
> +#define VHOST_ATTACH_VRING_WORKER _IOWR(VHOST_VIRTIO, 0x15,		\
> +					struct vhost_vring_worker)
> +/* Return the vring worker's ID */
> +#define VHOST_GET_VRING_WORKER _IOWR(VHOST_VIRTIO, 0x16,		\
> +					struct vhost_vring_worker)
>
>  /* The following ioctls use eventfd file descriptors to signal and poll
>   * for events. */
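Just to double check that I understand the intended flow for userspace with
these uapi additions applied: after VHOST_SET_OWNER a VMM would do something
roughly like the helper below before setting up the kick fd and the backend
(untested, the helper name is made up, error reporting simplified):

#include <sys/ioctl.h>
#include <linux/vhost.h>

/* Create one extra worker and bind it to vring @index before the vring is
 * activated. Returns the new worker id, or -1 on error. */
static int bind_new_worker(int vhost_fd, unsigned int index)
{
	struct vhost_worker_state w = { 0 };
	struct vhost_vring_worker vw = { 0 };

	if (ioctl(vhost_fd, VHOST_NEW_WORKER, &w))
		return -1;

	vw.index = index;
	vw.worker_id = w.worker_id;
	/* Must happen before VHOST_SET_VRING_KICK and the driver specific
	 * backend ioctl, otherwise the kernel returns -EBUSY. */
	if (ioctl(vhost_fd, VHOST_ATTACH_VRING_WORKER, &vw))
		return -1;

	return w.worker_id;
}

And if the ordering is violated, VHOST_ATTACH_VRING_WORKER just fails with
-EBUSY as coded in vhost_vq_attach_worker(), right?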
> diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h
> index c5690a8992d8..d3aad12ad1fa 100644
> --- a/include/uapi/linux/vhost_types.h
> +++ b/include/uapi/linux/vhost_types.h
> @@ -47,6 +47,22 @@ struct vhost_vring_addr {
>  	__u64 log_guest_addr;
>  };
>
> +struct vhost_worker_state {
> +	/*
> +	 * For VHOST_NEW_WORKER the kernel will return the new vhost_worker id.
> +	 * For VHOST_FREE_WORKER this must be set to the id of the vhost_worker
> +	 * to free.
> +	 */
> +	unsigned int worker_id;
> +};
> +
> +struct vhost_vring_worker {
> +	/* vring index */
> +	unsigned int index;
> +	/* The id of the vhost_worker returned from VHOST_NEW_WORKER */
> +	unsigned int worker_id;
> +};
> +
>  /* no alignment requirement */
>  struct vhost_iotlb_msg {
>  	__u64 iova;
> --
> 2.25.1
>
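Btw, regarding my VHOST_RESET_OWNER question above, I was picturing
something like the untested sketch below being called from the reset and
release paths (vhost_workers_free() is a name I made up, and it assumes no
vq can still queue work at that point):

static void vhost_workers_free(struct vhost_dev *dev)
{
	struct vhost_worker *worker;
	unsigned long i;

	if (!dev->use_worker)
		return;

	/* Clear the vq pointers so nothing queues to a worker we are
	 * about to destroy. */
	for (i = 0; i < dev->nvqs; i++)
		dev->vqs[i]->worker = NULL;

	xa_for_each(&dev->worker_xa, i, worker)
		vhost_worker_destroy(dev, worker);
	xa_destroy(&dev->worker_xa);
}

Not a blocker, I'm mainly asking whether the RLIMIT_NPROC accounting for the
extra threads goes away at VHOST_RESET_OWNER time or only at close().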