On Tue, Mar 28, 2023 at 10:17 AM Mike Christie <michael.christie@xxxxxxxxxx> wrote:
>
> For vhost-scsi with 3 vqs and a workload that tries to use those vqs
> like:
>
> fio --filename=/dev/sdb --direct=1 --rw=randrw --bs=4k \
> --ioengine=libaio --iodepth=128 --numjobs=3
>
> the single vhost worker thread will become a bottleneck and we are stuck
> at around 500K IOPs no matter how many jobs, virtqueues, and CPUs are
> used.
>
> To better utilize virtqueues and available CPUs, this patch allows
> userspace to create workers and bind them to vqs. You can have N workers
> per dev and also share N workers among M vqs.
>
> With the patches and doing a worker per vq, we can scale to at least
> 16 vCPUs/vqs (that's my system limit) with the same fio command as
> above but with numjobs=16:
>
> fio --filename=/dev/sdb --direct=1 --rw=randrw --bs=4k \
> --ioengine=libaio --iodepth=64 --numjobs=16
>
> which gives around 2326K IOPs.
>
> Note that for testing I dropped the iodepth to 64 above because the
> vhost/virt layer supports only 1024 total commands per device. And the
> only tuning I did was set LIO's emulate_pr to 0 to avoid LIO's PR lock
> in the main IO path, which becomes an issue at around 12
> jobs/virtqueues.
>
> Signed-off-by: Mike Christie <michael.christie@xxxxxxxxxx>
> ---
>  drivers/vhost/vhost.c            | 177 ++++++++++++++++++++++++++++---
>  drivers/vhost/vhost.h            |   4 +-
>  include/uapi/linux/vhost.h       |  22 ++++
>  include/uapi/linux/vhost_types.h |  15 +++
>  4 files changed, 204 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 1fa5e9a49092..e40699e83c6d 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -271,7 +271,11 @@ EXPORT_SYMBOL_GPL(vhost_vq_work_queue);
>
>  void vhost_dev_flush(struct vhost_dev *dev)
>  {
> -        vhost_work_flush_on(dev->worker);
> +        struct vhost_worker *worker;
> +        unsigned long i;
> +
> +        xa_for_each(&dev->worker_xa, i, worker)
> +                vhost_work_flush_on(worker);
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_flush);
>
> @@ -489,7 +493,6 @@ void vhost_dev_init(struct vhost_dev *dev,
>          dev->umem = NULL;
>          dev->iotlb = NULL;
>          dev->mm = NULL;
> -        dev->worker = NULL;
>          dev->iov_limit = iov_limit;
>          dev->weight = weight;
>          dev->byte_weight = byte_weight;
> @@ -499,7 +502,7 @@ void vhost_dev_init(struct vhost_dev *dev,
>          INIT_LIST_HEAD(&dev->read_list);
>          INIT_LIST_HEAD(&dev->pending_list);
>          spin_lock_init(&dev->iotlb_lock);
> -
> +        xa_init_flags(&dev->worker_xa, XA_FLAGS_ALLOC);
>
>          for (i = 0; i < dev->nvqs; ++i) {
>                  vq = dev->vqs[i];
> @@ -562,32 +565,67 @@ static void vhost_detach_mm(struct vhost_dev *dev)
>          dev->mm = NULL;
>  }
>
> -static void vhost_worker_free(struct vhost_dev *dev)
> +static void vhost_worker_put(struct vhost_dev *dev, struct vhost_worker *worker)
>  {
> -        struct vhost_worker *worker = dev->worker;
> -
>          if (!worker)
>                  return;
>
> -        dev->worker = NULL;
> +        if (!refcount_dec_and_test(&worker->refcount))
> +                return;
> +
>          WARN_ON(!llist_empty(&worker->work_list));
>          vhost_task_stop(worker->vtsk);
>          kfree(worker);
>  }
>
> +static void vhost_vq_detach_worker(struct vhost_virtqueue *vq)
> +{
> +        if (vq->worker)

What happens to the pending work that was queued for the old worker?
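E.g. does the detach path need to drain the old worker first? A rough
sketch of what I mean, just to illustrate (untested, reusing
vhost_work_flush_on() and vhost_worker_put() from this patch):

static void vhost_vq_detach_worker(struct vhost_virtqueue *vq)
{
        if (!vq->worker)
                return;

        /* Drain work still queued on the old worker before dropping the ref. */
        vhost_work_flush_on(vq->worker);
        vhost_worker_put(vq->dev, vq->worker);
        vq->worker = NULL;
}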
> +                vhost_worker_put(vq->dev, vq->worker);
> +        vq->worker = NULL;
> +}
> +
> +static void vhost_workers_free(struct vhost_dev *dev)
> +{
> +        struct vhost_worker *worker;
> +        unsigned long i;
> +
> +        if (!dev->use_worker)
> +                return;
> +
> +        for (i = 0; i < dev->nvqs; i++)
> +                vhost_vq_detach_worker(dev->vqs[i]);
> +        /*
> +         * Drop the refcount taken during allocation, and handle the default
> +         * worker and the cases where userspace might have crashed or was lazy
> +         * and did a VHOST_NEW_WORKER but not a VHOST_FREE_WORKER.
> +         */
> +        xa_for_each(&dev->worker_xa, i, worker) {
> +                xa_erase(&dev->worker_xa, worker->id);
> +                vhost_worker_put(dev, worker);
> +        }
> +        xa_destroy(&dev->worker_xa);
> +}
> +
>  static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
>  {
>          struct vhost_worker *worker;
>          struct vhost_task *vtsk;
>          char name[TASK_COMM_LEN];
> +        int ret;
> +        u32 id;
>
>          worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
>          if (!worker)
>                  return NULL;
>
> -        dev->worker = worker;
>          worker->kcov_handle = kcov_common_handle();
>          init_llist_head(&worker->work_list);
> +        /*
> +         * We increase the refcount for the initial creation and then
> +         * later each time it's attached to a virtqueue.
> +         */
> +        refcount_set(&worker->refcount, 1);
>          snprintf(name, sizeof(name), "vhost-%d", current->pid);
>
>          vtsk = vhost_task_create(vhost_worker, worker, name);
> @@ -596,14 +634,104 @@ static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
>
>          worker->vtsk = vtsk;
>          vhost_task_start(vtsk);
> +
> +        ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
> +        if (ret < 0)
> +                goto stop_worker;
> +        worker->id = id;
> +
>          return worker;
>
> +stop_worker:
> +        vhost_task_stop(vtsk);
>  free_worker:
>          kfree(worker);
> -        dev->worker = NULL;
>          return NULL;
>  }
>
> +/* Caller must have device and virtqueue mutex */
> +static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq,
> +                                     struct vhost_worker *worker)
> +{
> +        refcount_inc(&worker->refcount);
> +        vhost_vq_detach_worker(vq);
> +        vq->worker = worker;

What happens if there's a pending flush in a specific device (e.g. in
vhost_scsi_tmf_resp_work())? Does this mean we need to hold the vq mutex
when doing the flush? (That does not seem to be the case in
vhost_scsi_tmf_resp_work().)

> +}
> +
> +/* Caller must have device and virtqueue mutex */
> +static int vhost_vq_attach_worker(struct vhost_virtqueue *vq,
> +                                  struct vhost_vring_worker *info)
> +{
> +        unsigned long index = info->worker_id;
> +        struct vhost_dev *dev = vq->dev;
> +        struct vhost_worker *worker;
> +
> +        if (!dev->use_worker)
> +                return -EINVAL;
> +
> +        /*
> +         * We don't support setting a worker on an active vq to make flushing
> +         * and removal simple.
> +         */
> +        if (vhost_vq_get_backend(vq))
> +                return -EBUSY;

This assumes the worker won't be started until the backend is set, which
is not true.
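And since checking the backend does not guarantee the vq is quiesced, a
window like the following looks possible (hypothetical interleaving,
using the helpers from this patch):

        CPU0 (flush path)                CPU1 (VHOST_ATTACH_VRING_WORKER)
        vhost_vq_work_queue(vq, work)
          /* work lands on the old
             worker's work_list */
                                         __vhost_vq_attach_worker()
                                           vq->worker = new worker
        vhost_work_flush_on(vq->worker)
          /* flushes the new worker; the
             work pending on the old
             worker is never waited on */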
> +
> +        worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT);
> +        if (!worker || worker->id != info->worker_id)
> +                return -ENODEV;
> +
> +        __vhost_vq_attach_worker(vq, worker);
> +        return 0;
> +}
> +
> +/* Caller must have device mutex */
> +static int vhost_new_worker(struct vhost_dev *dev,
> +                            struct vhost_worker_state *info)
> +{
> +        struct vhost_worker *worker;
> +
> +        if (!dev->use_worker)
> +                return -EINVAL;
> +
> +        worker = vhost_worker_create(dev);
> +        if (!worker)
> +                return -ENOMEM;
> +
> +        info->worker_id = worker->id;
> +        return 0;
> +}
> +
> +/* Caller must have device mutex */
> +static int vhost_free_worker(struct vhost_dev *dev,
> +                             struct vhost_worker_state *info)
> +{
> +        unsigned long index = info->worker_id;
> +        struct vhost_worker *worker;
> +
> +        if (!dev->use_worker)
> +                return -EINVAL;
> +
> +        worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT);

So we use a signed int for worker_id, which conflicts with the UINT_MAX
limit here?

struct vhost_worker_state {
        /*
         * For VHOST_NEW_WORKER the kernel will return the new vhost_worker id.
         * For VHOST_FREE_WORKER this must be set to the id of the vhost_worker
         * to free.
         */
        int worker_id;
};

A side effect of using the xa_index directly is that userspace can guess
the xa_index of the default worker and free it here. Is this intended?
Should we hide the default worker from the xarray?

> +        if (!worker || worker->id != info->worker_id)
> +                return -ENODEV;
> +
> +        /*
> +         * We can free the worker if it's not attached to any virtqueues.
> +         */
> +        if (refcount_read(&worker->refcount) != 1)
> +                return -EBUSY;
> +
> +        xa_erase(&dev->worker_xa, worker->id);
> +        /*
> +         * Make sure if there was a flush that saw the worker in the XA that
> +         * it has completed.
> +         */
> +        vhost_work_flush_on(worker);
> +
> +        vhost_worker_put(dev, worker);
> +        return 0;
> +}
> +
>  /* Caller should have device mutex */
>  long vhost_dev_set_owner(struct vhost_dev *dev)
>  {
> @@ -624,7 +752,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
>                          goto err_worker;
>
>                  for (i = 0; i < dev->nvqs; i++)
> -                        dev->vqs[i]->worker = worker;
> +                        __vhost_vq_attach_worker(dev->vqs[i], worker);
>          }
>
>          err = vhost_dev_alloc_iovecs(dev);
> @@ -633,7 +761,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
>
>          return 0;
>  err_iovecs:
> -        vhost_worker_free(dev);
> +        vhost_workers_free(dev);
>  err_worker:
>          vhost_detach_mm(dev);
>  err_mm:
> @@ -726,7 +854,7 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
>          dev->iotlb = NULL;
>          vhost_clear_msg(dev);
>          wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
> -        vhost_worker_free(dev);
> +        vhost_workers_free(dev);
>          vhost_detach_mm(dev);
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
> @@ -1616,6 +1744,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
>          struct eventfd_ctx *ctx = NULL;
>          u32 __user *idxp = argp;
>          struct vhost_virtqueue *vq;
> +        struct vhost_vring_worker w;
>          struct vhost_vring_state s;
>          struct vhost_vring_file f;
>          u32 idx;
> @@ -1723,7 +1852,16 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
>                  if (copy_to_user(argp, &s, sizeof(s)))
>                          r = -EFAULT;
>                  break;
> -        default:
> +        case VHOST_ATTACH_VRING_WORKER:
> +                if (copy_from_user(&w, argp, sizeof(w))) {
> +                        r = -EFAULT;
> +                        break;
> +                }
> +                r = vhost_vq_attach_worker(vq, &w);
> +                if (!r && copy_to_user(argp, &w, sizeof(w)))
> +                        r = -EFAULT;
> +                break;

It's a little odd that we have "new" and "attach" but only a "free", with
no "detach" counterpart.
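For reference, my understanding of the intended userspace flow is roughly
the following (sketch only, error handling omitted; dev_fd is a
hypothetical already-open vhost device fd):

        struct vhost_worker_state state = { 0 };
        struct vhost_vring_worker w = { 0 };

        ioctl(dev_fd, VHOST_SET_OWNER);          /* creates the default worker */
        ioctl(dev_fd, VHOST_NEW_WORKER, &state); /* kernel fills state.worker_id */

        w.index = 0;                             /* vq to bind, before it is active */
        w.worker_id = state.worker_id;
        ioctl(dev_fd, VHOST_ATTACH_VRING_WORKER, &w);

        /*
         * With no detach ioctl, this fails with -EBUSY while the worker is
         * still attached; otherwise workers are reaped on device release.
         */
        ioctl(dev_fd, VHOST_FREE_WORKER, &state);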
Thanks

> +        default:
>                  r = -ENOIOCTLCMD;
>          }
>
> @@ -1776,6 +1914,7 @@ EXPORT_SYMBOL_GPL(vhost_init_device_iotlb);
>  /* Caller must have device mutex */
>  long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
>  {
> +        struct vhost_worker_state w;
>          struct eventfd_ctx *ctx;
>          u64 p;
>          long r;
> @@ -1836,6 +1975,18 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
>                  if (ctx)
>                          eventfd_ctx_put(ctx);
>                  break;
> +        case VHOST_NEW_WORKER:
> +                r = vhost_new_worker(d, &w);
> +                if (!r && copy_to_user(argp, &w, sizeof(w)))
> +                        r = -EFAULT;
> +                break;
> +        case VHOST_FREE_WORKER:
> +                if (copy_from_user(&w, argp, sizeof(w))) {
> +                        r = -EFAULT;
> +                        break;
> +                }
> +                r = vhost_free_worker(d, &w);
> +                break;
>          default:
>                  r = -ENOIOCTLCMD;
>                  break;
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 395707c680e5..a67ae8293c38 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -30,6 +30,8 @@ struct vhost_worker {
>          struct vhost_task *vtsk;
>          struct llist_head work_list;
>          u64 kcov_handle;
> +        refcount_t refcount;
> +        u32 id;
>  };
>
>  /* Poll a file (eventfd or socket) */
> @@ -156,7 +158,6 @@ struct vhost_dev {
>          struct vhost_virtqueue **vqs;
>          int nvqs;
>          struct eventfd_ctx *log_ctx;
> -        struct vhost_worker *worker;
>          struct vhost_iotlb *umem;
>          struct vhost_iotlb *iotlb;
>          spinlock_t iotlb_lock;
> @@ -166,6 +167,7 @@ struct vhost_dev {
>          int iov_limit;
>          int weight;
>          int byte_weight;
> +        struct xarray worker_xa;
>          bool use_worker;
>          int (*msg_handler)(struct vhost_dev *dev, u32 asid,
>                             struct vhost_iotlb_msg *msg);
> diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
> index 92e1b700b51c..7329e7f349dd 100644
> --- a/include/uapi/linux/vhost.h
> +++ b/include/uapi/linux/vhost.h
> @@ -45,6 +45,23 @@
>  #define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64)
>  /* Specify an eventfd file descriptor to signal on log write. */
>  #define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int)
> +/* By default, a device gets one vhost_worker that its virtqueues share. This
> + * command allows the owner of the device to create an additional vhost_worker
> + * for the device. It can later be bound to 1 or more of its virtqueues using
> + * the VHOST_ATTACH_VRING_WORKER command.
> + *
> + * This must be called after VHOST_SET_OWNER and the caller must be the owner
> + * of the device. The new thread will inherit the caller's cgroups and
> + * namespaces, and will share the caller's memory space. The new thread will
> + * also be counted against the caller's RLIMIT_NPROC value.
> + */
> +#define VHOST_NEW_WORKER _IOR(VHOST_VIRTIO, 0x8, struct vhost_worker_state)
> +/* Free a worker created with VHOST_NEW_WORKER if it's not attached to any
> + * virtqueue. If userspace is not able to call this for workers it has
> + * created, the kernel will free all the device's workers when the device is
> + * closed and the last reference to the device has been released.
> + */
> +#define VHOST_FREE_WORKER _IOW(VHOST_VIRTIO, 0x9, struct vhost_worker_state)
>
>  /* Ring setup. */
>  /* Set number of descriptors in ring. This parameter can not
> @@ -70,6 +87,11 @@
>  #define VHOST_VRING_BIG_ENDIAN 1
>  #define VHOST_SET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_state)
>  #define VHOST_GET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state)
> +/* Attach a vhost_worker created with VHOST_NEW_WORKER to one of the device's
> + * virtqueues. This must be done before the virtqueue is active.
> + */
> +#define VHOST_ATTACH_VRING_WORKER _IOW(VHOST_VIRTIO, 0x15, \
> +                                       struct vhost_vring_worker)
>
>  /* The following ioctls use eventfd file descriptors to signal and poll
>   * for events. */
> diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h
> index c5690a8992d8..ad0fe2e721be 100644
> --- a/include/uapi/linux/vhost_types.h
> +++ b/include/uapi/linux/vhost_types.h
> @@ -47,6 +47,21 @@ struct vhost_vring_addr {
>          __u64 log_guest_addr;
>  };
>
> +struct vhost_worker_state {
> +        /*
> +         * For VHOST_NEW_WORKER the kernel will return the new vhost_worker id.
> +         * For VHOST_FREE_WORKER this must be set to the id of the vhost_worker
> +         * to free.
> +         */
> +        int worker_id;
> +};
> +
> +struct vhost_vring_worker {
> +        unsigned int index;
> +        /* The id of the vhost_worker returned from VHOST_NEW_WORKER */
> +        int worker_id;
> +};
> +
>  /* no alignment requirement */
>  struct vhost_iotlb_msg {
>          __u64 iova;
> --
> 2.25.1