For vhost-scsi with 3 vqs and a workload that tries to use those vqs
like:

fio --filename=/dev/sdb --direct=1 --rw=randrw --bs=4k \
--ioengine=libaio --iodepth=128 --numjobs=3

the single vhost worker thread will become a bottleneck.

To better utilize virtqueues and available CPUs, this patch allows
userspace to create workers and bind them to vqs. You can have N workers
per dev and also share N workers with M vqs.

Signed-off-by: Mike Christie <michael.christie@xxxxxxxxxx>
---
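Not part of the patch: a rough, untested sketch of how userspace could
drive the new ioctls. setup_vq_workers(), vhost_fd and nvqs are made-up
names; it assumes an open vhost device fd (e.g. from /dev/vhost-scsi) on
which VHOST_SET_OWNER has already been done and whose vqs are not yet
active:

#include <sys/ioctl.h>
#include <linux/vhost.h>

/* Hypothetical helper: create one worker per vq and bind them 1:1. */
static int setup_vq_workers(int vhost_fd, unsigned int nvqs)
{
	unsigned int i;

	for (i = 0; i < nvqs; i++) {
		struct vhost_worker_state state = { 0 };
		struct vhost_vring_worker vring_worker = { 0 };

		/* The kernel returns the new worker's id in state.worker_id. */
		if (ioctl(vhost_fd, VHOST_NEW_WORKER, &state) < 0)
			return -1;

		vring_worker.index = i;
		vring_worker.worker_id = state.worker_id;
		if (ioctl(vhost_fd, VHOST_ATTACH_VRING_WORKER, &vring_worker) < 0)
			return -1;
	}
	return 0;
}

Sharing a worker between M vqs would just be a matter of passing the same
worker_id to multiple VHOST_ATTACH_VRING_WORKER calls, and a worker that
is not attached to any vq can be freed early with VHOST_FREE_WORKER.
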
 drivers/vhost/vhost.c            | 164 ++++++++++++++++++++++++++++---
 drivers/vhost/vhost.h            |   4 +-
 include/uapi/linux/vhost.h       |  22 +++++
 include/uapi/linux/vhost_types.h |  15 +++
 4 files changed, 188 insertions(+), 17 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 050b8007db8b..1af43b5d1dbd 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -248,18 +248,16 @@ static void vhost_work_queue_on(struct vhost_worker *worker,
 	}
 }
 
-static void vhost_work_flush_on(struct vhost_worker *worker)
+static int vhost_workers_idr_flush_iter(int id, void *worker, void *dev)
 {
 	struct vhost_flush_struct flush;
 
-	if (!worker)
-		return;
-
 	init_completion(&flush.wait_event);
 	vhost_work_init(&flush.work, vhost_flush_work);
 
 	vhost_work_queue_on(worker, &flush.work);
 	wait_for_completion(&flush.wait_event);
+	return 0;
 }
 
 void vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work)
@@ -268,9 +266,16 @@ void vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work)
 }
 EXPORT_SYMBOL_GPL(vhost_vq_work_queue);
 
+/**
+ * vhost_work_dev_flush - flush vhost work in device
+ * @dev: vhost dev to flush
+ *
+ * This must be called with the device mutex or from the device's release
+ * function when workers cannot be swapped.
+ */
 void vhost_work_dev_flush(struct vhost_dev *dev)
 {
-	vhost_work_flush_on(dev->worker);
+	idr_for_each(&dev->worker_idr, vhost_workers_idr_flush_iter, dev);
 }
 EXPORT_SYMBOL_GPL(vhost_work_dev_flush);
 
@@ -485,7 +490,6 @@ void vhost_dev_init(struct vhost_dev *dev,
 	dev->umem = NULL;
 	dev->iotlb = NULL;
 	dev->mm = NULL;
-	dev->worker = NULL;
 	dev->iov_limit = iov_limit;
 	dev->weight = weight;
 	dev->byte_weight = byte_weight;
@@ -495,6 +499,7 @@ void vhost_dev_init(struct vhost_dev *dev,
 	INIT_LIST_HEAD(&dev->read_list);
 	INIT_LIST_HEAD(&dev->pending_list);
 	spin_lock_init(&dev->iotlb_lock);
+	idr_init(&dev->worker_idr);
 
 
 	for (i = 0; i < dev->nvqs; ++i) {
@@ -568,31 +573,59 @@ static void vhost_worker_stop(struct vhost_worker *worker)
 	wait_for_completion(worker->exit_done);
 }
 
-static void vhost_worker_free(struct vhost_dev *dev)
+static void vhost_worker_put(struct vhost_dev *dev, struct vhost_worker *worker)
 {
-	struct vhost_worker *worker = dev->worker;
-
 	if (!worker)
 		return;
 
-	dev->worker = NULL;
+	if (!refcount_dec_and_test(&worker->refcount))
+		return;
+
 	WARN_ON(!llist_empty(&worker->work_list));
 	vhost_worker_stop(worker);
+	idr_remove(&dev->worker_idr, worker->id);
 	kfree(worker);
 }
 
+static void vhost_vq_detach_worker(struct vhost_virtqueue *vq)
+{
+	if (vq->worker)
+		vhost_worker_put(vq->dev, vq->worker);
+	vq->worker = NULL;
+}
+
+static int vhost_workers_idr_iter(int id, void *worker, void *dev)
+{
+	vhost_worker_put(dev, worker);
+	return 0;
+}
+
+static void vhost_workers_free(struct vhost_dev *dev)
+{
+	int i;
+
+	if (!dev->use_worker)
+		return;
+
+	for (i = 0; i < dev->nvqs; i++)
+		vhost_vq_detach_worker(dev->vqs[i]);
+
+	idr_for_each(&dev->worker_idr, vhost_workers_idr_iter, dev);
+}
+
 static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
 {
 	struct vhost_worker *worker;
 	struct task_struct *task;
+	int id;
 
 	worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
 	if (!worker)
 		return NULL;
 
-	dev->worker = worker;
 	worker->kcov_handle = kcov_common_handle();
 	init_llist_head(&worker->work_list);
+	refcount_set(&worker->refcount, 1);
 
 	/*
 	 * vhost used to use the kthread API which ignores all signals by
@@ -605,14 +638,88 @@ static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
 	worker->task = task;
 
 	user_worker_start(task, "vhost-%d", current->pid);
+
+	/* idr accesses are done under the vhost_dev mutex */
+	id = idr_alloc(&dev->worker_idr, worker, 0, INT_MAX, GFP_KERNEL);
+	if (id < 0)
+		goto stop_worker;
+	worker->id = id;
+
 	return worker;
 
+stop_worker:
+	vhost_worker_stop(worker);
 free_worker:
 	kfree(worker);
-	dev->worker = NULL;
 	return NULL;
 }
 
+/* Caller must have device mutex */
+static int vhost_vq_attach_worker(struct vhost_virtqueue *vq,
+				  struct vhost_vring_worker *info)
+{
+	struct vhost_dev *dev = vq->dev;
+	struct vhost_worker *worker;
+
+	if (!dev->use_worker)
+		return -EINVAL;
+
+	/* We don't support setting a worker on an active vq */
+	if (vq->private_data)
+		return -EBUSY;
+
+	worker = idr_find(&dev->worker_idr, info->worker_id);
+	if (!worker)
+		return -ENODEV;
+
+	refcount_inc(&worker->refcount);
+
+	vhost_vq_detach_worker(vq);
+	vq->worker = worker;
+	return 0;
+}
+
+/* Caller must have device mutex */
+static int vhost_new_worker(struct vhost_dev *dev,
+			    struct vhost_worker_state *info)
+{
+	struct vhost_worker *worker;
+
+	if (!dev->use_worker)
+		return -EINVAL;
+
+	worker = vhost_worker_create(dev);
+	if (!worker)
+		return -ENOMEM;
+
+	info->worker_id = worker->id;
+	return 0;
+}
+
+/* Caller must have device mutex */
+static int vhost_free_worker(struct vhost_dev *dev,
+			     struct vhost_worker_state *info)
+{
+	struct vhost_worker *worker;
+
+	if (!dev->use_worker)
+		return -EINVAL;
+
+	worker = idr_find(&dev->worker_idr, info->worker_id);
+	if (!worker)
+		return -ENODEV;
+
+	/*
+	 * We can free the worker if there are no attached vqs and we only
+	 * have the refcount from the initial creation.
+	 */
+	if (refcount_read(&worker->refcount) != 1)
+		return -EBUSY;
+
+	vhost_worker_put(dev, worker);
+	return 0;
+}
+
 /* Caller should have device mutex */
 long vhost_dev_set_owner(struct vhost_dev *dev)
 {
@@ -632,8 +739,10 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
 		if (!worker)
 			goto err_worker;
 
-		for (i = 0; i < dev->nvqs; i++)
+		for (i = 0; i < dev->nvqs; i++) {
+			refcount_inc(&worker->refcount);
 			dev->vqs[i]->worker = worker;
+		}
 	}
 
 	err = vhost_dev_alloc_iovecs(dev);
@@ -642,7 +751,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
 	return 0;
 
 err_iovecs:
-	vhost_worker_free(dev);
+	vhost_workers_free(dev);
 err_worker:
 	vhost_detach_mm(dev);
 err_mm:
@@ -734,7 +843,7 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 	dev->iotlb = NULL;
 	vhost_clear_msg(dev);
 	wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
-	vhost_worker_free(dev);
+	vhost_workers_free(dev);
 	vhost_detach_mm(dev);
 }
 EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
@@ -1604,6 +1713,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
 	struct eventfd_ctx *ctx = NULL;
 	u32 __user *idxp = argp;
 	struct vhost_virtqueue *vq;
+	struct vhost_vring_worker w;
 	struct vhost_vring_state s;
 	struct vhost_vring_file f;
 	u32 idx;
@@ -1711,7 +1821,16 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
 		if (copy_to_user(argp, &s, sizeof(s)))
 			r = -EFAULT;
 		break;
-	default:
+	case VHOST_ATTACH_VRING_WORKER:
+		if (copy_from_user(&w, argp, sizeof(w))) {
+			r = -EFAULT;
+			break;
+		}
+		r = vhost_vq_attach_worker(vq, &w);
+		if (!r && copy_to_user(argp, &w, sizeof(w)))
+			r = -EFAULT;
+		break;
+	default:
 		r = -ENOIOCTLCMD;
 	}
 
@@ -1764,6 +1883,7 @@ EXPORT_SYMBOL_GPL(vhost_init_device_iotlb);
 /* Caller must have device mutex */
 long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
 {
+	struct vhost_worker_state w;
 	struct eventfd_ctx *ctx;
 	u64 p;
 	long r;
@@ -1824,6 +1944,18 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
 		if (ctx)
 			eventfd_ctx_put(ctx);
 		break;
+	case VHOST_NEW_WORKER:
+		r = vhost_new_worker(d, &w);
+		if (!r && copy_to_user(argp, &w, sizeof(w)))
+			r = -EFAULT;
+		break;
+	case VHOST_FREE_WORKER:
+		if (copy_from_user(&w, argp, sizeof(w))) {
+			r = -EFAULT;
+			break;
+		}
+		r = vhost_free_worker(d, &w);
+		break;
 	default:
 		r = -ENOIOCTLCMD;
 		break;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 4423b2420c90..1738388fa02d 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -35,6 +35,8 @@ struct vhost_worker {
 	struct llist_head work_list;
 	u64 kcov_handle;
 	unsigned long flags;
+	refcount_t refcount;
+	int id;
 };
 
 /* Poll a file (eventfd or socket) */
@@ -160,7 +162,6 @@ struct vhost_dev {
 	struct vhost_virtqueue **vqs;
 	int nvqs;
 	struct eventfd_ctx *log_ctx;
-	struct vhost_worker *worker;
 	struct vhost_iotlb *umem;
 	struct vhost_iotlb *iotlb;
 	spinlock_t iotlb_lock;
@@ -170,6 +171,7 @@ struct vhost_dev {
 	int iov_limit;
 	int weight;
 	int byte_weight;
+	struct idr worker_idr;
 	bool use_worker;
 	int (*msg_handler)(struct vhost_dev *dev,
 			   struct vhost_iotlb_msg *msg);
diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index c998860d7bbc..117ea92b3925 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -45,6 +45,23 @@
 #define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64)
 /* Specify an eventfd file descriptor to signal on log write. */
 #define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int)
+/* By default, a device gets one vhost_worker that its virtqueues share. This
+ * command allows the owner of the device to create an additional vhost_worker
+ * for the device. It can later be bound to 1 or more of its virtqueues using
+ * the VHOST_ATTACH_VRING_WORKER command.
+ *
+ * This must be called after VHOST_SET_OWNER and the caller must be the owner
+ * of the device. The new thread will inherit the caller's cgroups and
+ * namespaces, and will share the caller's memory space. The new thread will
+ * also be counted against the caller's RLIMIT_NPROC value.
+ */
+#define VHOST_NEW_WORKER _IOW(VHOST_VIRTIO, 0x8, struct vhost_worker_state)
+/* Free a worker created with VHOST_NEW_WORKER if it's not attached to any
+ * virtqueue. If userspace is not able to call this for workers it has
+ * created, the kernel will free all the device's workers when the device is
+ * closed and the last reference to the device has been released.
+ */
+#define VHOST_FREE_WORKER _IOR(VHOST_VIRTIO, 0x9, struct vhost_worker_state)
 
 /* Ring setup. */
 /* Set number of descriptors in ring. This parameter can not
@@ -70,6 +87,11 @@
 #define VHOST_VRING_BIG_ENDIAN 1
 #define VHOST_SET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_state)
 #define VHOST_GET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state)
+/* Attach a vhost_worker created with VHOST_NEW_WORKER to one of the device's
+ * virtqueues. This must be done before the virtqueue is active.
+ */
+#define VHOST_ATTACH_VRING_WORKER _IOR(VHOST_VIRTIO, 0x15,		\
+				       struct vhost_vring_worker)
 
 /* The following ioctls use eventfd file descriptors to signal and poll
  * for events. */
diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h
index f7f6a3a28977..d38a023e494f 100644
--- a/include/uapi/linux/vhost_types.h
+++ b/include/uapi/linux/vhost_types.h
@@ -47,6 +47,21 @@ struct vhost_vring_addr {
 	__u64 log_guest_addr;
 };
 
+struct vhost_worker_state {
+	/*
+	 * For VHOST_NEW_WORKER the kernel will return the new vhost_worker id.
+	 * For VHOST_FREE_WORKER this must be set to the id of the vhost_worker
+	 * to free.
+	 */
+	int worker_id;
+};
+
+struct vhost_vring_worker {
+	unsigned int index;
+	/* The id of the vhost_worker returned from VHOST_NEW_WORKER */
+	int worker_id;
+};
+
 /* no alignment requirement */
 struct vhost_iotlb_msg {
 	__u64 iova;
-- 
2.25.1