This driver accelerates the host side of virtio-blk. Like the other
vhost drivers, it moves the virtqueue data path into the host kernel:
the VMM configures the device and its vrings over ioctls on
/dev/vhost-blk, hands over a backend file descriptor with
VHOST_BLK_SET_BACKEND, and the driver then services guest block
requests against that backend with direct asynchronous reads and
writes.
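
For illustration, here is a minimal sketch of the intended userspace
setup sequence (VMM side). This is an assumption about usage, not part
of the patch: build_mem_table() and setup_vring() are hypothetical
helpers, and all error handling is omitted.

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/vhost.h>

    /* mirrors the definition introduced in drivers/vhost/blk.c below */
    #define VHOST_BLK_SET_BACKEND _IOW(VHOST_VIRTIO, 0x50, int)

    /* hypothetical helpers, not part of this patch */
    extern struct vhost_memory *build_mem_table(void);
    extern void setup_vring(int vhost_fd, int queue);

    static int vhost_blk_setup(const char *disk_path)
    {
            int vhost = open("/dev/vhost-blk", O_RDWR);
            int disk = open(disk_path, O_RDWR | O_DIRECT);
            struct vhost_memory *mem = build_mem_table();
            uint64_t features;

            ioctl(vhost, VHOST_SET_OWNER, 0);
            ioctl(vhost, VHOST_GET_FEATURES, &features);
            /* mask to what the VMM implements, then acknowledge */
            ioctl(vhost, VHOST_SET_FEATURES, &features);
            ioctl(vhost, VHOST_SET_MEM_TABLE, mem);
            /* VHOST_SET_VRING_NUM/ADDR/BASE/KICK/CALL for each queue */
            setup_vring(vhost, 0);
            ioctl(vhost, VHOST_BLK_SET_BACKEND, &disk);
            return vhost;
    }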

Signed-off-by: Vitaly Mayatskikh <v.mayatskih@xxxxxxxxx>
---
 drivers/vhost/Kconfig  |  13 ++
 drivers/vhost/Makefile |   3 +
 drivers/vhost/blk.c    | 510 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 526 insertions(+)
 create mode 100644 drivers/vhost/blk.c

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index b580885243f7..c4980d6af0ea 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -53,3 +53,16 @@ config VHOST_CROSS_ENDIAN_LEGACY
 	  adds some overhead, it is disabled by default.
 
 	  If unsure, say "N".
+
+config VHOST_BLK
+	tristate "Host kernel accelerator for virtio blk (EXPERIMENTAL)"
+	depends on BLOCK && EVENTFD
+	select VHOST
+	default n
+	help
+	  This kernel module can be loaded in the host kernel to accelerate
+	  guest block I/O with virtio_blk. Not to be confused with the
+	  virtio_blk module itself, which must be loaded in the guest kernel.
+
+	  To compile this driver as a module, choose M here: the module will
+	  be called vhost_blk.
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 6c6df24f770c..c8be36cd9214 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -8,6 +8,9 @@ vhost_scsi-y := scsi.o
 obj-$(CONFIG_VHOST_VSOCK) += vhost_vsock.o
 vhost_vsock-y := vsock.o
 
+obj-$(CONFIG_VHOST_BLK) += vhost_blk.o
+vhost_blk-y := blk.o
+
 obj-$(CONFIG_VHOST_RING) += vringh.o
 obj-$(CONFIG_VHOST) += vhost.o
diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c
new file mode 100644
index 000000000000..aefb9a61fa0f
--- /dev/null
+++ b/drivers/vhost/blk.c
@@ -0,0 +1,510 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018 IBM Corporation
+ * Author: Vitaly Mayatskikh <v.mayatskih@xxxxxxxxx>
+ *
+ * virtio-blk server in host kernel.
+ */
+
+#include <linux/module.h>
+#include <linux/miscdevice.h>
+#include <linux/virtio_blk.h>
+#include <linux/vhost.h>
+#include <linux/fs.h>
+#include "vhost.h"
+
+enum {
+	VHOST_BLK_FEATURES =
+		VHOST_FEATURES |
+		(1ULL << VIRTIO_F_IOMMU_PLATFORM) |
+		(1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
+		(1ULL << VIRTIO_RING_F_EVENT_IDX) |
+		(1ULL << VIRTIO_BLK_F_MQ)
+};
+
+#define VHOST_BLK_SET_BACKEND _IOW(VHOST_VIRTIO, 0x50, int)
+
+enum {
+	VHOST_BLK_VQ_MAX = 16,
+	VHOST_BLK_VQ_MAX_REQS = 128,
+};
+
+struct vhost_blk_req {
+	struct llist_node list;
+	int index;
+	struct vhost_blk_queue *q;
+	struct virtio_blk_outhdr hdr;
+	struct iovec *out_iov;
+	struct iovec *in_iov;
+	u8 out_num;
+	u8 in_num;
+	long len;
+	struct kiocb iocb;
+	struct iov_iter i;
+	int res;
+	void __user *status;
+};
+
+struct vhost_blk_queue {
+	int index;
+	struct vhost_blk *blk;
+	struct vhost_virtqueue vq;
+	struct vhost_work w;
+	struct llist_head wl;
+	struct vhost_blk_req req[VHOST_BLK_VQ_MAX_REQS];
+};
+
+struct vhost_blk {
+	struct vhost_dev dev;
+	struct file *backend;
+	int num_queues;
+	struct vhost_virtqueue *vqs[VHOST_BLK_VQ_MAX];
+	struct vhost_blk_queue queue[VHOST_BLK_VQ_MAX];
+};
+
+static void vhost_blk_flush(struct vhost_blk *blk)
+{
+	int i;
+
+	for (i = 0; i < blk->num_queues; i++)
+		vhost_poll_flush(&blk->queue[i].vq.poll);
+}
+
+static void vhost_blk_stop(struct vhost_blk *blk)
+{
+	struct vhost_virtqueue *vq;
+	int i;
+
+	for (i = 0; i < blk->num_queues; i++) {
+		vq = &blk->queue[i].vq;
+		mutex_lock(&vq->mutex);
+		rcu_assign_pointer(vq->private_data, NULL);
+		mutex_unlock(&vq->mutex);
+	}
+}
+
+static int vhost_blk_req_done(struct vhost_blk_req *req, unsigned char status)
+{
+	int ret;
+	int len = req->len;
+
+	pr_debug("%s vq[%d] req->index %d status %d len %d\n", __func__,
+		 req->q->index, req->index, status, len);
+	ret = put_user(status, (unsigned char __user *)req->status);
+
+	WARN(ret, "%s: vq[%d] req->index %d failed to write status\n",
+	     __func__, req->q->index, req->index);
+
+	vhost_add_used(&req->q->vq, req->index, len);
+
+	return ret;
+}
+
+static void vhost_blk_io_done_work(struct vhost_work *w)
+{
+	struct vhost_blk_queue *q = container_of(w, struct vhost_blk_queue, w);
+	struct llist_node *node;
+	struct vhost_blk_req *req, *tmp;
+
+	node = llist_del_all(&q->wl);
+	llist_for_each_entry_safe(req, tmp, node, list) {
+		vhost_blk_req_done(req, req->res);
+	}
+	vhost_signal(&q->blk->dev, &q->vq);
+}
+
+static void vhost_blk_iocb_complete(struct kiocb *iocb, long ret, long ret2)
+{
+	struct vhost_blk_req *req = container_of(iocb, struct vhost_blk_req,
+						 iocb);
+
+	pr_debug("%s vq[%d] req->index %d ret %ld ret2 %ld\n", __func__,
+		 req->q->index, req->index, ret, ret2);
+
+	req->res = (ret == req->len) ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
+	llist_add(&req->list, &req->q->wl);
+	vhost_vq_work_queue(&req->q->vq, &req->q->w);
+}
+
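+/*
+ * A request arrives as a single descriptor chain: a virtio_blk_outhdr
+ * in the first out iovec, the data buffers in the remaining out iovecs
+ * (writes) or the leading in iovecs (reads), and a one-byte status that
+ * the device writes into the last in iovec.  I/O is submitted to the
+ * backend as direct asynchronous reads/writes; completions come back
+ * through vhost_blk_iocb_complete() above.
+ */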
+static int vhost_blk_req_handle(struct vhost_blk_req *req)
+{
+	struct vhost_blk *blk = req->q->blk;
+	struct vhost_virtqueue *vq = &req->q->vq;
+	int type = le32_to_cpu(req->hdr.type);
+	int ret;
+	u8 status;
+
+	if ((type == VIRTIO_BLK_T_IN) || (type == VIRTIO_BLK_T_OUT)) {
+		bool write = (type == VIRTIO_BLK_T_OUT);
+		int nr_seg = (write ? req->out_num : req->in_num) - 1;
+		u64 sector = le64_to_cpu(req->hdr.sector);
+		ssize_t len;
+
+		if (!blk->backend) {
+			vq_err(vq, "blk %p no backend!\n", blk);
+			ret = -EINVAL;
+			goto out_err;
+		}
+
+		len = iov_length(write ? req->out_iov : req->in_iov, nr_seg);
+		pr_debug("%s: [pid:%d %s] %s sector %llu, len %zd\n",
+			 __func__, current->pid, current->comm,
+			 write ? "WRITE" : "READ", sector, len);
+
+		req->len = len;
+		iov_iter_init(&req->i, (write ? WRITE : READ),
+			      write ? req->out_iov : req->in_iov,
+			      nr_seg, len);
+
+		req->iocb.ki_pos = sector << 9;
+		req->iocb.ki_filp = blk->backend;
+		req->iocb.ki_complete = vhost_blk_iocb_complete;
+		req->iocb.ki_flags = IOCB_DIRECT;
+
+		if (write)
+			ret = call_write_iter(blk->backend, &req->iocb,
+					      &req->i);
+		else
+			ret = call_read_iter(blk->backend, &req->iocb,
+					     &req->i);
+
+		if (ret != -EIOCBQUEUED)
+			vhost_blk_iocb_complete(&req->iocb, ret, 0);
+
+		ret = 0;
+		goto out;
+	}
+
+	if (type == VIRTIO_BLK_T_GET_ID) {
+		char s[] = "vhost_blk";
+		size_t len = min_t(size_t, req->in_iov[0].iov_len,
+				   strlen(s));
+
+		ret = copy_to_user(req->in_iov[0].iov_base, s, len);
+		status = ret ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
+		if (put_user(status, (unsigned char __user *)req->status)) {
+			ret = -EFAULT;
+			goto out_err;
+		}
+		vhost_add_used_and_signal(&blk->dev, vq, req->index, 1);
+		ret = 0;
+		goto out;
+	}
+
+	pr_warn("Unsupported request type %d\n", type);
+	ret = -EINVAL;
+out_err:
+	vhost_discard_vq_desc(vq, 1);
+out:
+	return ret;
+}
+
+static void vhost_blk_handle_guest_kick(struct vhost_work *work)
+{
+	struct vhost_virtqueue *vq;
+	struct vhost_blk_queue *q;
+	struct vhost_blk *blk;
+	struct vhost_blk_req *req;
+	int in, out;
+	int head;
+
+	vq = container_of(work, struct vhost_virtqueue, poll.work);
+	q = container_of(vq, struct vhost_blk_queue, vq);
+	blk = container_of(vq->dev, struct vhost_blk, dev);
+
+	vhost_disable_notify(&blk->dev, vq);
+	for (;;) {
+		in = out = -1;
+
+		head = vhost_get_vq_desc(vq, vq->iov,
+					 ARRAY_SIZE(vq->iov),
+					 &out, &in, NULL, NULL);
+
+		if (head < 0)
+			break;
+
+		if (head == vq->num) {
+			if (vhost_enable_notify(&blk->dev, vq)) {
+				vhost_disable_notify(&blk->dev, vq);
+				continue;
+			}
+			break;
+		}
+
+		req = &q->req[head];
+		req->index = head;
+		req->out_num = out;
+		req->in_num = in;
+		req->out_iov = &vq->iov[1];	/* hdr is assumed to be iov[0] */
+		req->in_iov = &vq->iov[out];
+		req->status = vq->iov[out + in - 1].iov_base;
+
+		if (copy_from_user(&req->hdr, vq->iov[0].iov_base,
+				   sizeof(req->hdr))) {
+			vq_err(vq, "Failed to get block header!\n");
+			vhost_discard_vq_desc(vq, 1);
+			continue;
+		}
+		if (vhost_blk_req_handle(req) < 0)
+			break;
+	}
+}
+
+static int vhost_blk_open(struct inode *inode, struct file *file)
+{
+	struct vhost_blk *blk;
+	struct vhost_blk_queue *q;
+	int i, j;
+
+	blk = kvzalloc(sizeof(*blk), GFP_KERNEL);
+	if (!blk)
+		return -ENOMEM;
+
+	for (i = 0; i < VHOST_BLK_VQ_MAX; i++) {
+		q = &blk->queue[i];
+		q->index = i;
+		q->blk = blk;
+		q->vq.handle_kick = vhost_blk_handle_guest_kick;
+		vhost_work_init(&q->w, vhost_blk_io_done_work);
+		blk->vqs[i] = &q->vq;
+		for (j = 0; j < VHOST_BLK_VQ_MAX_REQS; j++) {
+			q->req[j].index = j;
+			q->req[j].q = q;
+		}
+	}
+	vhost_dev_init(&blk->dev, blk->vqs, VHOST_BLK_VQ_MAX);
+	file->private_data = blk;
+
+	return 0;
+}
+
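+/*
+ * Called when the last reference to the control fd is dropped: unhook
+ * the backend from every vq, flush outstanding work so that no late
+ * completion can touch freed memory, then release the backend file.
+ */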
+static int vhost_blk_release(struct inode *inode, struct file *f)
+{
+	struct vhost_blk *blk = f->private_data;
+
+	vhost_blk_stop(blk);
+	mutex_lock(&blk->dev.mutex);
+	vhost_blk_flush(blk);
+	vhost_dev_stop(&blk->dev);
+	vhost_dev_cleanup(&blk->dev);
+	vhost_blk_flush(blk);
+
+	if (blk->backend) {
+		fput(blk->backend);
+		blk->backend = NULL;
+	}
+
+	mutex_unlock(&blk->dev.mutex);
+	kvfree(blk);
+
+	return 0;
+}
+
+static int vhost_blk_set_features(struct vhost_blk *blk, u64 features)
+{
+	int i;
+	int ret = -EFAULT;
+
+	mutex_lock(&blk->dev.mutex);
+	if ((features & (1ULL << VHOST_F_LOG_ALL)) &&
+	    !vhost_log_access_ok(&blk->dev))
+		goto out_unlock;
+
+	if ((features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) {
+		if (vhost_init_device_iotlb(&blk->dev, true))
+			goto out_unlock;
+	}
+
+	for (i = 0; i < VHOST_BLK_VQ_MAX; ++i) {
+		struct vhost_virtqueue *vq = blk->vqs[i];
+
+		mutex_lock(&vq->mutex);
+		vq->acked_features = features & VHOST_BLK_FEATURES;
+		mutex_unlock(&vq->mutex);
+	}
+	ret = 0;
+out_unlock:
+	mutex_unlock(&blk->dev.mutex);
+
+	return ret;
+}
+
+static long vhost_blk_reset_owner(struct vhost_blk *blk)
+{
+	long err;
+	struct vhost_umem *umem;
+
+	mutex_lock(&blk->dev.mutex);
+	err = vhost_dev_check_owner(&blk->dev);
+	if (err)
+		goto done;
+	umem = vhost_dev_reset_owner_prepare();
+	if (!umem) {
+		err = -ENOMEM;
+		goto done;
+	}
+	vhost_blk_stop(blk);
+	vhost_blk_flush(blk);
+	vhost_dev_reset_owner(&blk->dev, umem);
+done:
+	mutex_unlock(&blk->dev.mutex);
+	return err;
+}
+
+static long vhost_blk_set_backend(struct vhost_blk *blk, int fd)
+{
+	struct file *backend;
+	int ret, i;
+	struct vhost_virtqueue *vq;
+
+	mutex_lock(&blk->dev.mutex);
+	ret = vhost_dev_check_owner(&blk->dev);
+	if (ret)
+		goto out_dev;
+
+	backend = fget(fd);
+	if (!backend) {
+		/* fget() returns NULL on failure, not an ERR_PTR */
+		ret = -EBADF;
+		goto out_dev;
+	}
+
+	if (backend == blk->backend) {
+		/* already set: just drop the reference taken by fget() */
+		fput(backend);
+		ret = 0;
+		goto out_dev;
+	}
+
+	if (blk->backend)
+		fput(blk->backend);
+	blk->backend = backend;
+	for (i = 0; i < blk->num_queues; i++) {
+		vq = &blk->queue[i].vq;
+		mutex_lock(&vq->mutex);
+		if (!vhost_vq_access_ok(vq)) {
+			mutex_unlock(&vq->mutex);
+			ret = -EFAULT;
+			goto out_file;
+		}
+		rcu_assign_pointer(vq->private_data, backend);
+		ret = vhost_vq_init_access(vq);
+		mutex_unlock(&vq->mutex);
+		if (ret) {
+			pr_err("vhost_vq_init_access failed: %d\n", ret);
+			goto out_file;
+		}
+	}
+	ret = 0;
+	goto out_dev;
+out_file:
+	fput(backend);
+	blk->backend = NULL;
+out_dev:
+	mutex_unlock(&blk->dev.mutex);
+	vhost_blk_flush(blk);
+	return ret;
+}
+
+static long vhost_blk_pass_ioctl(struct vhost_blk *blk, unsigned int ioctl,
+				 void __user *argp)
+{
+	long ret;
+
+	mutex_lock(&blk->dev.mutex);
+	ret = vhost_dev_ioctl(&blk->dev, ioctl, argp);
+	if (ret == -ENOIOCTLCMD)
+		ret = vhost_vring_ioctl(&blk->dev, ioctl, argp);
+	else
+		vhost_blk_flush(blk);
+	mutex_unlock(&blk->dev.mutex);
+	return ret;
+}
+
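+/*
+ * Control plane: handle the ioctls that need blk-specific behaviour
+ * (features, backend fd, queue accounting on VHOST_SET_VRING_NUM) and
+ * pass everything else through to the generic vhost handlers.
+ */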
+static long vhost_blk_ioctl(struct file *f, unsigned int ioctl,
+			    unsigned long arg)
+{
+	struct vhost_blk *blk = f->private_data;
+	void __user *argp = (void __user *)arg;
+	int fd;
+	u64 __user *featurep = argp;
+	u64 features;
+	long ret;
+	struct vhost_vring_state s;
+
+	switch (ioctl) {
+	case VHOST_SET_MEM_TABLE:
+		vhost_blk_stop(blk);
+		ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
+		break;
+	case VHOST_SET_VRING_NUM:
+		if (copy_from_user(&s, argp, sizeof(s)))
+			return -EFAULT;
+		ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
+		if (!ret)
+			/* track the highest queue index configured */
+			blk->num_queues = max(blk->num_queues,
+					      (int)s.index + 1);
+		break;
+	case VHOST_BLK_SET_BACKEND:
+		if (copy_from_user(&fd, argp, sizeof(fd)))
+			return -EFAULT;
+		ret = vhost_blk_set_backend(blk, fd);
+		break;
+	case VHOST_GET_FEATURES:
+		features = VHOST_BLK_FEATURES;
+		if (copy_to_user(featurep, &features, sizeof(features)))
+			return -EFAULT;
+		ret = 0;
+		break;
+	case VHOST_SET_FEATURES:
+		if (copy_from_user(&features, featurep, sizeof(features)))
+			return -EFAULT;
+		if (features & ~VHOST_BLK_FEATURES)
+			return -EOPNOTSUPP;
+		ret = vhost_blk_set_features(blk, features);
+		break;
+	case VHOST_RESET_OWNER:
+		ret = vhost_blk_reset_owner(blk);
+		break;
+	default:
+		ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
+		break;
+	}
+	return ret;
+}
+
+static const struct file_operations vhost_blk_fops = {
+	.owner          = THIS_MODULE,
+	.open           = vhost_blk_open,
+	.release        = vhost_blk_release,
+	.llseek         = noop_llseek,
+	.unlocked_ioctl = vhost_blk_ioctl,
+};
+
+static struct miscdevice vhost_blk_misc = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name  = "vhost-blk",
+	.fops  = &vhost_blk_fops,
+};
+
+static int vhost_blk_init(void)
+{
+	return misc_register(&vhost_blk_misc);
+}
+module_init(vhost_blk_init);
+
+static void vhost_blk_exit(void)
+{
+	misc_deregister(&vhost_blk_misc);
+}
+module_exit(vhost_blk_exit);
+
+MODULE_VERSION("1.0");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Vitaly Mayatskikh");
+MODULE_DESCRIPTION("Host kernel accelerator for virtio blk");
+MODULE_ALIAS("devname:vhost-blk");
-- 
2.17.1