On Fri, Nov 02, 2018 at 06:21:23PM +0000, Vitaly Mayatskikh wrote:
> This driver accelerates host side of virtio-blk.
> 
> Signed-off-by: Vitaly Mayatskikh <v.mayatskih@xxxxxxxxx>
> ---
>  drivers/vhost/Kconfig  |  13 ++
>  drivers/vhost/Makefile |   3 +
>  drivers/vhost/blk.c    | 510 +++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 526 insertions(+)
>  create mode 100644 drivers/vhost/blk.c
> 
> diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
> index b580885243f7..c4980d6af0ea 100644
> --- a/drivers/vhost/Kconfig
> +++ b/drivers/vhost/Kconfig
> @@ -53,3 +53,16 @@ config VHOST_CROSS_ENDIAN_LEGACY
>  	  adds some overhead, it is disabled by default.
>  
>  	  If unsure, say "N".
> +
> +config VHOST_BLK
> +	tristate "Host kernel accelerator for virtio blk (EXPERIMENTAL)"
> +	depends on BLOCK && EVENTFD
> +	select VHOST
> +	default n
> +	help
> +	  This kernel module can be loaded in host kernel to accelerate
> +	  guest block with virtio_blk. Not to be confused with virtio_blk
> +	  module itself which needs to be loaded in guest kernel.
> +
> +	  To compile this driver as a module, choose M here: the module will
> +	  be called vhost_blk.
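
A general note: the userspace API deserves some documentation. From the
ioctls below, I gather setup is roughly the following (hypothetical
sketch, error handling omitted; note the VHOST_BLK_SET_BACKEND define is
private to the driver in this patch and would have to move to a uapi
header before userspace could actually use it):

	#include <fcntl.h>
	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/vhost.h>

	/* copied from this patch; not in a uapi header yet */
	#define VHOST_BLK_SET_BACKEND _IOW(VHOST_VIRTIO, 0x50, int)

	static int vhost_blk_setup(const char *disk)
	{
		int vhost_fd = open("/dev/vhost-blk", O_RDWR);
		/* the driver submits with IOCB_DIRECT, so O_DIRECT fits */
		int backend_fd = open(disk, O_RDWR | O_DIRECT);
		uint64_t features;

		ioctl(vhost_fd, VHOST_SET_OWNER);
		ioctl(vhost_fd, VHOST_GET_FEATURES, &features);
		ioctl(vhost_fd, VHOST_SET_FEATURES, &features);
		/* VHOST_SET_MEM_TABLE and per-queue vring setup
		 * (VHOST_SET_VRING_NUM/ADDR/BASE/KICK/CALL) go here */
		ioctl(vhost_fd, VHOST_BLK_SET_BACKEND, &backend_fd);
		return vhost_fd;
	}
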
> diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
> index 6c6df24f770c..c8be36cd9214 100644
> --- a/drivers/vhost/Makefile
> +++ b/drivers/vhost/Makefile
> @@ -8,6 +8,9 @@ vhost_scsi-y := scsi.o
>  obj-$(CONFIG_VHOST_VSOCK) += vhost_vsock.o
>  vhost_vsock-y := vsock.o
>  
> +obj-$(CONFIG_VHOST_BLK) += vhost_blk.o
> +vhost_blk-y := blk.o
> +
>  obj-$(CONFIG_VHOST_RING) += vringh.o
>  
>  obj-$(CONFIG_VHOST) += vhost.o
> diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c
> new file mode 100644
> index 000000000000..aefb9a61fa0f
> --- /dev/null
> +++ b/drivers/vhost/blk.c
> @@ -0,0 +1,510 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2018 IBM Corporation
> + * Author: Vitaly Mayatskikh <v.mayatskih@xxxxxxxxx>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.
> + *
> + * virtio-blk server in host kernel.
> + */
> +
> +#include <linux/module.h>
> +#include <linux/miscdevice.h>
> +#include <linux/virtio_blk.h>
> +#include <linux/vhost.h>
> +#include <linux/fs.h>
> +#include "vhost.h"
> +
> +enum {
> +	VHOST_BLK_FEATURES =
> +		VHOST_FEATURES |
> +		(1ULL << VIRTIO_F_IOMMU_PLATFORM) |
> +		(1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
> +		(1ULL << VIRTIO_RING_F_EVENT_IDX) |
> +		(1ULL << VIRTIO_BLK_F_MQ)
> +};
> +
> +#define VHOST_BLK_SET_BACKEND _IOW(VHOST_VIRTIO, 0x50, int)
> +
> +enum {
> +	VHOST_BLK_VQ_MAX = 16,
> +	VHOST_BLK_VQ_MAX_REQS = 128,
> +};
> +
> +struct vhost_blk_req {
> +	struct llist_node list;
> +	int index;
> +	struct vhost_blk_queue *q;
> +	struct virtio_blk_outhdr hdr;
> +	struct iovec *out_iov;
> +	struct iovec *in_iov;
> +	u8 out_num;
> +	u8 in_num;
> +	long len;
> +	struct kiocb iocb;
> +	struct iov_iter i;
> +	int res;
> +	void __user *status;
> +};
> +
> +struct vhost_blk_queue {
> +	int index;
> +	struct vhost_blk *blk;
> +	struct vhost_virtqueue vq;
> +	struct vhost_work w;
> +	struct llist_head wl;
> +	struct vhost_blk_req req[VHOST_BLK_VQ_MAX_REQS];
> +};
> +
> +struct vhost_blk {
> +	struct vhost_dev dev;
> +	struct file *backend;
> +	int num_queues;
> +	struct vhost_virtqueue *vqs[VHOST_BLK_VQ_MAX];
> +	struct vhost_blk_queue queue[VHOST_BLK_VQ_MAX];
> +};
> +
> +static void vhost_blk_flush(struct vhost_blk *blk)
> +{
> +	int i;
> +
> +	for (i = 0; i < blk->num_queues; i++)
> +		vhost_poll_flush(&blk->queue[i].vq.poll);
> +}
> +
> +
> +static void vhost_blk_stop(struct vhost_blk *blk)
> +{
> +	struct vhost_virtqueue *vq;
> +	int i;
> +
> +	for (i = 0; i < blk->num_queues; i++) {
> +		vq = &blk->queue[i].vq;
> +		mutex_lock(&vq->mutex);
> +		rcu_assign_pointer(vq->private_data, NULL);
> +		mutex_unlock(&vq->mutex);
> +	}
> +}
> +
> +static int vhost_blk_req_done(struct vhost_blk_req *req, unsigned char status)
> +{
> +	int ret;
> +	int len = req->len;
> +
> +	pr_debug("%s vq[%d] req->index %d status %d len %d\n", __func__,
> +		 req->q->index, req->index, status, len);
> +	ret = put_user(status, (unsigned char __user *)req->status);

I'd make it u8 and not unsigned char. Also, why don't you change the type
of req->status so you don't need a cast here?

> +
> +	WARN(ret, "%s: vq[%d] req->index %d failed to write status\n", __func__,
> +	     req->q->index, req->index);

Kernel warnings and debug messages that are guest-triggerable can fill up
the host's logs and eventually cause disk-full errors on the host. The
same applies elsewhere in this patch. You want tracepoints instead.

> +
> +	vhost_add_used(&req->q->vq, req->index, len);

vhost_add_used() can fail too; its return value needs to be checked.
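
Putting those three points together, I'd expect something roughly like
this (untested sketch; trace_vhost_blk_status_fault() is a stand-in for
whatever tracepoint you end up defining):

	static int vhost_blk_req_done(struct vhost_blk_req *req, u8 status)
	{
		/* req->status declared as u8 __user *, so no cast needed */
		int ret = put_user(status, req->status);

		if (ret)
			/* guest-triggerable, so trace instead of WARN */
			trace_vhost_blk_status_fault(req->q->index, req->index);

		/* vhost_add_used() can fail, e.g. on a bad used ring address */
		if (vhost_add_used(&req->q->vq, req->index, req->len))
			ret = -EFAULT;

		return ret;
	}
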
> +
> +	return ret;
> +}
> +
> +static void vhost_blk_io_done_work(struct vhost_work *w)
> +{
> +	struct vhost_blk_queue *q = container_of(w, struct vhost_blk_queue, w);
> +	struct llist_node *node;
> +	struct vhost_blk_req *req, *tmp;
> +
> +	node = llist_del_all(&q->wl);
> +	llist_for_each_entry_safe(req, tmp, node, list) {
> +		vhost_blk_req_done(req, req->res);
> +	}
> +	vhost_signal(&q->blk->dev, &q->vq);
> +}
> +
> +static void vhost_blk_iocb_complete(struct kiocb *iocb, long ret, long ret2)
> +{
> +	struct vhost_blk_req *req = container_of(iocb, struct vhost_blk_req,
> +						 iocb);
> +
> +	pr_debug("%s vq[%d] req->index %d ret %ld ret2 %ld\n", __func__,
> +		 req->q->index, req->index, ret, ret2);
> +
> +	req->res = (ret == req->len) ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
> +	llist_add(&req->list, &req->q->wl);
> +	vhost_vq_work_queue(&req->q->vq, &req->q->w);
> +}
> +
> +static int vhost_blk_req_handle(struct vhost_blk_req *req)
> +{
> +	struct vhost_blk *blk = req->q->blk;
> +	struct vhost_virtqueue *vq = &req->q->vq;
> +	int type = le32_to_cpu(req->hdr.type);
> +	int ret;
> +	u8 status;
> +
> +	if ((type == VIRTIO_BLK_T_IN) || (type == VIRTIO_BLK_T_OUT)) {
> +		bool write = (type == VIRTIO_BLK_T_OUT);
> +		int nr_seg = (write ? req->out_num : req->in_num) - 1;
> +		unsigned long sector = le64_to_cpu(req->hdr.sector);
> +		ssize_t len, rem_len;
> +
> +		if (!req->q->blk->backend) {
> +			vq_err(vq, "blk %p no backend!\n", req->q->blk);
> +			ret = -EINVAL;
> +			goto out_err;
> +		}
> +
> +		len = iov_length(&vq->iov[1], nr_seg);
> +		pr_debug("%s: [pid:%d %s] %s sector %lld, len %ld\n",
> +			 __func__, current->pid, current->comm,
> +			 write ? "WRITE" : "READ", req->hdr.sector, len);
> +
> +		req->len = len;
> +		rem_len = len;
> +		iov_iter_init(&req->i, (write ? WRITE : READ),
> +			      write ? &req->out_iov[0] : &req->in_iov[0],
> +			      nr_seg, len);
> +
> +		req->iocb.ki_pos = sector << 9;
> +		req->iocb.ki_filp = blk->backend;
> +		req->iocb.ki_complete = vhost_blk_iocb_complete;
> +		req->iocb.ki_flags = IOCB_DIRECT;
> +
> +		if (write)
> +			ret = call_write_iter(blk->backend, &req->iocb,
> +					      &req->i);
> +		else
> +			ret = call_read_iter(blk->backend, &req->iocb,
> +					     &req->i);
> +
> +		if (ret != -EIOCBQUEUED)
> +			vhost_blk_iocb_complete(&req->iocb, ret, 0);
> +
> +		ret = 0;
> +		goto out;
> +	}
> +
> +	if (type == VIRTIO_BLK_T_GET_ID) {
> +		char s[] = "vhost_blk";

Isn't this supposed to return the serial number?

> +		size_t len = min_t(size_t, req->in_iov[0].iov_len,
> +				   strlen(s));
> +
> +		ret = copy_to_user(req->in_iov[0].iov_base, s, len);

I don't think we should assume there's no scatter list here.

> +		status = ret ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
> +		if (put_user(status, (unsigned char __user *)req->status)) {
> +			ret = -EFAULT;
> +			goto out_err;
> +		}
> +		vhost_add_used_and_signal(&blk->dev, vq, req->index, 1);
> +		ret = 0;
> +		goto out;
> +	} else {
> +		pr_warn("Unsupported request type %d\n", type);
> +		vhost_discard_vq_desc(vq, 1);
> +		ret = -EINVAL;
> +		return ret;
> +	}
> +out_err:
> +	vhost_discard_vq_desc(vq, 1);
> +out:
> +	return ret;
> +}
> +
> +static void vhost_blk_handle_guest_kick(struct vhost_work *work)
> +{
> +	struct vhost_virtqueue *vq;
> +	struct vhost_blk_queue *q;
> +	struct vhost_blk *blk;
> +	struct vhost_blk_req *req;
> +	int in, out;
> +	int head;
> +
> +	vq = container_of(work, struct vhost_virtqueue, poll.work);
> +	q = container_of(vq, struct vhost_blk_queue, vq);
> +	blk = container_of(vq->dev, struct vhost_blk, dev);
> +
> +	vhost_disable_notify(&blk->dev, vq);
> +	for (;;) {
> +		in = out = -1;
> +
> +		head = vhost_get_vq_desc(vq, vq->iov,
> +					 ARRAY_SIZE(vq->iov),
> +					 &out, &in, NULL, NULL);
> +
> +		if (head < 0)
> +			break;
> +
> +		if (head == vq->num) {
> +			if (vhost_enable_notify(&blk->dev, vq)) {
> +				vhost_disable_notify(&blk->dev, vq);
> +				continue;
> +			}
> +			break;
> +		}
> +
> +		req = &q->req[head];
> +		req->index = head;
> +		req->out_num = out;
> +		req->in_num = in;
> +		req->out_iov = &vq->iov[1];
> +		req->in_iov = &vq->iov[out];
> +		req->status = vq->iov[out + in - 1].iov_base;

Shouldn't we validate that there's actually an 'in' descriptor before
using it to index the iov array?
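
Something along these lines before filling in req, for example (sketch;
whether to continue or break on the error path is up to you):

		/* A well-formed request carries at least the header in an
		 * out descriptor and the status byte in an in descriptor.
		 */
		if (out < 1 || in < 1) {
			vq_err(vq, "unexpected descriptor layout: out %d in %d\n",
			       out, in);
			vhost_discard_vq_desc(vq, 1);
			continue;
		}
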
> +
> +		if (copy_from_user(&req->hdr, vq->iov[0].iov_base,
> +				   sizeof(req->hdr))) {
> +			vq_err(vq, "Failed to get block header!\n");
> +			vhost_discard_vq_desc(vq, 1);
> +			continue;
> +		}

It's better to avoid assuming that the header fits in a single iov entry;
use an iterator.
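
Roughly what vhost/scsi does for its request header, i.e. something like
(sketch; the iter declaration goes at the top of the function):

		struct iov_iter iter;

		/* the header may span several of the out iov entries */
		iov_iter_init(&iter, WRITE, vq->iov, out, sizeof(req->hdr));
		if (copy_from_iter(&req->hdr, sizeof(req->hdr), &iter) !=
		    sizeof(req->hdr)) {
			vq_err(vq, "Failed to get block header!\n");
			vhost_discard_vq_desc(vq, 1);
			continue;
		}
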
> +		if (vhost_blk_req_handle(req) < 0)
> +			break;
> +	}
> +}
> +
> +static int vhost_blk_open(struct inode *inode, struct file *file)
> +{
> +	struct vhost_blk *blk;
> +	struct vhost_blk_queue *q;
> +	int i, j;
> +
> +	blk = kvzalloc(sizeof(*blk), GFP_KERNEL);
> +	if (!blk)
> +		return -ENOMEM;
> +
> +	for (i = 0; i < VHOST_BLK_VQ_MAX; i++) {
> +		q = &blk->queue[i];
> +		q->index = i;
> +		q->blk = blk;
> +		q->vq.handle_kick = vhost_blk_handle_guest_kick;
> +		vhost_work_init(&q->w, vhost_blk_io_done_work);
> +		blk->vqs[i] = &q->vq;
> +		for (j = 0; j < VHOST_BLK_VQ_MAX_REQS; j++) {
> +			q->req[j].index = j;
> +			q->req[j].q = q;
> +		}
> +	}
> +	vhost_dev_init(&blk->dev, (struct vhost_virtqueue **)&blk->vqs,
> +		       VHOST_BLK_VQ_MAX);
> +	file->private_data = blk;
> +
> +	return 0;
> +}
> +
> +static int vhost_blk_release(struct inode *inode, struct file *f)
> +{
> +	struct vhost_blk *blk = f->private_data;
> +
> +	vhost_blk_stop(blk);
> +	mutex_lock(&blk->dev.mutex);
> +	vhost_blk_flush(blk);
> +	vhost_dev_stop(&blk->dev);
> +	vhost_dev_cleanup(&blk->dev);
> +	vhost_blk_flush(blk);
> +
> +	if (blk->backend) {
> +		fput(blk->backend);
> +		blk->backend = NULL;
> +	}
> +
> +	mutex_unlock(&blk->dev.mutex);
> +	kvfree(blk);
> +
> +	return 0;
> +}
> +
> +static int vhost_blk_set_features(struct vhost_blk *blk, u64 features)
> +{
> +	int i;
> +	int ret = -EFAULT;
> +
> +	mutex_lock(&blk->dev.mutex);
> +	if ((features & (1 << VHOST_F_LOG_ALL)) &&
> +	    !vhost_log_access_ok(&blk->dev))
> +		goto out_unlock;
> +
> +	if ((features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) {
> +		if (vhost_init_device_iotlb(&blk->dev, true))
> +			goto out_unlock;
> +	}
> +
> +	for (i = 0; i < VHOST_BLK_VQ_MAX; ++i) {
> +		struct vhost_virtqueue *vq = blk->vqs[i];
> +
> +		mutex_lock(&vq->mutex);
> +		vq->acked_features = features & VHOST_BLK_FEATURES;
> +		mutex_unlock(&vq->mutex);
> +	}
> +	ret = 0;
> +out_unlock:
> +	mutex_unlock(&blk->dev.mutex);
> +
> +	return ret;
> +}
> +
> +static long vhost_blk_reset_owner(struct vhost_blk *blk)
> +{
> +	long err;
> +	struct vhost_umem *umem;
> +
> +	mutex_lock(&blk->dev.mutex);
> +	err = vhost_dev_check_owner(&blk->dev);
> +	if (err)
> +		goto done;
> +	umem = vhost_dev_reset_owner_prepare();
> +	if (!umem) {
> +		err = -ENOMEM;
> +		goto done;
> +	}
> +	vhost_blk_stop(blk);
> +	vhost_blk_flush(blk);
> +	vhost_dev_reset_owner(&blk->dev, umem);
> +done:
> +	mutex_unlock(&blk->dev.mutex);
> +	return err;
> +}
> +
> +static long vhost_blk_set_backend(struct vhost_blk *blk, int fd)
> +{
> +	struct file *backend;
> +	int ret, i;
> +	struct vhost_virtqueue *vq;
> +
> +	mutex_lock(&blk->dev.mutex);
> +	ret = vhost_dev_check_owner(&blk->dev);
> +	if (ret)
> +		goto out_dev;
> +
> +	backend = fget(fd);
> +	if (IS_ERR(backend)) {
> +		ret = PTR_ERR(backend);
> +		goto out_dev;
> +	}
> +
> +	if (backend == blk->backend) {
> +		ret = 0;
> +		goto out_file;
> +	}
> +
> +	if (blk->backend)
> +		fput(blk->backend);
> +	blk->backend = backend;
> +	for (i = 0; i < blk->num_queues; i++) {
> +		vq = &blk->queue[i].vq;
> +		if (!vhost_vq_access_ok(vq)) {
> +			ret = -EFAULT;
> +			goto out_file;
> +		}
> +		mutex_lock(&vq->mutex);
> +		rcu_assign_pointer(vq->private_data, backend);
> +		ret = vhost_vq_init_access(vq);
> +		mutex_unlock(&vq->mutex);
> +		if (ret) {
> +			pr_err("vhost_vq_init_access failed: %d\n", ret);
> +			goto out_file;
> +		}
> +
> +	}
> +	ret = 0;
> +	goto out_dev;
> +out_file:
> +	fput(backend);
> +	blk->backend = NULL;
> +out_dev:
> +	mutex_unlock(&blk->dev.mutex);
> +	vhost_blk_flush(blk);
> +	return ret;
> +}
> +
> +static long vhost_blk_pass_ioctl(struct vhost_blk *blk, unsigned int ioctl,
> +				 void __user *argp)
> +{
> +	long ret;
> +
> +	mutex_lock(&blk->dev.mutex);
> +	ret = vhost_dev_ioctl(&blk->dev, ioctl, argp);
> +	if (ret == -ENOIOCTLCMD)
> +		ret = vhost_vring_ioctl(&blk->dev, ioctl, argp);
> +	else
> +		vhost_blk_flush(blk);
> +	mutex_unlock(&blk->dev.mutex);
> +	return ret;
> +}
> +
> +static long vhost_blk_ioctl(struct file *f, unsigned int ioctl,
> +			    unsigned long arg)
> +{
> +	struct vhost_blk *blk = f->private_data;
> +	void __user *argp = (void __user *)arg;
> +	int fd;
> +	u64 __user *featurep = argp;
> +	u64 features;
> +	long ret;
> +	struct vhost_vring_state s;
> +
> +	switch (ioctl) {
> +	case VHOST_SET_MEM_TABLE:
> +		vhost_blk_stop(blk);
> +		ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
> +		break;
> +	case VHOST_SET_VRING_NUM:
> +		if (copy_from_user(&s, argp, sizeof(s)))
> +			return -EFAULT;
> +		ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
> +		if (!ret)
> +			blk->num_queues = s.index + 1;
> +		break;
> +	case VHOST_BLK_SET_BACKEND:
> +		if (copy_from_user(&fd, argp, sizeof(fd)))
> +			return -EFAULT;
> +		ret = vhost_blk_set_backend(blk, fd);
> +		break;
> +	case VHOST_GET_FEATURES:
> +		features = VHOST_BLK_FEATURES;
> +		if (copy_to_user(featurep, &features, sizeof(features)))
> +			return -EFAULT;
> +		ret = 0;
> +		break;
> +	case VHOST_SET_FEATURES:
> +		if (copy_from_user(&features, featurep, sizeof(features)))
> +			return -EFAULT;
> +		if (features & ~VHOST_BLK_FEATURES)
> +			return -EOPNOTSUPP;
> +		ret = vhost_blk_set_features(blk, features);
> +		break;
> +	case VHOST_RESET_OWNER:
> +		ret = vhost_blk_reset_owner(blk);
> +		break;
> +	default:
> +		ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
> +		break;
> +	}
> +	return ret;
> +}
> +
> +static const struct file_operations vhost_blk_fops = {
> +	.owner          = THIS_MODULE,
> +	.open           = vhost_blk_open,
> +	.release        = vhost_blk_release,
> +	.llseek		= noop_llseek,
> +	.unlocked_ioctl = vhost_blk_ioctl,
> +};
> +
> +static struct miscdevice vhost_blk_misc = {
> +	MISC_DYNAMIC_MINOR,
> +	"vhost-blk",
> +	&vhost_blk_fops,
> +};
> +
> +static int vhost_blk_init(void)
> +{
> +	return misc_register(&vhost_blk_misc);
> +}
> +module_init(vhost_blk_init);
> +
> +static void vhost_blk_exit(void)
> +{
> +	misc_deregister(&vhost_blk_misc);
> +}
> +
> +module_exit(vhost_blk_exit);
> +
> +MODULE_VERSION("1.0");
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR("Vitaly Mayatskikh");
> +MODULE_DESCRIPTION("Host kernel accelerator for virtio blk");
> +MODULE_ALIAS("devname:vhost-blk");
> -- 
> 2.17.1