On Fri, Nov 02, 2018 at 06:21:23PM +0000, Vitaly Mayatskikh wrote:
> This driver accelerates host side of virtio-blk.
> 
> Signed-off-by: Vitaly Mayatskikh <v.mayatskih@xxxxxxxxx>
> ---
>  drivers/vhost/Kconfig  |  13 ++
>  drivers/vhost/Makefile |   3 +
>  drivers/vhost/blk.c    | 510 +++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 526 insertions(+)
>  create mode 100644 drivers/vhost/blk.c
> 
> diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
> index b580885243f7..c4980d6af0ea 100644
> --- a/drivers/vhost/Kconfig
> +++ b/drivers/vhost/Kconfig
> @@ -53,3 +53,16 @@ config VHOST_CROSS_ENDIAN_LEGACY
>  	  adds some overhead, it is disabled by default.
>  
>  	  If unsure, say "N".
> +
> +config VHOST_BLK
> +	tristate "Host kernel accelerator for virtio blk (EXPERIMENTAL)"
> +	depends on BLOCK && EVENTFD
> +	select VHOST
> +	default n
> +	help
> +	  This kernel module can be loaded in host kernel to accelerate
> +	  guest block with virtio_blk. Not to be confused with virtio_blk
> +	  module itself which needs to be loaded in guest kernel.
> +
> +	  To compile this driver as a module, choose M here: the module will
> +	  be called vhost_blk.
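
A general note: the userspace API deserves some documentation. From the
ioctls below, I gather setup is roughly the following (hypothetical
sketch, error handling omitted; note the VHOST_BLK_SET_BACKEND define is
private to the driver in this patch and would have to move to a uapi
header before userspace could actually use it):

	#include <fcntl.h>
	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/vhost.h>

	/* copied from this patch; not in a uapi header yet */
	#define VHOST_BLK_SET_BACKEND _IOW(VHOST_VIRTIO, 0x50, int)

	static int vhost_blk_setup(const char *disk)
	{
		int vhost_fd = open("/dev/vhost-blk", O_RDWR);
		/* the driver submits with IOCB_DIRECT, so O_DIRECT fits */
		int backend_fd = open(disk, O_RDWR | O_DIRECT);
		uint64_t features;

		ioctl(vhost_fd, VHOST_SET_OWNER);
		ioctl(vhost_fd, VHOST_GET_FEATURES, &features);
		ioctl(vhost_fd, VHOST_SET_FEATURES, &features);
		/* VHOST_SET_MEM_TABLE and per-queue vring setup
		 * (VHOST_SET_VRING_NUM/ADDR/BASE/KICK/CALL) go here */
		ioctl(vhost_fd, VHOST_BLK_SET_BACKEND, &backend_fd);
		return vhost_fd;
	}
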
> diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
> index 6c6df24f770c..c8be36cd9214 100644
> --- a/drivers/vhost/Makefile
> +++ b/drivers/vhost/Makefile
> @@ -8,6 +8,9 @@ vhost_scsi-y := scsi.o
>  obj-$(CONFIG_VHOST_VSOCK) += vhost_vsock.o
>  vhost_vsock-y := vsock.o
>  
> +obj-$(CONFIG_VHOST_BLK) += vhost_blk.o
> +vhost_blk-y := blk.o
> +
>  obj-$(CONFIG_VHOST_RING) += vringh.o
>  
>  obj-$(CONFIG_VHOST) += vhost.o
> diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c
> new file mode 100644
> index 000000000000..aefb9a61fa0f
> --- /dev/null
> +++ b/drivers/vhost/blk.c
> @@ -0,0 +1,510 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2018 IBM Corporation
> + * Author: Vitaly Mayatskikh <v.mayatskih@xxxxxxxxx>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.
> + *
> + * virtio-blk server in host kernel.
> + */
> +
> +#include <linux/module.h>
> +#include <linux/miscdevice.h>
> +#include <linux/virtio_blk.h>
> +#include <linux/vhost.h>
> +#include <linux/fs.h>
> +#include "vhost.h"
> +
> +enum {
> +	VHOST_BLK_FEATURES =
> +		VHOST_FEATURES |
> +		(1ULL << VIRTIO_F_IOMMU_PLATFORM) |
> +		(1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
> +		(1ULL << VIRTIO_RING_F_EVENT_IDX) |
> +		(1ULL << VIRTIO_BLK_F_MQ)
> +};
> +
> +#define VHOST_BLK_SET_BACKEND _IOW(VHOST_VIRTIO, 0x50, int)
> +
> +enum {
> +	VHOST_BLK_VQ_MAX = 16,
> +	VHOST_BLK_VQ_MAX_REQS = 128,
> +};
> +
> +struct vhost_blk_req {
> +	struct llist_node list;
> +	int index;
> +	struct vhost_blk_queue *q;
> +	struct virtio_blk_outhdr hdr;
> +	struct iovec *out_iov;
> +	struct iovec *in_iov;
> +	u8 out_num;
> +	u8 in_num;
> +	long len;
> +	struct kiocb iocb;
> +	struct iov_iter i;
> +	int res;
> +	void __user *status;
> +};
> +
> +struct vhost_blk_queue {
> +	int index;
> +	struct vhost_blk *blk;
> +	struct vhost_virtqueue vq;
> +	struct vhost_work w;
> +	struct llist_head wl;
> +	struct vhost_blk_req req[VHOST_BLK_VQ_MAX_REQS];
> +};
> +
> +struct vhost_blk {
> +	struct vhost_dev dev;
> +	struct file *backend;
> +	int num_queues;
> +	struct vhost_virtqueue *vqs[VHOST_BLK_VQ_MAX];
> +	struct vhost_blk_queue queue[VHOST_BLK_VQ_MAX];
> +};
> +
> +static void vhost_blk_flush(struct vhost_blk *blk)
> +{
> +	int i;
> +
> +	for (i = 0; i < blk->num_queues; i++)
> +		vhost_poll_flush(&blk->queue[i].vq.poll);
> +}
> +
> +
> +static void vhost_blk_stop(struct vhost_blk *blk)
> +{
> +	struct vhost_virtqueue *vq;
> +	int i;
> +
> +	for (i = 0; i < blk->num_queues; i++) {
> +		vq = &blk->queue[i].vq;
> +		mutex_lock(&vq->mutex);
> +		rcu_assign_pointer(vq->private_data, NULL);
> +		mutex_unlock(&vq->mutex);
> +	}
> +}
> +
> +static int vhost_blk_req_done(struct vhost_blk_req *req, unsigned char status)
> +{
> +	int ret;
> +	int len = req->len;
> +
> +	pr_debug("%s vq[%d] req->index %d status %d len %d\n", __func__,
> +		 req->q->index, req->index, status, len);
> +	ret = put_user(status, (unsigned char __user *)req->status);

I'd make it u8 and not unsigned char. Also, why don't you change the type
of req->status so you don't need a cast here?

> +
> +	WARN(ret, "%s: vq[%d] req->index %d failed to write status\n", __func__,
> +	     req->q->index, req->index);

Kernel warnings and debug messages that are guest-triggerable can fill up
the host's logs and eventually cause disk-full errors on the host. The
same applies elsewhere in this patch. You want tracepoints instead.

> +
> +	vhost_add_used(&req->q->vq, req->index, len);

vhost_add_used() can fail too; its return value needs to be checked.
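
Putting those three points together, I'd expect something roughly like
this (untested sketch; trace_vhost_blk_status_fault() is a stand-in for
whatever tracepoint you end up defining):

	static int vhost_blk_req_done(struct vhost_blk_req *req, u8 status)
	{
		/* req->status declared as u8 __user *, so no cast needed */
		int ret = put_user(status, req->status);

		if (ret)
			/* guest-triggerable, so trace instead of WARN */
			trace_vhost_blk_status_fault(req->q->index, req->index);

		/* vhost_add_used() can fail, e.g. on a bad used ring address */
		if (vhost_add_used(&req->q->vq, req->index, req->len))
			ret = -EFAULT;

		return ret;
	}
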
> +
> +	return ret;
> +}
> +
> +static void vhost_blk_io_done_work(struct vhost_work *w)
> +{
> +	struct vhost_blk_queue *q = container_of(w, struct vhost_blk_queue, w);
> +	struct llist_node *node;
> +	struct vhost_blk_req *req, *tmp;
> +
> +	node = llist_del_all(&q->wl);
> +	llist_for_each_entry_safe(req, tmp, node, list) {
> +		vhost_blk_req_done(req, req->res);
> +	}
> +	vhost_signal(&q->blk->dev, &q->vq);
> +}
> +
> +static void vhost_blk_iocb_complete(struct kiocb *iocb, long ret, long ret2)
> +{
> +	struct vhost_blk_req *req = container_of(iocb, struct vhost_blk_req,
> +						 iocb);
> +
> +	pr_debug("%s vq[%d] req->index %d ret %ld ret2 %ld\n", __func__,
> +		 req->q->index, req->index, ret, ret2);
> +
> +	req->res = (ret == req->len) ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
> +	llist_add(&req->list, &req->q->wl);
> +	vhost_vq_work_queue(&req->q->vq, &req->q->w);
> +}
> +
> +static int vhost_blk_req_handle(struct vhost_blk_req *req)
> +{
> +	struct vhost_blk *blk = req->q->blk;
> +	struct vhost_virtqueue *vq = &req->q->vq;
> +	int type = le32_to_cpu(req->hdr.type);
> +	int ret;
> +	u8 status;
> +
> +	if ((type == VIRTIO_BLK_T_IN) || (type == VIRTIO_BLK_T_OUT)) {
> +		bool write = (type == VIRTIO_BLK_T_OUT);
> +		int nr_seg = (write ? req->out_num : req->in_num) - 1;
> +		unsigned long sector = le64_to_cpu(req->hdr.sector);
> +		ssize_t len, rem_len;
> +
> +		if (!req->q->blk->backend) {
> +			vq_err(vq, "blk %p no backend!\n", req->q->blk);
> +			ret = -EINVAL;
> +			goto out_err;
> +		}
> +
> +		len = iov_length(&vq->iov[1], nr_seg);
> +		pr_debug("%s: [pid:%d %s] %s sector %lld, len %ld\n",
> +			 __func__, current->pid, current->comm,
> +			 write ? "WRITE" : "READ", req->hdr.sector, len);
> +
> +		req->len = len;
> +		rem_len = len;
> +		iov_iter_init(&req->i, (write ? WRITE : READ),
> +			      write ? &req->out_iov[0] : &req->in_iov[0],
> +			      nr_seg, len);
> +
> +		req->iocb.ki_pos = sector << 9;
> +		req->iocb.ki_filp = blk->backend;
> +		req->iocb.ki_complete = vhost_blk_iocb_complete;
> +		req->iocb.ki_flags = IOCB_DIRECT;
> +
> +		if (write)
> +			ret = call_write_iter(blk->backend, &req->iocb,
> +					      &req->i);
> +		else
> +			ret = call_read_iter(blk->backend, &req->iocb,
> +					     &req->i);
> +
> +		if (ret != -EIOCBQUEUED)
> +			vhost_blk_iocb_complete(&req->iocb, ret, 0);
> +
> +		ret = 0;
> +		goto out;
> +	}
> +
> +	if (type == VIRTIO_BLK_T_GET_ID) {
> +		char s[] = "vhost_blk";

Isn't this supposed to return the serial number?

> +		size_t len = min_t(size_t, req->in_iov[0].iov_len,
> +				   strlen(s));
> +
> +		ret = copy_to_user(req->in_iov[0].iov_base, s, len);

I don't think we should assume there's no scatter list here.

> +		status = ret ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
> +		if (put_user(status, (unsigned char __user *)req->status)) {
> +			ret = -EFAULT;
> +			goto out_err;
> +		}
> +		vhost_add_used_and_signal(&blk->dev, vq, req->index, 1);
> +		ret = 0;
> +		goto out;
> +	} else {
> +		pr_warn("Unsupported request type %d\n", type);
> +		vhost_discard_vq_desc(vq, 1);
> +		ret = -EINVAL;
> +		return ret;
> +	}
> +out_err:
> +	vhost_discard_vq_desc(vq, 1);
> +out:
> +	return ret;
> +}
> +
> +static void vhost_blk_handle_guest_kick(struct vhost_work *work)
> +{
> +	struct vhost_virtqueue *vq;
> +	struct vhost_blk_queue *q;
> +	struct vhost_blk *blk;
> +	struct vhost_blk_req *req;
> +	int in, out;
> +	int head;
> +
> +	vq = container_of(work, struct vhost_virtqueue, poll.work);
> +	q = container_of(vq, struct vhost_blk_queue, vq);
> +	blk = container_of(vq->dev, struct vhost_blk, dev);
> +
> +	vhost_disable_notify(&blk->dev, vq);
> +	for (;;) {
> +		in = out = -1;
> +
> +		head = vhost_get_vq_desc(vq, vq->iov,
> +					 ARRAY_SIZE(vq->iov),
> +					 &out, &in, NULL, NULL);
> +
> +		if (head < 0)
> +			break;
> +
> +		if (head == vq->num) {
> +			if (vhost_enable_notify(&blk->dev, vq)) {
> +				vhost_disable_notify(&blk->dev, vq);
> +				continue;
> +			}
> +			break;
> +		}
> +
> +		req = &q->req[head];
> +		req->index = head;
> +		req->out_num = out;
> +		req->in_num = in;
> +		req->out_iov = &vq->iov[1];
> +		req->in_iov = &vq->iov[out];
> +		req->status = vq->iov[out + in - 1].iov_base;

Shouldn't we validate that there's actually an 'in' descriptor before
using it to index the iov array?
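
Something along these lines before filling in req, for example (sketch;
whether to continue or break on the error path is up to you):

		/* A well-formed request carries at least the header in an
		 * out descriptor and the status byte in an in descriptor.
		 */
		if (out < 1 || in < 1) {
			vq_err(vq, "unexpected descriptor layout: out %d in %d\n",
			       out, in);
			vhost_discard_vq_desc(vq, 1);
			continue;
		}
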
> +
> +		if (copy_from_user(&req->hdr, vq->iov[0].iov_base,
> +				   sizeof(req->hdr))) {
> +			vq_err(vq, "Failed to get block header!\n");
> +			vhost_discard_vq_desc(vq, 1);
> +			continue;
> +		}

It's better to avoid assuming that the header fits in a single iov entry;
use an iterator.
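
Roughly what vhost/scsi does for its request header, i.e. something like
(sketch; the iter declaration goes at the top of the function):

		struct iov_iter iter;

		/* the header may span several of the out iov entries */
		iov_iter_init(&iter, WRITE, vq->iov, out, sizeof(req->hdr));
		if (copy_from_iter(&req->hdr, sizeof(req->hdr), &iter) !=
		    sizeof(req->hdr)) {
			vq_err(vq, "Failed to get block header!\n");
			vhost_discard_vq_desc(vq, 1);
			continue;
		}
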
> +		if (vhost_blk_req_handle(req) < 0)
> +			break;
> +	}
> +}
> +
> +static int vhost_blk_open(struct inode *inode, struct file *file)
> +{
> +	struct vhost_blk *blk;
> +	struct vhost_blk_queue *q;
> +	int i, j;
> +
> +	blk = kvzalloc(sizeof(*blk), GFP_KERNEL);
> +	if (!blk)
> +		return -ENOMEM;
> +
> +	for (i = 0; i < VHOST_BLK_VQ_MAX; i++) {
> +		q = &blk->queue[i];
> +		q->index = i;
> +		q->blk = blk;
> +		q->vq.handle_kick = vhost_blk_handle_guest_kick;
> +		vhost_work_init(&q->w, vhost_blk_io_done_work);
> +		blk->vqs[i] = &q->vq;
> +		for (j = 0; j < VHOST_BLK_VQ_MAX_REQS; j++) {
> +			q->req[j].index = j;
> +			q->req[j].q = q;
> +		}
> +	}
> +	vhost_dev_init(&blk->dev, (struct vhost_virtqueue **)&blk->vqs,
> +		       VHOST_BLK_VQ_MAX);
> +	file->private_data = blk;
> +
> +	return 0;
> +}
> +
> +static int vhost_blk_release(struct inode *inode, struct file *f)
> +{
> +	struct vhost_blk *blk = f->private_data;
> +
> +	vhost_blk_stop(blk);
> +	mutex_lock(&blk->dev.mutex);
> +	vhost_blk_flush(blk);
> +	vhost_dev_stop(&blk->dev);
> +	vhost_dev_cleanup(&blk->dev);
> +	vhost_blk_flush(blk);
> +
> +	if (blk->backend) {
> +		fput(blk->backend);
> +		blk->backend = NULL;
> +	}
> +
> +	mutex_unlock(&blk->dev.mutex);
> +	kvfree(blk);
> +
> +	return 0;
> +}
> +
> +static int vhost_blk_set_features(struct vhost_blk *blk, u64 features)
> +{
> +	int i;
> +	int ret = -EFAULT;
> +
> +	mutex_lock(&blk->dev.mutex);
> +	if ((features & (1 << VHOST_F_LOG_ALL)) &&
> +	    !vhost_log_access_ok(&blk->dev))
> +		goto out_unlock;
> +
> +	if ((features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) {
> +		if (vhost_init_device_iotlb(&blk->dev, true))
> +			goto out_unlock;
> +	}
> +
> +	for (i = 0; i < VHOST_BLK_VQ_MAX; ++i) {
> +		struct vhost_virtqueue *vq = blk->vqs[i];
> +
> +		mutex_lock(&vq->mutex);
> +		vq->acked_features = features & VHOST_BLK_FEATURES;
> +		mutex_unlock(&vq->mutex);
> +	}
> +	ret = 0;
> +out_unlock:
> +	mutex_unlock(&blk->dev.mutex);
> +
> +	return ret;
> +}
> +
> +static long vhost_blk_reset_owner(struct vhost_blk *blk)
> +{
> +	long err;
> +	struct vhost_umem *umem;
> +
> +	mutex_lock(&blk->dev.mutex);
> +	err = vhost_dev_check_owner(&blk->dev);
> +	if (err)
> +		goto done;
> +	umem = vhost_dev_reset_owner_prepare();
> +	if (!umem) {
> +		err = -ENOMEM;
> +		goto done;
> +	}
> +	vhost_blk_stop(blk);
> +	vhost_blk_flush(blk);
> +	vhost_dev_reset_owner(&blk->dev, umem);
> +done:
> +	mutex_unlock(&blk->dev.mutex);
> +	return err;
> +}
> +
> +static long vhost_blk_set_backend(struct vhost_blk *blk, int fd)
> +{
> +	struct file *backend;
> +	int ret, i;
> +	struct vhost_virtqueue *vq;
> +
> +	mutex_lock(&blk->dev.mutex);
> +	ret = vhost_dev_check_owner(&blk->dev);
> +	if (ret)
> +		goto out_dev;
> +
> +	backend = fget(fd);
> +	if (IS_ERR(backend)) {
> +		ret = PTR_ERR(backend);
> +		goto out_dev;
> +	}
> +
> +	if (backend == blk->backend) {
> +		ret = 0;
> +		goto out_file;
> +	}
> +
> +	if (blk->backend)
> +		fput(blk->backend);
> +	blk->backend = backend;
> +	for (i = 0; i < blk->num_queues; i++) {
> +		vq = &blk->queue[i].vq;
> +		if (!vhost_vq_access_ok(vq)) {
> +			ret = -EFAULT;
> +			goto out_file;
> +		}
> +		mutex_lock(&vq->mutex);
> +		rcu_assign_pointer(vq->private_data, backend);
> +		ret = vhost_vq_init_access(vq);
> +		mutex_unlock(&vq->mutex);
> +		if (ret) {
> +			pr_err("vhost_vq_init_access failed: %d\n", ret);
> +			goto out_file;
> +		}
> +
> +	}
> +	ret = 0;
> +	goto out_dev;
> +out_file:
> +	fput(backend);
> +	blk->backend = NULL;
> +out_dev:
> +	mutex_unlock(&blk->dev.mutex);
> +	vhost_blk_flush(blk);
> +	return ret;
> +}
> +
> +static long vhost_blk_pass_ioctl(struct vhost_blk *blk, unsigned int ioctl,
> +				 void __user *argp)
> +{
> +	long ret;
> +
> +	mutex_lock(&blk->dev.mutex);
> +	ret = vhost_dev_ioctl(&blk->dev, ioctl, argp);
> +	if (ret == -ENOIOCTLCMD)
> +		ret = vhost_vring_ioctl(&blk->dev, ioctl, argp);
> +	else
> +		vhost_blk_flush(blk);
> +	mutex_unlock(&blk->dev.mutex);
> +	return ret;
> +}
> +
> +static long vhost_blk_ioctl(struct file *f, unsigned int ioctl,
> +			    unsigned long arg)
> +{
> +	struct vhost_blk *blk = f->private_data;
> +	void __user *argp = (void __user *)arg;
> +	int fd;
> +	u64 __user *featurep = argp;
> +	u64 features;
> +	long ret;
> +	struct vhost_vring_state s;
> +
> +	switch (ioctl) {
> +	case VHOST_SET_MEM_TABLE:
> +		vhost_blk_stop(blk);
> +		ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
> +		break;
> +	case VHOST_SET_VRING_NUM:
> +		if (copy_from_user(&s, argp, sizeof(s)))
> +			return -EFAULT;
> +		ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
> +		if (!ret)
> +			blk->num_queues = s.index + 1;
> +		break;
> +	case VHOST_BLK_SET_BACKEND:
> +		if (copy_from_user(&fd, argp, sizeof(fd)))
> +			return -EFAULT;
> +		ret = vhost_blk_set_backend(blk, fd);
> +		break;
> +	case VHOST_GET_FEATURES:
> +		features = VHOST_BLK_FEATURES;
> +		if (copy_to_user(featurep, &features, sizeof(features)))
> +			return -EFAULT;
> +		ret = 0;
> +		break;
> +	case VHOST_SET_FEATURES:
> +		if (copy_from_user(&features, featurep, sizeof(features)))
> +			return -EFAULT;
> +		if (features & ~VHOST_BLK_FEATURES)
> +			return -EOPNOTSUPP;
> +		ret = vhost_blk_set_features(blk, features);
> +		break;
> +	case VHOST_RESET_OWNER:
> +		ret = vhost_blk_reset_owner(blk);
> +		break;
> +	default:
> +		ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
> +		break;
> +	}
> +	return ret;
> +}
> +
> +static const struct file_operations vhost_blk_fops = {
> +	.owner          = THIS_MODULE,
> +	.open           = vhost_blk_open,
> +	.release        = vhost_blk_release,
> +	.llseek		= noop_llseek,
> +	.unlocked_ioctl = vhost_blk_ioctl,
> +};
> +
> +static struct miscdevice vhost_blk_misc = {
> +	MISC_DYNAMIC_MINOR,
> +	"vhost-blk",
> +	&vhost_blk_fops,
> +};
> +
> +static int vhost_blk_init(void)
> +{
> +	return misc_register(&vhost_blk_misc);
> +}
> +module_init(vhost_blk_init);
> +
> +static void vhost_blk_exit(void)
> +{
> +	misc_deregister(&vhost_blk_misc);
> +}
> +
> +module_exit(vhost_blk_exit);
> +
> +MODULE_VERSION("1.0");
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR("Vitaly Mayatskikh");
> +MODULE_DESCRIPTION("Host kernel accelerator for virtio blk");
> +MODULE_ALIAS("devname:vhost-blk");
> -- 
> 2.17.1