On Thu, Jul 19, 2012 at 08:05:42AM -0500, Anthony Liguori wrote:
> Asias He <asias@xxxxxxxxxx> writes:
> 
> > vhost-blk is an in-kernel virtio-blk device accelerator.
> >
> > This patch is based on Liu Yuan's implementation with various
> > improvements and bug fixes. Notably, this patch makes guest notify
> > and host completion processing run in parallel, which gives about a
> > 60% performance improvement over Liu Yuan's implementation.
> >
> > Performance evaluation:
> > -----------------------------
> > The comparison is between kvm tool with the userspace implementation
> > and kvm tool with vhost-blk.
> >
> > 1) Fio with libaio ioengine on Fusion IO device
> >    With bio-based IO path, sequential read/write, random read/write
> >    IOPS boost         :  8.4%, 15.3%, 10.4%, 14.6%
> >    Latency improvement:  8.5%, 15.4%, 10.4%, 15.1%
> >
> > 2) Fio with vsync ioengine on Fusion IO device
> >    With bio-based IO path, sequential read/write, random read/write
> >    IOPS boost         : 10.5%,  4.8%,  5.2%,  5.6%
> >    Latency improvement: 11.4%,  5.0%,  5.2%,  5.8%
> >
> > Cc: Michael S. Tsirkin <mst@xxxxxxxxxx>
> > Cc: linux-kernel@xxxxxxxxxxxxxxx
> > Cc: kvm@xxxxxxxxxxxxxxx
> > Cc: virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx
> > Signed-off-by: Asias He <asias@xxxxxxxxxx>
> > ---
> >  drivers/vhost/Kconfig  |  10 +
> >  drivers/vhost/Makefile |   2 +
> >  drivers/vhost/blk.c    | 600 ++++++++++++++++++++++++++++++++++++++++++++++++
> >  drivers/vhost/vhost.h  |   5 +
> >  include/linux/vhost.h  |   3 +
> >  5 files changed, 620 insertions(+)
> >  create mode 100644 drivers/vhost/blk.c
> >
> > diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
> > index c387067..fa071a8 100644
> > --- a/drivers/vhost/Kconfig
> > +++ b/drivers/vhost/Kconfig
> > @@ -16,4 +16,14 @@ config VHOST_NET
> >  
> >           To compile this driver as a module, choose M here: the module will
> >           be called vhost_net.
> > +config VHOST_BLK
> > +       tristate "Host kernel accelerator for virtio blk (EXPERIMENTAL)"
> > +       depends on VHOST && BLOCK && AIO && EVENTFD && EXPERIMENTAL
> > +       ---help---
> > +         This kernel module can be loaded in host kernel to accelerate
> > +         guest block with virtio_blk. Not to be confused with virtio_blk
> > +         module itself which needs to be loaded in guest kernel.
> > +
> > +         To compile this driver as a module, choose M here: the module will
> > +         be called vhost_blk.
> >
> > diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
> > index cd36885..aa461d5 100644
> > --- a/drivers/vhost/Makefile
> > +++ b/drivers/vhost/Makefile
> > @@ -1,4 +1,6 @@
> >  obj-$(CONFIG_VHOST)     += vhost.o
> >  obj-$(CONFIG_VHOST_NET) += vhost_net.o
> > +obj-$(CONFIG_VHOST_BLK) += vhost_blk.o
> >  
> >  vhost_net-y := net.o
> > +vhost_blk-y := blk.o
> > diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c
> > new file mode 100644
> > index 0000000..6a94894
> > --- /dev/null
> > +++ b/drivers/vhost/blk.c
> > @@ -0,0 +1,600 @@
> > +/*
> > + * Copyright (C) 2011 Taobao, Inc.
> > + * Author: Liu Yuan <tailai.ly@xxxxxxxxxx>
> > + *
> > + * Copyright (C) 2012 Red Hat, Inc.
> > + * Author: Asias He <asias@xxxxxxxxxx>
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2.
> > + *
> > + * virtio-blk server in host kernel.
> > + */
> > +
> > +#include <linux/miscdevice.h>
> > +#include <linux/module.h>
> > +#include <linux/vhost.h>
> > +#include <linux/virtio_blk.h>
> > +#include <linux/eventfd.h>
> > +#include <linux/mutex.h>
> > +#include <linux/file.h>
> > +#include <linux/mmu_context.h>
> > +#include <linux/anon_inodes.h>
> > +#include <linux/kthread.h>
> > +#include <linux/blkdev.h>
> > +
> > +#include "vhost.h"
> > +
> > +#define BLK_HDR 0
> > +
> > +enum {
> > +        VHOST_BLK_VQ_REQ = 0,
> > +        VHOST_BLK_VQ_MAX = 1,
> > +};
> > +
> > +struct vhost_blk_req {
> > +        u16 head;
> > +        u8 *status;
> > +};
> > +
> > +struct vhost_blk {
> > +        struct task_struct *worker_host_kick;
> > +        struct task_struct *worker;
> > +        struct vhost_blk_req *reqs;
> > +        struct vhost_virtqueue vq;
> > +        struct eventfd_ctx *ectx;
> > +        struct io_event *ioevent;
> > +        struct kioctx *ioctx;
> > +        struct vhost_dev dev;
> > +        struct file *efile;
> > +        u64 ioevent_nr;
> > +        bool stop;
> > +};
> > +
> > +static inline int vhost_blk_read_events(struct vhost_blk *blk, long nr)
> > +{
> > +        mm_segment_t old_fs = get_fs();
> > +        int ret;
> > +
> > +        set_fs(KERNEL_DS);
> > +        ret = read_events(blk->ioctx, nr, nr, blk->ioevent, NULL);
> > +        set_fs(old_fs);
> > +
> > +        return ret;
> > +}
> > +
> > +static int vhost_blk_setup(struct vhost_blk *blk)
> > +{
> > +        struct kioctx *ctx;
> > +
> > +        if (blk->ioctx)
> > +                return 0;
> > +
> > +        blk->ioevent_nr = blk->vq.num;
> > +        ctx = ioctx_alloc(blk->ioevent_nr);
> > +        if (IS_ERR(ctx)) {
> > +                pr_err("Failed to ioctx_alloc");
> > +                return PTR_ERR(ctx);
> > +        }
> 
> Not that it's very likely that ioctx_alloc() will fail in practice.
> There's a fixed number of events that can be allocated, currently
> 0x10000.  If you have a ring queue size of 1024 (which is normal),
> that limits you to 64 vhost-blk devices.
> 
> Realistically, I don't think you can do AIO alone in vhost-blk because
> of this (and many other) limitations.  It's necessary to be able to
> fall back to a thread pool because AIO cannot be relied upon.
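
Agreed that a fallback is needed.  For what it's worth, it does not have
to be elaborate.  A rough, untested sketch: on -EAGAIN, bounce the
request to a workqueue that does the I/O synchronously.  Everything here
beyond what the patch already has is made up for illustration (the
work/wq fields, the extra request fields, and vhost_blk_req_done()):

static void vhost_blk_fallback_work(struct work_struct *work)
{
        struct vhost_blk_req *req = container_of(work, struct vhost_blk_req,
                                                 work);
        mm_segment_t old_fs = get_fs();
        ssize_t ret;

        /* The iovec lives in kernel memory, so borrow the KERNEL_DS
         * trick the aio path already uses. */
        set_fs(KERNEL_DS);
        if (req->opcode == IOCB_CMD_PWRITEV)
                ret = vfs_writev(req->file, req->iov, req->nr_vecs,
                                 &req->offset);
        else
                ret = vfs_readv(req->file, req->iov, req->nr_vecs,
                                &req->offset);
        set_fs(old_fs);

        /* Complete the request the same way the aio completion path
         * does: write the status byte and push onto the used ring. */
        vhost_blk_req_done(req, ret);
}

And at the submission site:

        ret = vhost_blk_io_submit(blk, file, req, iov, nr_vecs, offset,
                                  opcode);
        if (ret == -EAGAIN) {
                INIT_WORK(&req->work, vhost_blk_fallback_work);
                queue_work(blk->wq, &req->work);
                ret = 0;
        }

The synchronous path is slower, but it only runs when the aio pool is
exhausted, so it acts as a relief valve rather than the normal data path.
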
> > +        put_ioctx(ctx);
> > +        blk->ioctx = ctx;
> > +
> > +        blk->ioevent = kmalloc(sizeof(struct io_event) * blk->ioevent_nr,
> > +                               GFP_KERNEL);
> > +        if (!blk->ioevent) {
> > +                pr_err("Failed to allocate memory for io_events");
> > +                return -ENOMEM;
> > +        }
> > +
> > +        blk->reqs = kmalloc(sizeof(struct vhost_blk_req) * blk->ioevent_nr,
> > +                            GFP_KERNEL);
> > +        if (!blk->reqs) {
> > +                pr_err("Failed to allocate memory for vhost_blk_req");
> > +                return -ENOMEM;
> > +        }
> > +
> > +        return 0;
> > +}
> > +
> > +static inline int vhost_blk_set_status(struct vhost_blk *blk, u8 *statusp,
> > +                                       u8 status)
> > +{
> > +        if (copy_to_user(statusp, &status, sizeof(status))) {
> > +                vq_err(&blk->vq, "Failed to write status\n");
> > +                vhost_discard_vq_desc(&blk->vq, 1);
> > +                return -EFAULT;
> > +        }
> > +
> > +        return 0;
> > +}
> > +
> > +static void vhost_blk_enable_vq(struct vhost_blk *blk,
> > +                                struct vhost_virtqueue *vq)
> > +{
> > +        wake_up_process(blk->worker_host_kick);
> > +}
> > +
> > +static int vhost_blk_io_submit(struct vhost_blk *blk, struct file *file,
> > +                               struct vhost_blk_req *req,
> > +                               struct iovec *iov, u64 nr_vecs, loff_t offset,
> > +                               int opcode)
> > +{
> > +        struct kioctx *ioctx = blk->ioctx;
> > +        mm_segment_t oldfs = get_fs();
> > +        struct kiocb_batch batch;
> > +        struct blk_plug plug;
> > +        struct kiocb *iocb;
> > +        int ret;
> > +
> > +        if (!try_get_ioctx(ioctx)) {
> > +                pr_info("Failed to get ioctx");
> > +                return -EAGAIN;
> > +        }
> > +
> > +        atomic_long_inc_not_zero(&file->f_count);
> > +        eventfd_ctx_get(blk->ectx);
> > +
> > +        /* TODO: batch to 1 is not good! */
> > +        kiocb_batch_init(&batch, 1);
> > +        blk_start_plug(&plug);
> > +
> > +        iocb = aio_get_req(ioctx, &batch);
> > +        if (unlikely(!iocb)) {
> > +                ret = -EAGAIN;
> > +                goto out;
> > +        }
> > +
> > +        iocb->ki_filp     = file;
> > +        iocb->ki_pos      = offset;
> > +        iocb->ki_buf      = (void *)iov;
> > +        iocb->ki_left     = nr_vecs;
> > +        iocb->ki_nbytes   = nr_vecs;
> > +        iocb->ki_opcode   = opcode;
> > +        iocb->ki_obj.user = req;
> > +        iocb->ki_eventfd  = blk->ectx;
> > +
> > +        set_fs(KERNEL_DS);
> > +        ret = aio_setup_iocb(iocb, false);
> > +        set_fs(oldfs);
> > +        if (unlikely(ret))
> > +                goto out_put_iocb;
> > +
> > +        spin_lock_irq(&ioctx->ctx_lock);
> > +        if (unlikely(ioctx->dead)) {
> > +                spin_unlock_irq(&ioctx->ctx_lock);
> > +                ret = -EINVAL;
> > +                goto out_put_iocb;
> > +        }
> > +        aio_run_iocb(iocb);
> > +        spin_unlock_irq(&ioctx->ctx_lock);
> > +
> > +        aio_put_req(iocb);
> > +
> > +        blk_finish_plug(&plug);
> > +        kiocb_batch_free(ioctx, &batch);
> > +        put_ioctx(ioctx);
> > +
> > +        return ret;
> > +out_put_iocb:
> > +        aio_put_req(iocb); /* Drop extra ref to req */
> > +        aio_put_req(iocb); /* Drop I/O ref to req */
> > +out:
> > +        put_ioctx(ioctx);
> > +        return ret;
> > +}
> > +
> > +static void vhost_blk_flush(struct vhost_blk *blk)
> > +{
> > +        vhost_poll_flush(&blk->vq.poll);
> > +}
> > +
> > +static struct file *vhost_blk_stop_vq(struct vhost_blk *blk,
> > +                                      struct vhost_virtqueue *vq)
> > +{
> > +        struct file *file;
> > +
> > +        mutex_lock(&vq->mutex);
> > +        file = rcu_dereference_protected(vq->private_data,
> > +                                         lockdep_is_held(&vq->mutex));
> > +        rcu_assign_pointer(vq->private_data, NULL);
> > +        mutex_unlock(&vq->mutex);
> > +
> > +        return file;
> > +
> > +}
> > +
> > +static inline void vhost_blk_stop(struct vhost_blk *blk, struct file **file)
> > +{
> > +
> > +        *file = vhost_blk_stop_vq(blk, &blk->vq);
> > +}
> > +
> > +/* Handle guest request */
> > +static int vhost_blk_do_req(struct vhost_virtqueue *vq,
> > +                            struct virtio_blk_outhdr *hdr,
> > +                            u16 head, u16 out, u16 in,
> > +                            struct file *file)
> > +{
> > +        struct vhost_blk *blk = container_of(vq->dev, struct vhost_blk, dev);
> > +        struct iovec *iov = &vq->iov[BLK_HDR + 1];
> > +        loff_t offset = hdr->sector << 9;
> > +        struct vhost_blk_req *req;
> > +        u64 nr_vecs;
> > +        int ret = 0;
> > +        u8 status;
> > +
> > +        if (hdr->type == VIRTIO_BLK_T_IN || hdr->type == VIRTIO_BLK_T_GET_ID)
> > +                nr_vecs = in - 1;
> > +        else
> > +                nr_vecs = out - 1;
> > +
> > +        req         = &blk->reqs[head];
> > +        req->head   = head;
> > +        req->status = blk->vq.iov[nr_vecs + 1].iov_base;
> > +
> > +        switch (hdr->type) {
> > +        case VIRTIO_BLK_T_OUT:
> > +                ret = vhost_blk_io_submit(blk, file, req, iov, nr_vecs, offset,
> > +                                          IOCB_CMD_PWRITEV);
> > +                break;
> > +        case VIRTIO_BLK_T_IN:
> > +                ret = vhost_blk_io_submit(blk, file, req, iov, nr_vecs, offset,
> > +                                          IOCB_CMD_PREADV);
> > +                break;
> > +        case VIRTIO_BLK_T_FLUSH:
> > +                ret = vfs_fsync(file, 1);
> > +                status = ret < 0 ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
> > +                ret = vhost_blk_set_status(blk, req->status, status);
> > +                if (!ret)
> > +                        vhost_add_used_and_signal(&blk->dev, vq, head, ret);
> > +                break;
> > +        case VIRTIO_BLK_T_GET_ID:
> > +                /* TODO: need a real ID string */
> > +                ret = snprintf(vq->iov[BLK_HDR + 1].iov_base,
> > +                               VIRTIO_BLK_ID_BYTES, "VHOST-BLK-DISK");
> > +                status = ret < 0 ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
> > +                ret = vhost_blk_set_status(blk, req->status, status);
> > +                if (!ret)
> > +                        vhost_add_used_and_signal(&blk->dev, vq, head,
> > +                                                  VIRTIO_BLK_ID_BYTES);
> > +                break;
> > +        default:
> > +                pr_warn("Unsupported request type %d\n", hdr->type);
> > +                vhost_discard_vq_desc(vq, 1);
> > +                ret = -EFAULT;
> > +                break;
> > +        }
> 
> There doesn't appear to be any error handling in the event that
> vhost_blk_io_submit fails.  It would appear that you leak the ring
> queue entry since you never push anything onto the used queue.
> 
> I think you also need to handle io_submit() failing with EAGAIN.
> Relying on min nr_events == queue_size seems like a dangerous
> assumption to me, particularly since queue_size tends to be very large
> and max_nr_events is a fixed-size global pool.
> 
> To properly handle EAGAIN, you effectively need to implement flow
> control and back off reading the virtqueue until you can submit
> requests again.
> 
> Of course, the million dollar question is why would using AIO in the
> kernel be faster than using AIO in userspace?
> 
> When using eventfd, there is no heavy weight exit on the notify path.
> It should just be the difference between scheduling a kernel thread vs
> scheduling a userspace thread.

Actually, AIO from userspace involves a kernel thread too, doesn't it?

> There's simply no way that that's a 60% difference in performance.
> 
> Regards,
> 
> Anthony Liguori
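
For completeness, the flow control itself does not need much code.  A
rough, untested sketch of what the submission loop and the io_event
completion handler might do (the blk->backoff flag is made up; the rest
is from the patch or vhost.h):

        /* in the request handler loop */
        ret = vhost_blk_io_submit(blk, file, req, iov, nr_vecs, offset,
                                  opcode);
        if (ret == -EAGAIN) {
                /* Out of aio events: put the descriptor back and stop
                 * reading the virtqueue until completions drain. */
                vhost_discard_vq_desc(vq, 1);
                blk->backoff = true;
                break;
        }
        if (ret < 0) {
                /* Hard failure: complete with VIRTIO_BLK_S_IOERR so
                 * the ring entry is not leaked. */
                vhost_blk_set_status(blk, req->status, VIRTIO_BLK_S_IOERR);
                vhost_add_used_and_signal(&blk->dev, vq, head, 0);
        }

        /* in the io_event completion handler */
        if (blk->backoff) {
                blk->backoff = false;
                vhost_poll_queue(&vq->poll); /* rescan the virtqueue */
        }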