Hi Christoph,

I am wondering if you can provide your thoughts here. I modified my
vhost-blk implementation to offload work to workqueues instead of doing
it synchronously. In fact, I tried to spread the work across all the
CPUs. But to my surprise, this did not improve performance compared to
qemu virtio-blk. The vmstat output below shows vhost-blk taking more
interrupts and context switches than virtio-blk (roughly 17k in / 14k cs
per interval versus 11k / 9k) while completing less I/O (bo around
80000 versus 140000-160000). What is virtio-blk doing that I am not
able to do from vhost-blk?

Thanks,
Badari

vhost-blk:

procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu-----
 r  b   swpd   free   buff   cache   si   so   bi    bo    in    cs  us sy id wa st
 3  1   8920  56076  20760 5603556    0  104  196  79826 17164 13912  0  5 65 30  0
 2  4   9488  57216  20744 5605616    0  114  195  81120 17397 13824  0  5 65 30  0
 2  2  10028  68476  20728 5594764    0  108  206  80318 17162 13845  0  5 65 30  0
 0  4  10560  70856  20708 5593088    0  106  205  82363 17402 13904  0  5 65 30  0
 1  3  10948  80380  20672 5584452    0   78  178  79714 17113 13875  0  5 66 29  0

qemu virtio-blk:

procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu-----
 r  b   swpd   free   buff   cache   si   so   bi    bo    in    cs  us sy id wa st
 0  1  14124  57456   5144 4924060    0    0  139 142546 11287  9312  1  4 80 15  0
 0  2  14124  56736   5148 4927396    0    0  146 142968 11283  9248  1  4 80 15  0
 0  1  14124  56712   5384 4927020    0    0   74 150738 11182  9327  1  4 80 16  0
 1  1  14124  55496   5392 4927904    0    0    2 159902 11172  9401  1  3 79 17  0
 0  1  14124  55968   5408 4927232    0    0    0 159202 11212  9325  1  3 80 16  0
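In case it helps to look at the hand-off pattern in isolation from the
vhost plumbing, it boils down to something like this standalone sketch
(the demo_* names are made up for illustration; the real version is
handoff_io() in the patch below):

	#include <linux/module.h>
	#include <linux/workqueue.h>
	#include <linux/cpumask.h>
	#include <linux/slab.h>
	#include <linux/smp.h>

	static struct workqueue_struct *demo_wq;
	static int demo_cpu;	/* round-robin cursor */

	struct demo_item {
		struct work_struct work;
		int payload;
	};

	static void demo_work_fn(struct work_struct *work)
	{
		struct demo_item *item = container_of(work, struct demo_item, work);

		pr_info("demo: handled %d on cpu %d\n", item->payload,
			raw_smp_processor_id());
		kfree(item);
	}

	/* Hand one item to the "next" online CPU, wrapping at the end of the mask. */
	static int demo_dispatch(int payload)
	{
		struct demo_item *item = kmalloc(sizeof(*item), GFP_KERNEL);

		if (!item)
			return -ENOMEM;
		INIT_WORK(&item->work, demo_work_fn);
		item->payload = payload;

		demo_cpu = cpumask_next(demo_cpu, cpu_online_mask);
		if (demo_cpu >= nr_cpu_ids)
			demo_cpu = cpumask_first(cpu_online_mask);
		queue_work_on(demo_cpu, demo_wq, &item->work);
		return 0;
	}

	static int __init demo_init(void)
	{
		int i;

		demo_wq = create_workqueue("demo");
		if (!demo_wq)
			return -ENOMEM;
		for (i = 0; i < 8; i++)
			demo_dispatch(i);
		return 0;
	}

	static void __exit demo_exit(void)
	{
		flush_workqueue(demo_wq);
		destroy_workqueue(demo_wq);
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");

Note the cursor is deliberately just a plain static int: concurrent
submitters can race on it, but the worst case is an uneven spread of
work, not a crash.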
---

 drivers/vhost/blk.c |  310 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 310 insertions(+)

Index: net-next/drivers/vhost/blk.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ net-next/drivers/vhost/blk.c	2010-03-25 20:06:57.484054770 -0400
@@ -0,0 +1,310 @@
+/*
+ * virtio-block server in host kernel.
+ * Inspired by vhost-net and shamelessly ripped code from it :)
+ */
+
+#include <linux/compat.h>
+#include <linux/eventfd.h>
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <linux/virtio_blk.h>
+#include <linux/mmu_context.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+#include <linux/rcupdate.h>
+#include <linux/file.h>
+
+#include "vhost.h"
+
+#define VHOST_BLK_VQ_MAX 1
+
+#if 0
+#define myprintk(fmt, ...) printk(pr_fmt(fmt), ##__VA_ARGS__)
+#else
+#define myprintk(fmt, ...)
+#endif
+
+struct vhost_blk {
+	struct vhost_dev dev;
+	struct vhost_virtqueue vqs[VHOST_BLK_VQ_MAX];
+	struct vhost_poll poll[VHOST_BLK_VQ_MAX];
+};
+
+/* One in-flight request, handed off to the workqueue. */
+struct vhost_blk_io {
+	struct work_struct work;
+	struct vhost_blk *blk;
+	struct file *file;
+	int head;
+	uint32_t type;
+	uint64_t sector;
+	struct iovec *iov;
+	int nvecs;
+};
+
+static struct workqueue_struct *vblk_workqueue;
+
+static void handle_io_work(struct work_struct *work)
+{
+	struct vhost_blk_io *vbio;
+	struct vhost_virtqueue *vq;
+	struct vhost_blk *blk;
+	int ret = 0;
+	loff_t pos;
+	uint8_t status = 0;
+
+	vbio = container_of(work, struct vhost_blk_io, work);
+	blk = vbio->blk;
+	vq = &blk->dev.vqs[0];
+	pos = vbio->sector << 9;	/* virtio sectors are 512 bytes */
+
+	use_mm(blk->dev.mm);
+
+	if (vbio->type & VIRTIO_BLK_T_FLUSH) {
+		ret = vfs_fsync(vbio->file, vbio->file->f_path.dentry, 1);
+	} else if (vbio->type & VIRTIO_BLK_T_OUT) {
+		ret = vfs_writev(vbio->file, vbio->iov, vbio->nvecs, &pos);
+	} else {
+		ret = vfs_readv(vbio->file, vbio->iov, vbio->nvecs, &pos);
+	}
+
+	status = (ret < 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
+	/* copy_to_user() returns the number of uncopied bytes, never < 0 */
+	if (copy_to_user(vbio->iov[vbio->nvecs].iov_base, &status,
+			 sizeof status)) {
+		printk(KERN_ERR "vhost-blk: copy to user failed\n");
+		mutex_lock(&vq->mutex);
+		vhost_discard_vq_desc(vq);
+		mutex_unlock(&vq->mutex);
+		unuse_mm(blk->dev.mm);
+		kfree(vbio);
+		return;
+	}
+	mutex_lock(&vq->mutex);
+	vhost_add_used_and_signal(&blk->dev, vq, vbio->head, ret);
+	mutex_unlock(&vq->mutex);
+	unuse_mm(blk->dev.mm);
+	kfree(vbio);
+}
+
+/* Round-robin cursor; unsynchronized, worst case is an uneven spread. */
+static int cpu;
+
+static int handoff_io(struct vhost_blk *blk, int head,
+			uint32_t type, uint64_t sector,
+			struct iovec *iov, int nvecs)
+{
+	struct vhost_virtqueue *vq = &blk->dev.vqs[0];
+	struct vhost_blk_io *vbio;
+
+	vbio = kmalloc(sizeof(struct vhost_blk_io), GFP_KERNEL);
+	if (!vbio)
+		return -ENOMEM;
+
+	INIT_WORK(&vbio->work, handle_io_work);
+	vbio->blk = blk;
+	vbio->file = vq->private_data;
+	vbio->head = head;
+	vbio->type = type;
+	vbio->sector = sector;
+	vbio->iov = iov;
+	vbio->nvecs = nvecs;
+
+	/* Spread requests across all online CPUs. */
+	cpu = cpumask_next(cpu, cpu_online_mask);
+	if (cpu >= nr_cpu_ids)
+		cpu = cpumask_first(cpu_online_mask);
+	queue_work_on(cpu, vblk_workqueue, &vbio->work);
+
+	return 0;
+}
+
+static void handle_blk(struct vhost_blk *blk)
+{
+	struct vhost_virtqueue *vq = &blk->dev.vqs[0];
+	unsigned head, out, in;
+	struct virtio_blk_outhdr hdr;
+	int r, nvecs;
+
+	use_mm(blk->dev.mm);
+	mutex_lock(&vq->mutex);
+
+	vhost_disable_notify(vq);
+
+	for (;;) {
+		head = vhost_get_vq_desc(&blk->dev, vq, vq->iov,
+					 ARRAY_SIZE(vq->iov),
+					 &out, &in, NULL, NULL);
+		if (head == vq->num) {
+			if (unlikely(vhost_enable_notify(vq))) {
+				vhost_disable_notify(vq);
+				continue;
+			}
+			break;
+		}
+
+		/* First descriptor is the fixed-size virtio_blk_outhdr. */
+		BUG_ON(vq->iov[0].iov_len != sizeof hdr);
+
+		/* copy_from_user() returns bytes not copied, never < 0 */
+		r = copy_from_user(&hdr, vq->iov[0].iov_base, sizeof hdr);
+		if (r) {
+			printk(KERN_ERR "vhost-blk: copy from user failed\n");
+			vhost_discard_vq_desc(vq);
+			break;
+		}
+
+		nvecs = out - 1;
+		if (hdr.type == VIRTIO_BLK_T_IN)
+			nvecs = in - 1;
+
+		/* Last descriptor is the one-byte status. */
+		BUG_ON(vq->iov[nvecs+1].iov_len != 1);
+
+		r = handoff_io(blk, head, hdr.type, hdr.sector,
+			       &vq->iov[1], nvecs);
+		if (r < 0) {
+			vhost_discard_vq_desc(vq);
+			break;
+		}
+	}
+	mutex_unlock(&vq->mutex);
+	unuse_mm(blk->dev.mm);
+}
+
+static void vhost_blk_flush(struct vhost_blk *n)
+{
+	vhost_poll_flush(n->poll);
+	vhost_poll_flush(&n->dev.vqs[0].poll);
+}
+
+static void handle_blk_kick(struct work_struct *work)
+{
+	struct vhost_virtqueue *vq;
+	struct vhost_blk *blk;
+
+	vq = container_of(work, struct vhost_virtqueue, poll.work);
+	blk = container_of(vq->dev, struct vhost_blk, dev);
+	handle_blk(blk);
+}
+
+static void handle_rq_blk(struct work_struct *work)
+{
+	struct vhost_blk *blk;
+
+	blk = container_of(work, struct vhost_blk, poll[0].work);
+	handle_blk(blk);
+}
+
+static int vhost_blk_open(struct inode *inode, struct file *f)
+{
+	struct vhost_blk *n = kmalloc(sizeof *n, GFP_KERNEL);
+	int r;
+
+	if (!n)
+		return -ENOMEM;
+
+	n->vqs[0].handle_kick = handle_blk_kick;
+	r = vhost_dev_init(&n->dev, n->vqs, VHOST_BLK_VQ_MAX);
+	if (r < 0) {
+		kfree(n);
+		return r;
+	}
+
+	vhost_poll_init(n->poll, handle_rq_blk, POLLOUT|POLLIN);
+	f->private_data = n;
+	return 0;
+}
+
+static int vhost_blk_release(struct inode *inode, struct file *f)
+{
+	struct vhost_blk *n = f->private_data;
+
+	/* A backend may never have been set. */
+	if (n->vqs->private_data)
+		fput(n->vqs->private_data);
+	kfree(n);
+	return 0;
+}
+
+static long vhost_blk_set_backend(struct vhost_blk *n, unsigned index, int fd)
+{
+	struct file *file;
+	struct vhost_virtqueue *vq;
+
+	if (index >= VHOST_BLK_VQ_MAX)
+		return -ENOBUFS;
+
+	file = fget(fd);
+	if (!file)
+		return -EBADF;
+
+	vq = n->vqs + index;
+	mutex_lock(&vq->mutex);
+	rcu_assign_pointer(vq->private_data, file);
+	mutex_unlock(&vq->mutex);
+	return 0;
+}
+
+static long vhost_blk_ioctl(struct file *f, unsigned int ioctl,
+			    unsigned long arg)
+{
+	struct vhost_blk *n = f->private_data;
+	void __user *argp = (void __user *)arg;
+	struct vhost_vring_file backend;
+	int r;
+
+	switch (ioctl) {
+	case VHOST_NET_SET_BACKEND:
+		/* Borrowing vhost-net's ioctl number for now. */
+		if (copy_from_user(&backend, argp, sizeof backend))
+			return -EFAULT;
+		return vhost_blk_set_backend(n, backend.index, backend.fd);
+	default:
+		mutex_lock(&n->dev.mutex);
+		r = vhost_dev_ioctl(&n->dev, ioctl, arg);
+		vhost_blk_flush(n);
+		mutex_unlock(&n->dev.mutex);
+		return r;
+	}
+}
+
+static const struct file_operations vhost_blk_fops = {
+	.owner          = THIS_MODULE,
+	.release        = vhost_blk_release,
+	.open           = vhost_blk_open,
+	.unlocked_ioctl = vhost_blk_ioctl,
+};
+
+static struct miscdevice vhost_blk_misc = {
+	.minor = 234,
+	.name  = "vhost-blk",
+	.fops  = &vhost_blk_fops,
+};
+
+static int vhost_blk_init(void)
+{
+	int r = vhost_init();
+
+	if (r)
+		goto err_init;
+
+	vblk_workqueue = create_workqueue("vblk");
+	if (!vblk_workqueue) {
+		r = -ENOMEM;
+		goto err_vblk;
+	}
+
+	r = misc_register(&vhost_blk_misc);
+	if (r)
+		goto err_reg;
+	return 0;
+
+err_reg:
+	destroy_workqueue(vblk_workqueue);
+err_vblk:
+	vhost_cleanup();
+err_init:
+	return r;
+}
+module_init(vhost_blk_init);
+
+static void vhost_blk_exit(void)
+{
+	misc_deregister(&vhost_blk_misc);
+	destroy_workqueue(vblk_workqueue);
+	vhost_cleanup();
+}
+module_exit(vhost_blk_exit);
+
+MODULE_VERSION("0.0.1");
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Host kernel accelerator for virtio blk");
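For completeness, the userspace side would look roughly like the sketch
below. This is hypothetical and heavily elided: it assumes the
vhost-net-style ioctls the patch reuses, the device node and disk path
are made up, and a real user (qemu) would also have to set up the guest
memory table and the vrings with VHOST_SET_MEM_TABLE / VHOST_SET_VRING_*
before pointing the vq at a backend.

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/vhost.h>

	int main(void)
	{
		/* Paths are illustrative. */
		int vhost_fd = open("/dev/vhost-blk", O_RDWR);
		int disk_fd = open("/dev/sdb", O_RDWR);
		struct vhost_vring_file backend = { .index = 0, .fd = disk_fd };

		if (vhost_fd < 0 || disk_fd < 0) {
			perror("open");
			return 1;
		}
		/* Claim ownership of the vhost device... */
		if (ioctl(vhost_fd, VHOST_SET_OWNER) < 0) {
			perror("VHOST_SET_OWNER");
			return 1;
		}
		/* ...then point vq 0 at the backing file. */
		if (ioctl(vhost_fd, VHOST_NET_SET_BACKEND, &backend) < 0) {
			perror("VHOST_NET_SET_BACKEND");
			return 1;
		}
		return 0;
	}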