[PATCH] Lguest implementation of virtio draft III

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This is a bonus patch for those wondering how a virtio implementation
can look.  I have two; this is the more efficient one (though it needs
some modification for inter-guest use: it assumes the other end does all
the accessing of our memory).  It's currently tacked on to the existing
lguest I/O mechanism as a demonstration, rather than replacing it.

It shows that it's possible to implement virtio without internal
locking.

Userspace server-side code isn't included.
===
This allows zero-copy from guest <-> host.  It uses a page of
descriptors, a page to say what descriptors to use, and a page to say
what's been used: one such set for inbufs and one for outbufs.

TODO:
1) More polishing
2) Get rid of old I/O
3) Inter-guest I/O implementation

Signed-off-by: Rusty Russell <rusty@xxxxxxxxxxxxxxx>
---
 drivers/lguest/Makefile         |    2 
 drivers/lguest/hypercalls.c     |    4 
 drivers/lguest/lguest_virtio.c  |  511 +++++++++++++++++++++++++++++++++++++++
 include/linux/lguest.h          |    3 
 include/linux/lguest_launcher.h |   24 +
 6 files changed, 948 insertions(+), 5 deletions(-)

--- a/drivers/lguest/Makefile
+++ b/drivers/lguest/Makefile
@@ -1,5 +1,5 @@
 # Guest requires the paravirt_ops replacement and the bus driver.
-obj-$(CONFIG_LGUEST_GUEST) += lguest.o lguest_asm.o lguest_bus.o
+obj-$(CONFIG_LGUEST_GUEST) += lguest.o lguest_asm.o lguest_bus.o lguest_virtio.o
 
 # Host requires the other files, which can be a module.
 obj-$(CONFIG_LGUEST)	+= lg.o
===================================================================
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -86,6 +86,10 @@ static void do_hcall(struct lguest *lg, 
 		break;
 	case LHCALL_HALT:
 		lg->halted = 1;
+		break;
+	case LHCALL_NOTIFY:
+		lg->pending_key = regs->edx << PAGE_SHIFT;
+		lg->dma_is_pending = 1;
 		break;
 	default:
 		kill_guest(lg, "Bad hypercall %li\n", regs->eax);
===================================================================
--- /dev/null
+++ b/drivers/lguest/lguest_virtio.c
@@ -0,0 +1,511 @@
+/* Descriptor-based virtio backend using lguest. */
+
+/* FIXME: Put "running" in shared page so other side really doesn't
+ * send us interrupts.  Then we would never need to "fail" restart.
+ * If there are more buffers when we set "running", simply ping other
+ * side.  It would interrupt us back again.
+ */
+#define DEBUG
+#include <linux/lguest.h>
+#include <linux/lguest_bus.h>
+#include <linux/virtio.h>
+#include <linux/interrupt.h>
+#include <asm/io.h>
+
+#define NUM_DESCS (PAGE_SIZE / sizeof(struct lguest_desc))
+
+#ifdef DEBUG
+/* For development, we want to crash whenever the other side is bad. */
+#define BAD_SIDE(lgv, fmt...)			\
+	do { dev_err(lgv->vdev.dev, fmt); BUG(); } while(0)
+#define START_USE(di) \
+	do { if ((di)->in_use) panic("in_use = %i\n", (di)->in_use); (di)->in_use = __LINE__; mb(); } while(0)
+#define END_USE(di) \
+	do { BUG_ON(!(di)->in_use); (di)->in_use = 0; mb(); } while(0)
+#else
+#define BAD_SIDE(lgv, fmt...)			\
+	do { dev_err(lgv->vdev.dev, fmt); (lgv)->broken = true; } while(0)
+#define START_USE(di)
+#define END_USE(di)
+#endif
+
+/* FIXME: make the device mem layout a struct, not a set of pointers */
+struct desc_info
+{
+	/* Page of descriptors. */
+	struct lguest_desc *desc;
+	/* How we tell other side what buffers are available. */
+	unsigned int *avail_idx;
+	unsigned int *available;
+	/* How other side tells us what's used. */
+	unsigned int *used_idx;
+	struct lguest_used *used;
+
+	/* Number of free buffers */
+	unsigned int num_free;
+	/* Head of free buffer list. */
+	unsigned int free_head;
+	/* Number we've added since last sync. */
+	unsigned int num_added;
+
+	/* Last used index we've seen. */
+	unsigned int last_used_idx;
+
+	/* Unless they told us to stop */
+	bool running;
+
+#ifdef DEBUG
+	/* They're supposed to lock for us. */
+	unsigned int in_use;
+#endif
+
+	/* Tokens for callbacks. */
+	void *data[NUM_DESCS];
+};
+
+/* FIXME: When doing this for real, vdev will go straight into lguest_device */
+struct lguest_virtio_device
+{
+	struct virtio_device vdev;
+	struct lguest_device *lg;
+	void *priv;
+
+	/* Other side has made a mess, don't try any more. */
+	bool broken;
+
+	struct desc_info in, out;
+};
+
+static inline struct lguest_virtio_device *
+vdev_to_lgv(struct virtio_device *vdev)
+{
+	return container_of(vdev, struct lguest_virtio_device, vdev);
+}
+
+static unsigned long add_buf(struct desc_info *di,
+			     const struct scatterlist *sg,
+			     unsigned int num,
+			     void *data)
+{
+	unsigned int i, head, uninitialized_var(prev);
+
+	BUG_ON(data == NULL);
+	START_USE(di);
+
+	if (di->num_free < num) {
+		pr_debug("Can't add buf len %i - avail = %i\n", num,
+			 di->num_free);
+		END_USE(di);
+		return -ENOSPC;
+	}
+
+	/* We're about to use some buffers from the free list. */
+	di->num_free -= num;
+
+	head = di->free_head;
+	for (i = di->free_head; num; i = di->desc[i].next, num--) {
+		di->desc[i].flags |= LGUEST_DESC_F_NEXT;
+		di->desc[i].pfn = page_to_pfn(sg[0].page);
+		di->desc[i].offset = sg[0].offset;
+		di->desc[i].len = sg[0].length;
+		prev = i;
+		sg++;
+	}
+	/* Last one doesn't continue. */
+	di->desc[prev].flags &= ~LGUEST_DESC_F_NEXT;
+
+	/* Update free pointer */
+	di->free_head = i;
+
+	di->data[head] = data;
+
+	/* Make sure it's all visible to other side before setting head. */
+	wmb();
+	di->desc[head].flags |= LGUEST_DESC_F_HEAD;
+
+	/* Put it in available array for advertising. */
+	di->available[(*di->avail_idx + di->num_added++) % NUM_DESCS] = head;
+
+	pr_debug("Added buffer head %i\n", head);
+	END_USE(di);
+	return head;
+}
+
+static unsigned long lguest_add_outbuf(struct virtio_device *vdev,
+				       const struct scatterlist sg[],
+				       unsigned int num,
+				       void *data)
+{
+	struct lguest_virtio_device *lgv = vdev_to_lgv(vdev);
+
+	BUG_ON(num > NUM_DESCS);
+	BUG_ON(num == 0);
+
+	return add_buf(&lgv->out, sg, num, data);
+}
+
+static unsigned long lguest_add_inbuf(struct virtio_device *vdev,
+				      struct scatterlist sg[],
+				      unsigned int num,
+				      void *data)
+{
+	struct lguest_virtio_device *lgv = vdev_to_lgv(vdev);
+
+	BUG_ON(num > NUM_DESCS);
+	BUG_ON(num == 0);
+
+	return add_buf(&lgv->in, sg, num, data);
+}
+
+static void lguest_sync(struct virtio_device *vdev, enum virtio_dir inout)
+{
+	struct lguest_virtio_device *lgv = vdev_to_lgv(vdev);
+
+	if (inout & VIRTIO_IN)
+		START_USE(&lgv->in);
+	if (inout & VIRTIO_OUT)
+		START_USE(&lgv->out);
+	/* LGUEST_DESC_F_HEAD needs to be set before we say they're avail. */
+	wmb();
+
+	if (inout & VIRTIO_IN) {
+		*lgv->in.avail_idx += lgv->in.num_added;
+		lgv->in.num_added = 0;
+	}
+	if (inout & VIRTIO_OUT) {
+		*lgv->out.avail_idx += lgv->out.num_added;
+		lgv->out.num_added = 0;
+	}
+
+	/* Prod other side to tell it about changes. */
+	hcall(LHCALL_NOTIFY, lguest_devices[lgv->lg->index].pfn, 0, 0);
+	if (inout & VIRTIO_IN)
+		END_USE(&lgv->in);
+	if (inout & VIRTIO_OUT)
+		END_USE(&lgv->out);
+}
+
+static void detach_buf(struct desc_info *di, int id)
+{
+	unsigned int i;
+
+	BUG_ON(id >= NUM_DESCS);
+	BUG_ON(!(di->desc[id].flags & LGUEST_DESC_F_HEAD));
+
+	di->desc[id].flags &= ~LGUEST_DESC_F_HEAD;
+	/* Make sure other side has seen that it's detached. */
+	wmb();
+
+	/* Put back on free list: find end */
+	for (i = id; di->desc[i].flags&LGUEST_DESC_F_NEXT; i=di->desc[i].next)
+		di->num_free++;
+
+	di->desc[i].next = di->free_head;
+	di->free_head = id;
+	/* Plus final descriptor */
+	di->num_free++;
+}
+
+static void lguest_detach_outbuf(struct virtio_device *vdev, unsigned long id)
+{
+	struct lguest_virtio_device *lgv = vdev_to_lgv(vdev);
+
+	START_USE(&lgv->out);
+	detach_buf(&lgv->out, id);
+	END_USE(&lgv->out);
+}
+
+static void lguest_detach_inbuf(struct virtio_device *vdev, unsigned long id)
+{
+	struct lguest_virtio_device *lgv = vdev_to_lgv(vdev);
+
+	START_USE(&lgv->in);
+	detach_buf(&lgv->in, id);
+	END_USE(&lgv->in);
+}
+
+static bool more_used(struct desc_info *di)
+{
+	return di->last_used_idx != *di->used_idx;
+}
+
+static void *get_buf(struct desc_info *di, struct lguest_virtio_device *lgv,
+		     unsigned int *len)
+{
+	unsigned int id;
+
+	START_USE(di);
+
+	if (!more_used(di)) {
+		END_USE(di);
+		return NULL;
+	}
+	/* Don't let them make us do infinite work.  The indices run free,
+	 * so compare their difference: a sum would break on wraparound. */
+	if (unlikely(*di->used_idx - di->last_used_idx > NUM_DESCS)) {
+		BAD_SIDE(lgv, "Too many descriptors\n");
+		return NULL;
+	}
+
+	id = di->used[di->last_used_idx%NUM_DESCS].id;
+	*len = di->used[di->last_used_idx%NUM_DESCS].len;
+
+	if (unlikely(id >= NUM_DESCS)) {
+		BAD_SIDE(lgv, "id %u out of range\n", id);
+		return NULL;
+	}
+	if (unlikely(!(di->desc[id].flags & LGUEST_DESC_F_HEAD))) {
+		BAD_SIDE(lgv, "id %u is not a head!\n", id);
+		return NULL;
+	}
+
+	detach_buf(di, id);
+	di->last_used_idx++;
+	BUG_ON(!di->data[id]);
+	END_USE(di);
+	return di->data[id];
+}
+
+static void *lguest_get_outbuf(struct virtio_device *vdev, unsigned int *len)
+{
+	struct lguest_virtio_device *lgv = vdev_to_lgv(vdev);
+
+	return get_buf(&lgv->out, lgv, len);
+}
+
+static void *lguest_get_inbuf(struct virtio_device *vdev, unsigned int *len)
+{
+	struct lguest_virtio_device *lgv = vdev_to_lgv(vdev);
+
+	return get_buf(&lgv->in, lgv, len);
+}
+
+static bool lguest_restart_in(struct virtio_device *vdev)
+{
+	struct lguest_virtio_device *lgv = vdev_to_lgv(vdev);
+
+	START_USE(&lgv->in);
+	BUG_ON(lgv->in.running);
+
+	if (likely(!more_used(&lgv->in)) || unlikely(lgv->broken))
+		lgv->in.running = true;
+
+	END_USE(&lgv->in);
+	return lgv->in.running;
+}
+
+static bool lguest_restart_out(struct virtio_device *vdev)
+{
+	struct lguest_virtio_device *lgv = vdev_to_lgv(vdev);
+
+	START_USE(&lgv->out);
+	BUG_ON(lgv->out.running);
+	/* Only restart if no used out buffers are waiting (always if broken). */
+	if (likely(!more_used(&lgv->out)) || unlikely(lgv->broken))
+		lgv->out.running = true;
+
+	END_USE(&lgv->out);
+	return lgv->out.running;
+}
+
+static irqreturn_t lguest_virtio_interrupt(int irq, void *_lgv)
+{
+	struct lguest_virtio_device *lgv = _lgv;
+
+	if (unlikely(lgv->broken))
+		return IRQ_HANDLED;
+
+	if (lgv->out.running && more_used(&lgv->out))
+		lgv->out.running = lgv->vdev.driver_ops->out(&lgv->vdev);
+
+	if (lgv->in.running && more_used(&lgv->in))
+		lgv->in.running = lgv->vdev.driver_ops->in(&lgv->vdev);
+
+	return IRQ_HANDLED;
+}
+
+static struct virtio_ops lguest_virtio_ops = {
+	.add_outbuf = lguest_add_outbuf,
+	.add_inbuf = lguest_add_inbuf,
+	.sync = lguest_sync,
+	.detach_outbuf = lguest_detach_outbuf,
+	.detach_inbuf = lguest_detach_inbuf,
+	.get_outbuf = lguest_get_outbuf,
+	.get_inbuf = lguest_get_inbuf,
+	.restart_in = lguest_restart_in,
+	.restart_out = lguest_restart_out,
+};
+
+static struct lguest_virtio_device *lg_new_virtio(struct lguest_device *lgdev)
+{
+	struct lguest_virtio_device *lgv;
+	void *mem;
+	unsigned int i;
+
+	lgv = kmalloc(sizeof(*lgv), GFP_KERNEL);
+	if (!lgv)
+		return NULL;
+
+	memset(lgv, 0, sizeof(*lgv));
+
+	lgv->lg = lgdev;
+
+	/* Device mem: in desc/avail/used pages, then out desc/avail/used pages */
+	mem = lguest_map(lguest_devices[lgdev->index].pfn<<PAGE_SHIFT, 6);
+	if (!mem)
+		goto free_lgv;
+	lgv->in.desc = mem;
+	lgv->in.avail_idx = mem + PAGE_SIZE;
+	lgv->in.available = (void *)(lgv->in.avail_idx + 1);
+	lgv->in.used_idx = mem + PAGE_SIZE*2;
+	lgv->in.used = (void *)(lgv->in.used_idx + 1);
+	lgv->out.desc = mem + PAGE_SIZE*3;
+	lgv->out.avail_idx = mem + PAGE_SIZE*4;
+	lgv->out.available = (void *)(lgv->out.avail_idx + 1);
+	lgv->out.used_idx = mem + PAGE_SIZE*5;
+	lgv->out.used = (void *)(lgv->out.used_idx + 1);
+
+	lgv->in.last_used_idx = lgv->out.last_used_idx = 0;
+	lgv->in.num_added = lgv->out.num_added = 0;
+	lgv->in.running = lgv->out.running = true;
+
+	/* Put everything in free lists. */
+	lgv->in.num_free = lgv->out.num_free = NUM_DESCS;
+	for (i = 0; i < NUM_DESCS-1; i++) {
+		lgv->in.desc[i].next = i+1;
+		lgv->out.desc[i].next = i+1;
+	}
+
+	lgv->vdev.ops = &lguest_virtio_ops;
+	lgv->vdev.dev = &lgdev->dev;
+	lgv->broken = false;
+	lgdev->private = lgv;	/* publish only once setup here can't fail */
+	return lgv;
+
+free_lgv:
+	kfree(lgv);
+	return NULL;
+}
+
+static void lg_destroy_virtio(struct lguest_virtio_device *lgv)
+{
+	lguest_unmap(lgv->in.desc);
+	kfree(lgv);
+}
+
+/* It's nice to have the name for the interrupt, so we do this separately
+ * from lg_new_virtio(). */
+static int lg_setup_interrupt(struct lguest_virtio_device *lgv,
+			      const char *name)
+{
+	int irqf;
+
+	if (lguest_devices[lgv->lg->index].features&LGUEST_DEVICE_F_RANDOMNESS)
+		irqf = IRQF_SAMPLE_RANDOM;
+	else
+		irqf = 0;
+
+	return request_irq(lgdev_irq(lgv->lg), lguest_virtio_interrupt, irqf,
+			   name, lgv);
+}
+
+/* Example network driver code. */
+#include <linux/virtio_net.h>
+#include <linux/etherdevice.h>
+
+static int lguest_virtnet_probe(struct lguest_device *lgdev)
+{
+	struct lguest_virtio_device *lgv;
+	struct net_device *dev;
+	u8 mac[ETH_ALEN];
+	int err;
+
+	lgv = lg_new_virtio(lgdev);
+	if (!lgv)
+		return -ENOMEM;
+
+	random_ether_addr(mac);
+	lgv->priv = dev = virtnet_probe(&lgv->vdev, mac);
+	if (IS_ERR(lgv->priv)) {
+		err = PTR_ERR(lgv->priv);
+		goto destroy;
+	}
+	err = lg_setup_interrupt(lgv, dev->name);
+	if (err)
+		goto unprobe;
+	return 0;
+
+unprobe:
+	virtnet_remove(dev);
+destroy:
+	lg_destroy_virtio(lgv);
+	return err;
+}
+
+static struct lguest_driver lguest_virtnet_drv = {
+	.name = "lguestvirtnet",
+	.owner = THIS_MODULE,
+	.device_type = LGUEST_DEVICE_T_VIRTNET,
+	.probe = lguest_virtnet_probe,
+};
+
+static __init int lguest_virtnet_init(void)
+{
+	return register_lguest_driver(&lguest_virtnet_drv);
+}
+device_initcall(lguest_virtnet_init);
+
+/* Example block driver code. */
+#include <linux/virtio_blk.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+static int lguest_virtblk_probe(struct lguest_device *lgdev)
+{
+	struct lguest_virtio_device *lgv;
+	struct gendisk *disk;
+	unsigned long sectors;
+	int err;
+
+	lgv = lg_new_virtio(lgdev);
+	if (!lgv)
+		return -ENOMEM;
+
+	/* Page is initially used to pass capacity. */
+	sectors = *(unsigned long *)lgv->in.desc;
+	*(unsigned long *)lgv->in.desc = 0;
+
+	lgv->priv = disk = virtblk_probe(&lgv->vdev);
+	if (IS_ERR(lgv->priv)) {
+		err = PTR_ERR(lgv->priv);
+		goto destroy;
+	}
+	set_capacity(disk, sectors);
+	blk_queue_max_hw_segments(disk->queue, NUM_DESCS-1);
+
+	err = lg_setup_interrupt(lgv, disk->disk_name);
+	if (err)
+		goto unprobe;
+	add_disk(disk);
+	return 0;
+
+unprobe:
+	virtblk_remove(disk);
+destroy:
+	lg_destroy_virtio(lgv);
+	return err;
+}
+
+static struct lguest_driver lguest_virtblk_drv = {
+	.name = "lguestvirtblk",
+	.owner = THIS_MODULE,
+	.device_type = LGUEST_DEVICE_T_VIRTBLK,
+	.probe = lguest_virtblk_probe,
+};
+
+static __init int lguest_virtblk_init(void)
+{
+	return register_lguest_driver(&lguest_virtblk_drv);
+}
+device_initcall(lguest_virtblk_init);
+
+MODULE_LICENSE("GPL");
===================================================================
--- a/include/linux/lguest.h
+++ b/include/linux/lguest.h
@@ -23,6 +23,9 @@
 #define LHCALL_SET_PTE		14
 #define LHCALL_SET_PMD		15
 #define LHCALL_LOAD_TLS		16
+
+/* Experimental hcalls for new I/O */
+#define LHCALL_NOTIFY	100 /* pfn */
 
 #define LG_CLOCK_MIN_DELTA	100UL
 #define LG_CLOCK_MAX_DELTA	ULONG_MAX
===================================================================
--- a/include/linux/lguest_launcher.h
+++ b/include/linux/lguest_launcher.h
@@ -44,6 +44,8 @@ struct lguest_device_desc {
 #define LGUEST_DEVICE_T_CONSOLE	1
 #define LGUEST_DEVICE_T_NET	2
 #define LGUEST_DEVICE_T_BLOCK	3
+#define LGUEST_DEVICE_T_VIRTNET	8
+#define LGUEST_DEVICE_T_VIRTBLK	9
 
 	u16 features;
 #define LGUEST_NET_F_NOCSUM		0x4000 /* Don't bother checksumming */
@@ -70,4 +72,26 @@ enum lguest_req
 	LHREQ_IRQ, /* + irq */
 	LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */
 };
+
+/* This marks a buffer as being the start (and active) */
+#define LGUEST_DESC_F_HEAD	1
+/* This marks a buffer as continuing via the next field. */
+#define LGUEST_DESC_F_NEXT	2
+
+/* Virtio descriptors */
+struct lguest_desc
+{
+	unsigned long pfn;
+	unsigned long len;
+	u16 offset;
+	u16 flags;
+	/* We chain unused descriptors via this, too */
+	u32 next;
+};
+
+struct lguest_used
+{
+	unsigned int id;
+	unsigned int len;
+};
 #endif /* _ASM_LGUEST_USER */


_______________________________________________
Virtualization mailing list
Virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx
https://lists.linux-foundation.org/mailman/listinfo/virtualization

[Index of Archives]     [KVM Development]     [Libvirt Development]     [Libvirt Users]     [CentOS Virtualization]     [Netdev]     [Ethernet Bridging]     [Linux Wireless]     [Kernel Newbies]     [Security]     [Linux for Hams]     [Netfilter]     [Bugtraq]     [Yosemite Forum]     [MIPS Linux]     [ARM Linux]     [Linux RAID]     [Linux Admin]     [Samba]

  Powered by Linux