Signed-off-by: Stefan Hajnoczi <stefanha@xxxxxxxxxxxxxxxxxx> --- hw/dataplane/event-poll.h | 79 +++++++++++++++++++ hw/dataplane/vring.h | 191 +++++++++++++++++++++++++++++++++++++++++++++ hw/virtio-blk.c | 149 ++++++++--------------------------- 3 files changed, 304 insertions(+), 115 deletions(-) create mode 100644 hw/dataplane/event-poll.h create mode 100644 hw/dataplane/vring.h diff --git a/hw/dataplane/event-poll.h b/hw/dataplane/event-poll.h new file mode 100644 index 0000000..f38e969 --- /dev/null +++ b/hw/dataplane/event-poll.h @@ -0,0 +1,79 @@ +#ifndef EVENT_POLL_H +#define EVENT_POLL_H + +#include <sys/epoll.h> +#include "event_notifier.h" + +typedef struct EventHandler EventHandler; +typedef void EventCallback(EventHandler *handler); +struct EventHandler +{ + EventNotifier *notifier; /* eventfd */ + EventCallback *callback; /* callback function */ +}; + +typedef struct { + int epoll_fd; /* epoll(2) file descriptor */ +} EventPoll; + +static void event_poll_init(EventPoll *poll) +{ + /* Create epoll file descriptor */ + poll->epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (poll->epoll_fd < 0) { + fprintf(stderr, "epoll_create1 failed: %m\n"); + exit(1); + } +} + +static void event_poll_cleanup(EventPoll *poll) +{ + close(poll->epoll_fd); + poll->epoll_fd = -1; +} + +/* Add an event notifier and its callback for polling */ +static void event_poll_add(EventPoll *poll, EventHandler *handler, EventNotifier *notifier, EventCallback *callback) +{ + struct epoll_event event = { + .events = EPOLLIN, + .data.ptr = handler, + }; + handler->notifier = notifier; + handler->callback = callback; + if (epoll_ctl(poll->epoll_fd, EPOLL_CTL_ADD, event_notifier_get_fd(notifier), &event) != 0) { + fprintf(stderr, "failed to add event handler to epoll: %m\n"); + exit(1); + } +} + +/* Block until the next event and invoke its callback + * + * Signals must be masked, EINTR should never happen. This is true for QEMU + * threads. 
+ */ +static void event_poll(EventPoll *poll) +{ + EventHandler *handler; + struct epoll_event event; + int nevents; + + /* Wait for the next event. Only do one event per call to keep the + * function simple, this could be changed later. */ + nevents = epoll_wait(poll->epoll_fd, &event, 1, -1); + if (unlikely(nevents != 1)) { + fprintf(stderr, "epoll_wait failed: %m\n"); + exit(1); /* should never happen */ + } + + /* Find out which event handler has become active */ + handler = event.data.ptr; + + /* Clear the eventfd */ + event_notifier_test_and_clear(handler->notifier); + + /* Handle the event */ + handler->callback(handler); +} + +#endif /* EVENT_POLL_H */ diff --git a/hw/dataplane/vring.h b/hw/dataplane/vring.h new file mode 100644 index 0000000..7099a99 --- /dev/null +++ b/hw/dataplane/vring.h @@ -0,0 +1,191 @@ +#ifndef VRING_H +#define VRING_H + +#include <linux/virtio_ring.h> +#include "qemu-common.h" + +typedef struct { + void *phys_mem_zero_host_ptr; /* host pointer to guest RAM */ + struct vring vr; /* virtqueue vring mapped to host memory */ + __u16 last_avail_idx; /* last processed avail ring index */ + __u16 last_used_idx; /* last processed used ring index */ +} Vring; + +static inline unsigned int vring_get_num(Vring *vring) +{ + return vring->vr.num; +} + +/* Map target physical address to host address + */ +static inline void *phys_to_host(Vring *vring, target_phys_addr_t phys) +{ + /* Adjust for 3.6-4 GB PCI memory range */ + if (phys >= 0x100000000) { + phys -= 0x100000000 - 0xe0000000; + } else if (phys >= 0xe0000000) { + fprintf(stderr, "phys_to_host bad physical address in PCI range %#lx\n", phys); + exit(1); + } + return vring->phys_mem_zero_host_ptr + phys; +} + +/* Setup for cheap target physical to host address conversion + * + * This is a hack for direct access to guest memory, we're not really allowed + * to do this. 
+ */ +static void setup_phys_to_host(Vring *vring) +{ + target_phys_addr_t len = 4096; /* RAM is really much larger but we cheat */ + vring->phys_mem_zero_host_ptr = cpu_physical_memory_map(0, &len, 0); + if (!vring->phys_mem_zero_host_ptr) { + fprintf(stderr, "setup_phys_to_host failed\n"); + exit(1); + } +} + +/* Map the guest's vring to host memory + * + * This is not allowed but we know the ring won't move. + */ +static void vring_setup(Vring *vring, VirtIODevice *vdev, int n) +{ + setup_phys_to_host(vring); + + vring_init(&vring->vr, virtio_queue_get_num(vdev, n), + phys_to_host(vring, virtio_queue_get_ring_addr(vdev, n)), 4096); + + vring->last_avail_idx = vring->vr.avail->idx; + vring->last_used_idx = vring->vr.used->idx; + + fprintf(stderr, "vring physical=%#lx desc=%p avail=%p used=%p\n", + virtio_queue_get_ring_addr(vdev, n), + vring->vr.desc, vring->vr.avail, vring->vr.used); +} + +/* This looks in the virtqueue for the first available buffer and converts + * it to an iovec for convenient access. Since descriptors consist of some + * number of output then some number of input descriptors, it's actually two + * iovecs, but we pack them into one and note how many of each there were. + * + * This function returns the descriptor number found, or vring->vr.num (which + * is never a valid descriptor number) if none was found. A malformed ring is + * fatal: an error is printed and the process exits. + * + * Stolen from linux-2.6/drivers/vhost/vhost.c. + */ +static unsigned int vring_pop(Vring *vring, + struct iovec iov[], unsigned int iov_size, + unsigned int *out_num, unsigned int *in_num) +{ + struct vring_desc desc; + unsigned int i, head, found = 0, num = vring->vr.num; + __u16 avail_idx, last_avail_idx; + + /* Check it isn't doing very strange things with descriptor numbers. 
*/ + last_avail_idx = vring->last_avail_idx; + avail_idx = vring->vr.avail->idx; + + if (unlikely((__u16)(avail_idx - last_avail_idx) > num)) { + fprintf(stderr, "Guest moved used index from %u to %u\n", + last_avail_idx, avail_idx); + exit(1); + } + + /* If there's nothing new since last we looked, return invalid. */ + if (avail_idx == last_avail_idx) + return num; + + /* Only get avail ring entries after they have been exposed by guest. */ + __sync_synchronize(); /* smp_rmb() */ + + /* Grab the next descriptor number they're advertising. The index we've + * seen is only incremented once the whole chain has been parsed. */ + head = vring->vr.avail->ring[last_avail_idx % num]; + + /* If their number is silly, that's an error. */ + if (unlikely(head >= num)) { + fprintf(stderr, "Guest says index %u > %u is available\n", + head, num); + exit(1); + } + + /* When we start there are neither input nor output descriptors. */ + *out_num = *in_num = 0; + + i = head; + do { + if (unlikely(i >= num)) { + fprintf(stderr, "Desc index is %u > %u, head = %u\n", + i, num, head); + exit(1); + } + if (unlikely(++found > num)) { + fprintf(stderr, "Loop detected: last one at %u " + "vq size %u head %u\n", + i, num, head); + exit(1); + } + desc = vring->vr.desc[i]; + if (desc.flags & VRING_DESC_F_INDIRECT) { +/* ret = get_indirect(dev, vq, iov, iov_size, + out_num, in_num, + log, log_num, &desc); + if (unlikely(ret < 0)) { + vq_err(vq, "Failure detected " + "in indirect descriptor at idx %d\n", i); + return ret; + } + continue; */ + fprintf(stderr, "virtio-blk indirect vring not supported\n"); + exit(1); + } + + iov->iov_base = phys_to_host(vring, desc.addr); + iov->iov_len = desc.len; + iov++; + + if (desc.flags & VRING_DESC_F_WRITE) { + /* If this is an input descriptor, + * increment that count. */ + *in_num += 1; + } else { + /* If it's an output descriptor, they're all supposed + * to come before any input descriptors. 
*/ + if (unlikely(*in_num)) { + fprintf(stderr, "Descriptor has out after in: " + "idx %d\n", i); + exit(1); + } + *out_num += 1; + } + i = desc.next; + } while (desc.flags & VRING_DESC_F_NEXT); + + /* On success, increment avail index. */ + vring->last_avail_idx++; + return head; +} + +/* After we've used one of their buffers, we tell them about it. + * + * Stolen from linux-2.6/drivers/vhost/vhost.c. + */ +static __attribute__((unused)) void vring_push(Vring *vring, unsigned int head, int len) +{ + struct vring_used_elem *used; + + /* The virtqueue contains a ring of used buffers. Get a pointer to the + * next entry in that used ring. */ + used = &vring->vr.used->ring[vring->last_used_idx % vring->vr.num]; + used->id = head; + used->len = len; + + /* Make sure buffer is written before we update index. */ + __sync_synchronize(); /* smp_wmb() */ + + vring->vr.used->idx = ++vring->last_used_idx; +} + +#endif /* VRING_H */ diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c index 99654f1..2c1cce8 100644 --- a/hw/virtio-blk.c +++ b/hw/virtio-blk.c @@ -11,26 +11,21 @@ * */ -#include <sys/epoll.h> -#include <sys/eventfd.h> #include <libaio.h> -#include <linux/virtio_ring.h> #include "qemu-common.h" #include "qemu-thread.h" #include "qemu-error.h" #include "blockdev.h" #include "virtio-blk.h" +#include "hw/dataplane/event-poll.h" +#include "hw/dataplane/vring.h" +#include "kvm.h" enum { - SEG_MAX = 126, /* maximum number of I/O segments */ + SEG_MAX = 126, /* maximum number of I/O segments */ + VRING_MAX = SEG_MAX + 2, /* maximum number of vring descriptors */ }; -typedef struct -{ - EventNotifier *notifier; /* eventfd */ - void (*handler)(void); /* handler function */ -} EventHandler; - typedef struct VirtIOBlock { VirtIODevice vdev; @@ -44,15 +39,13 @@ typedef struct VirtIOBlock bool data_plane_started; QemuThread data_plane_thread; - struct vring vring; + Vring vring; /* virtqueue vring */ - int epoll_fd; /* epoll(2) file descriptor */ + EventPoll event_poll; /* event 
poller */ io_context_t io_ctx; /* Linux AIO context */ EventNotifier io_notifier; /* Linux AIO eventfd */ EventHandler io_handler; /* Linux AIO completion handler */ EventHandler notify_handler; /* virtqueue notify handler */ - - void *phys_mem_zero_host_ptr; /* host pointer to guest RAM */ } VirtIOBlock; static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev) @@ -60,138 +53,64 @@ static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev) return (VirtIOBlock *)vdev; } -/* Map target physical address to host address - */ -static inline void *phys_to_host(VirtIOBlock *s, target_phys_addr_t phys) +static void handle_io(EventHandler *handler) { - /* Adjust for 3.6-4 GB PCI memory range */ - if (phys >= 0x100000000) { - phys -= 0x100000000 - 0xe0000000; - } else if (phys >= 0xe0000000) { - fprintf(stderr, "phys_to_host bad physical address in PCI range %#lx\n", phys); - exit(1); - } - return s->phys_mem_zero_host_ptr + phys; + fprintf(stderr, "io completion happened\n"); } -/* Setup for cheap target physical to host address conversion - * - * This is a hack for direct access to guest memory, we're not really allowed - * to do this. - */ -static void setup_phys_to_host(VirtIOBlock *s) +static void handle_notify(EventHandler *handler) { - target_phys_addr_t len = 4096; /* RAM is really much larger but we cheat */ - s->phys_mem_zero_host_ptr = cpu_physical_memory_map(0, &len, 0); - if (!s->phys_mem_zero_host_ptr) { - fprintf(stderr, "setup_phys_to_host failed\n"); - exit(1); + VirtIOBlock *s = container_of(handler, VirtIOBlock, notify_handler); + struct iovec iov[VRING_MAX]; + unsigned int out_num, in_num; + int head; + + head = vring_pop(&s->vring, iov, ARRAY_SIZE(iov), &out_num, &in_num); + if (unlikely(head >= vring_get_num(&s->vring))) { + fprintf(stderr, "false alarm, nothing on vring\n"); + return; } -} -/* Map the guest's vring to host memory - * - * This is not allowed but we know the ring won't move. 
- */ -static void map_vring(struct vring *vring, VirtIOBlock *s, VirtIODevice *vdev, int n) -{ - vring->num = virtio_queue_get_num(vdev, n); - vring->desc = phys_to_host(s, virtio_queue_get_desc_addr(vdev, n)); - vring->avail = phys_to_host(s, virtio_queue_get_avail_addr(vdev, n)); - vring->used = phys_to_host(s, virtio_queue_get_used_addr(vdev, n)); - - fprintf(stderr, "virtio-blk vring physical=%#lx desc=%p avail=%p used=%p\n", - virtio_queue_get_ring_addr(vdev, n), - vring->desc, vring->avail, vring->used); -} - -static void handle_io(void) -{ - fprintf(stderr, "io completion happened\n"); -} - -static void handle_notify(void) -{ - fprintf(stderr, "virtqueue notify happened\n"); + fprintf(stderr, "head=%u out_num=%u in_num=%u\n", head, out_num, in_num); } static void *data_plane_thread(void *opaque) { VirtIOBlock *s = opaque; - struct epoll_event event; - int nevents; - EventHandler *event_handler; - - /* Signals are masked, EINTR should never happen */ for (;;) { - /* Wait for the next event. Only do one event per call to keep the - * function simple, this could be changed later. 
*/ - nevents = epoll_wait(s->epoll_fd, &event, 1, -1); - if (unlikely(nevents != 1)) { - fprintf(stderr, "epoll_wait failed: %m\n"); - continue; /* should never happen */ - } - - /* Find out which event handler has become active */ - event_handler = event.data.ptr; - - /* Clear the eventfd */ - event_notifier_test_and_clear(event_handler->notifier); - - /* Handle the event */ - event_handler->handler(); + event_poll(&s->event_poll); } return NULL; } -static void add_event_handler(int epoll_fd, EventHandler *event_handler) -{ - struct epoll_event event = { - .events = EPOLLIN, - .data.ptr = event_handler, - }; - if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, event_notifier_get_fd(event_handler->notifier), &event) != 0) { - fprintf(stderr, "virtio-blk failed to add event handler to epoll: %m\n"); - exit(1); - } -} - static void data_plane_start(VirtIOBlock *s) { - setup_phys_to_host(s); - map_vring(&s->vring, s, &s->vdev, 0); - - /* Create epoll file descriptor */ - s->epoll_fd = epoll_create1(EPOLL_CLOEXEC); - if (s->epoll_fd < 0) { - fprintf(stderr, "epoll_create1 failed: %m\n"); - return; /* TODO error handling */ - } + vring_setup(&s->vring, &s->vdev, 0); + + event_poll_init(&s->event_poll); if (s->vdev.binding->set_host_notifier(s->vdev.binding_opaque, 0, true) != 0) { - fprintf(stderr, "virtio-blk failed to set host notifier\n"); - return; /* TODO error handling */ + fprintf(stderr, "virtio-blk failed to set host notifier, ensure -enable-kvm is set\n"); + exit(1); } - s->notify_handler.notifier = virtio_queue_get_host_notifier(s->vq), - s->notify_handler.handler = handle_notify; - add_event_handler(s->epoll_fd, &s->notify_handler); + event_poll_add(&s->event_poll, &s->notify_handler, + virtio_queue_get_host_notifier(s->vq), + handle_notify); /* Create aio context */ if (io_setup(SEG_MAX, &s->io_ctx) != 0) { fprintf(stderr, "virtio-blk io_setup failed\n"); - return; /* TODO error handling */ + exit(1); } if (event_notifier_init(&s->io_notifier, 0) != 0) { 
fprintf(stderr, "virtio-blk io event notifier creation failed\n"); - return; /* TODO error handling */ + exit(1); } - s->io_handler.notifier = &s->io_notifier; - s->io_handler.handler = handle_io; - add_event_handler(s->epoll_fd, &s->io_handler); + event_poll_add(&s->event_poll, &s->io_handler, &s->io_notifier, handle_io); qemu_thread_create(&s->data_plane_thread, data_plane_thread, s, QEMU_THREAD_JOINABLE); @@ -209,7 +128,7 @@ static void data_plane_stop(VirtIOBlock *s) s->vdev.binding->set_host_notifier(s->vdev.binding_opaque, 0, false); - close(s->epoll_fd); + event_poll_cleanup(&s->event_poll); } static void virtio_blk_set_status(VirtIODevice *vdev, uint8_t val) @@ -317,7 +236,7 @@ VirtIODevice *virtio_blk_init(DeviceState *dev, BlockConf *conf, s->sector_mask = (s->conf->logical_block_size / BDRV_SECTOR_SIZE) - 1; bdrv_guess_geometry(s->bs, &cylinders, &heads, &secs); - s->vq = virtio_add_queue(&s->vdev, SEG_MAX + 2, virtio_blk_handle_output); + s->vq = virtio_add_queue(&s->vdev, VRING_MAX, virtio_blk_handle_output); s->data_plane_started = false; s->qdev = dev; -- 1.7.10.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html