This commit adds an I/O scheduler that sorts requests and merges adjacent requests if they have the same operation type (read/write). The code is ugly and not very well factored but it does merge successfully. --- hw/dataplane/ioq.h | 3 +- hw/dataplane/iosched.h | 51 +++++++++++++++++--------- hw/dataplane/vring.h | 4 +-- hw/virtio-blk.c | 93 +++++++++++++++++++++++++++++++++++++++++++----- 4 files changed, 122 insertions(+), 29 deletions(-) diff --git a/hw/dataplane/ioq.h b/hw/dataplane/ioq.h index d1545d6..72e5fd6 100644 --- a/hw/dataplane/ioq.h +++ b/hw/dataplane/ioq.h @@ -96,7 +96,7 @@ static int ioq_submit(IOQueue *ioq) int rc = io_submit(ioq->io_ctx, ioq->queue_idx, ioq->queue); if (unlikely(rc < 0)) { unsigned int i; - fprintf(stderr, "io_submit io_ctx=%#lx nr=%d iovecs=%p\n", (uint64_t)ioq->io_ctx, ioq->queue_idx, ioq->queue); + fprintf(stderr, "io_submit failed io_ctx=%#lx nr=%d iovecs=%p rc=%d\n", (uint64_t)ioq->io_ctx, ioq->queue_idx, ioq->queue, rc); for (i = 0; i < ioq->queue_idx; i++) { fprintf(stderr, "[%u] type=%#x fd=%d\n", i, ioq->queue[i]->aio_lio_opcode, ioq->queue[i]->aio_fildes); } @@ -121,7 +121,6 @@ static int ioq_run_completion(IOQueue *ioq, IOQueueCompletion *completion, void ssize_t ret = ((uint64_t)events[i].res2 << 32) | events[i].res; completion(events[i].obj, ret, opaque); - ioq_put_iocb(ioq, events[i].obj); } return nevents; } diff --git a/hw/dataplane/iosched.h b/hw/dataplane/iosched.h index 12ebccc..39da73c 100644 --- a/hw/dataplane/iosched.h +++ b/hw/dataplane/iosched.h @@ -9,6 +9,8 @@ typedef struct { unsigned long sched_calls; } IOSched; +typedef void MergeFunc(struct iocb *a, struct iocb *b); + static int iocb_cmp(const void *a, const void *b) { const struct iocb *iocb_a = a; @@ -29,10 +31,10 @@ static int iocb_cmp(const void *a, const void *b) static size_t iocb_nbytes(struct iocb *iocb) { - struct iovec *iov = iocb->u.c.buf; + const struct iovec *iov = iocb->u.v.vec; size_t nbytes = 0; size_t i; - for (i = 0; i < iocb->u.c.nbytes; i++) { + for (i = 0; i < iocb->u.v.nr; i++) { nbytes += iov->iov_len; iov++; } @@ -44,35 +46,52 @@ static void iosched_init(IOSched *iosched) memset(iosched, 0, sizeof *iosched); } -static void iosched_print_stats(IOSched *iosched) +static __attribute__((unused)) void iosched_print_stats(IOSched *iosched) { fprintf(stderr, "iocbs = %lu merges = %lu sched_calls = %lu\n", iosched->iocbs, iosched->merges, iosched->sched_calls); memset(iosched, 0, sizeof *iosched); } -static void iosched(IOSched *iosched, struct iocb *unsorted[], unsigned int count) +static void iosched(IOSched *iosched, struct iocb *unsorted[], unsigned int *count, MergeFunc merge_func) { - struct iocb *sorted[count]; - struct iocb *last; - unsigned int i; + struct iocb *sorted[*count]; + unsigned int merges = 0; + unsigned int i, j; + /* if ((++iosched->sched_calls % 1000) == 0) { iosched_print_stats(iosched); } + */ + + if (!*count) { + return; + } memcpy(sorted, unsorted, sizeof sorted); - qsort(sorted, count, sizeof sorted[0], iocb_cmp); - - iosched->iocbs += count; - last = sorted[0]; - for (i = 1; i < count; i++) { - if (last->aio_lio_opcode == sorted[i]->aio_lio_opcode && - last->u.c.offset + iocb_nbytes(last) == sorted[i]->u.c.offset) { - iosched->merges++; + qsort(sorted, *count, sizeof sorted[0], iocb_cmp); + + unsorted[0] = sorted[0]; + j = 1; + for (i = 1; i < *count; i++) { + struct iocb *last = sorted[i - 1]; + struct iocb *cur = sorted[i]; + + if (last->aio_lio_opcode == cur->aio_lio_opcode && + last->u.c.offset + iocb_nbytes(last) == cur->u.c.offset) { + merge_func(last, cur); + merges++; + + unsorted[j - 1] = cur; + } else { + unsorted[j++] = cur; } - last = sorted[i]; } + + iosched->merges += merges; + iosched->iocbs += *count; + *count = j; } #endif /* IOSCHED_H */ diff --git a/hw/dataplane/vring.h b/hw/dataplane/vring.h index cdd4d4a..bbf8c86 100644 --- a/hw/dataplane/vring.h +++ b/hw/dataplane/vring.h @@ -29,7 +29,7 @@ static inline void *phys_to_host(Vring *vring, target_phys_addr_t phys) if (phys >= 0x100000000) { phys -= 0x100000000 - 0xe0000000; } else if (phys >= 0xe0000000) { - fprintf(stderr, "phys_to_host bad physical address in PCI range %#lx\n", phys); + fprintf(stderr, "phys_to_host bad physical address in PCI range %#lx\n", (unsigned long)phys); exit(1); } return vring->phys_mem_zero_host_ptr + phys; @@ -65,7 +65,7 @@ static void vring_setup(Vring *vring, VirtIODevice *vdev, int n) vring->last_used_idx = 0; fprintf(stderr, "vring physical=%#lx desc=%p avail=%p used=%p\n", - virtio_queue_get_ring_addr(vdev, n), + (unsigned long)virtio_queue_get_ring_addr(vdev, n), vring->vr.desc, vring->vr.avail, vring->vr.used); } diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c index 75cb0f2..9131a7a 100644 --- a/hw/virtio-blk.c +++ b/hw/virtio-blk.c @@ -33,11 +33,29 @@ enum { * VRING_MAX with indirect descriptors */ }; -typedef struct { +/** I/O request + * + * Most I/O requests need to know the vring index (head) and the completion + * status byte that must be filled in to tell the guest whether or not the + * request succeeded. + * + * The iovec array pointed to by the iocb is valid only before ioq_submit() is + * called. After that, neither the kernel nor userspace needs to access the + * iovec anymore and the memory is no longer owned by this VirtIOBlockRequest. + * + * Requests can be merged together by the I/O scheduler. When this happens, + * the next_merged field is used to link the requests and only the first + * request's iocb is used. Merged requests require memory allocation for the + * iovec array and must be freed appropriately. + */ +typedef struct VirtIOBlockRequest VirtIOBlockRequest; +struct VirtIOBlockRequest { struct iocb iocb; /* Linux AIO control block */ unsigned char *status; /* virtio block status code */ unsigned int head; /* vring descriptor index */ -} VirtIOBlockRequest; + int len; /* number of I/O bytes, only used for merged reqs */ + VirtIOBlockRequest *next_merged;/* next merged iocb or NULL */ +}; typedef struct { VirtIODevice vdev; @@ -93,15 +111,17 @@ static void virtio_blk_notify_guest(VirtIOBlock *s) event_notifier_set(virtio_queue_get_guest_notifier(s->vq)); } -static void complete_request(struct iocb *iocb, ssize_t ret, void *opaque) +static void complete_one_request(VirtIOBlockRequest *req, VirtIOBlock *s, ssize_t ret) { - VirtIOBlock *s = opaque; - VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb); int len; if (likely(ret >= 0)) { *req->status = VIRTIO_BLK_S_OK; - len = ret; + + /* Merged requests know their part of the length, single requests can + * just use the return value. + */ + len = unlikely(req->len) ? req->len : ret; } else { *req->status = VIRTIO_BLK_S_IOERR; len = 0; @@ -114,6 +134,59 @@ static void complete_request(struct iocb *iocb, ssize_t ret, void *opaque) vring_push(&s->vring, req->head, len + sizeof req->status); } +static bool is_request_merged(VirtIOBlockRequest *req) +{ + return req->next_merged; +} + +static void complete_request(struct iocb *iocb, ssize_t ret, void *opaque) +{ + VirtIOBlock *s = opaque; + VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb); + + /* Free the iovec now, it isn't needed */ + if (unlikely(is_request_merged(req))) { + g_free((void*)iocb->u.v.vec); + } + + while (req) { + complete_one_request(req, s, ret); + + VirtIOBlockRequest *next = req->next_merged; + ioq_put_iocb(&s->ioqueue, &req->iocb); + req = next; + } +} + +static void merge_request(struct iocb *iocb_a, struct iocb *iocb_b) +{ + /* Repeated merging could be made more efficient using realloc, but this + * approach keeps it simple. */ + + VirtIOBlockRequest *req_a = container_of(iocb_a, VirtIOBlockRequest, iocb); + VirtIOBlockRequest *req_b = container_of(iocb_b, VirtIOBlockRequest, iocb); + struct iovec *iovec = g_malloc((iocb_a->u.v.nr + iocb_b->u.v.nr) * sizeof iovec[0]); + + memcpy(iovec, iocb_a->u.v.vec, iocb_a->u.v.nr * sizeof iovec[0]); + memcpy(iovec + iocb_a->u.v.nr, iocb_b->u.v.vec, iocb_b->u.v.nr * sizeof iovec[0]); + + if (is_request_merged(req_a)) { + /* Free the old merged iovec */ + g_free((void*)iocb_a->u.v.vec); + } else { + /* Stash the request length */ + req_a->len = iocb_nbytes(iocb_a); + } + + iocb_b->u.v.vec = iovec; + req_b->len = iocb_nbytes(iocb_b); + req_b->next_merged = req_a; + /* + fprintf(stderr, "merged %p (%u) and %p (%u), %u iovecs in total\n", + req_a, iocb_a->u.v.nr, req_b, iocb_b->u.v.nr, iocb_a->u.v.nr + iocb_b->u.v.nr); + */ +} + static void process_request(IOQueue *ioq, struct iovec iov[], unsigned int out_num, unsigned int in_num, unsigned int head) { /* Virtio block requests look like this: */ @@ -187,6 +260,8 @@ static void process_request(IOQueue *ioq, struct iovec iov[], unsigned int out_n VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb); req->head = head; req->status = &inhdr->status; + req->len = 0; + req->next_merged = NULL; } static bool handle_notify(EventHandler *handler) @@ -251,7 +326,7 @@ static bool handle_notify(EventHandler *handler) } } - iosched(&s->iosched, s->ioqueue.queue, s->ioqueue.queue_idx); + iosched(&s->iosched, s->ioqueue.queue, &s->ioqueue.queue_idx, merge_request); /* Submit requests, if any */ int rc = ioq_submit(&s->ioqueue); @@ -298,7 +373,7 @@ static void data_plane_start(VirtIOBlock *s) /* Set up guest notifier (irq) */ if (s->vdev.binding->set_guest_notifiers(s->vdev.binding_opaque, true) != 0) { - fprintf(stderr, "virtio-blk failed to set guest notifier, ensure -enable-kvm is set\n"); + fprintf(stderr, "virtio-blk failed to set guest notifier\n"); exit(1); } @@ -306,7 +381,7 @@ static void data_plane_start(VirtIOBlock *s) /* Set up virtqueue notify */ if (s->vdev.binding->set_host_notifier(s->vdev.binding_opaque, 0, true) != 0) { - fprintf(stderr, "virtio-blk failed to set host notifier\n"); + fprintf(stderr, "virtio-blk failed to set host notifier, ensure -enable-kvm is set\n"); exit(1); } event_poll_add(&s->event_poll, &s->notify_handler, -- 1.7.10.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html