Nearly any operation on a virtqueue requires multiple reads/writes to the virtqueue ring descriptors using guest physical addresses (ld*/st*_phys). These are expensive and result in phys_page_find_alloc() and qemu_get_ram_ptr() showing up at the top of profiles run under virtio net/block workloads. We can avoid much of this by mapping the ring descriptors once using cpu_physical_memory_map() and referencing them through host virtual addresses. This does a good job of dropping these high hitters down in the profile, and generally shows a minor improvement in block and net throughput and latency.

I'm posting this as an RFC because this isn't nearly the slam-dunk performance improvement that the virtio-net TX bottom half handler was. On average, I see just shy of a 7% improvement in overall network performance. I'm having a hard time pinning down any numbers for block, since the gains typically seem quite a bit smaller than my standard deviation. I'm getting tired of testing it, so I'll toss it out and see if anyone thinks this is a good idea. I have tested this with local migrations and haven't seen any issues.

Signed-off-by: Alex Williamson <alex.williamson@xxxxxxxxxx>
---
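To make the conversion concrete, here's a minimal sketch of the before/after access pattern for a single ring field (illustrative only, not part of the patch; the function names are made up, and VRing stands for the extended struct with the *_va fields added below):

/* Before: every access re-translates the guest physical address
 * through the ld*/st*_phys helpers. */
static uint16_t vring_avail_idx_slow(VRing *vring)
{
    return lduw_phys(vring->avail + offsetof(VRingAvail, idx));
}

/* After: the ring was mapped once at virtqueue_init() time with
 * cpu_physical_memory_map(), so a read is just a host pointer
 * dereference through the ld*_p helpers. */
static uint16_t vring_avail_idx_fast(VRing *vring)
{
    return lduw_p(vring->avail_va + offsetof(VRingAvail, idx));
}

/* Stores are the subtle part: st*_p() bypasses the dirty tracking
 * that st*_phys() did for us, so we mark the bytes dirty by hand
 * (sync_dirty_bytes() is the helper this patch adds) to keep live
 * migration and TCG code invalidation working. */
static void vring_used_idx_store(VRing *vring, uint16_t val)
{
    void *ptr = vring->used_va + offsetof(VRingUsed, idx);
    stw_p(ptr, val);
    sync_dirty_bytes(ptr, 2);
}
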
 Makefile.objs   |    3 -
 Makefile.target |    5 +
 hw/virtio.c     |  230 +++++++++++++++++++++++++++++++++++++++----------------
 3 files changed, 164 insertions(+), 74 deletions(-)

diff --git a/Makefile.objs b/Makefile.objs
index 4a1eaa1..f362b8f 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -145,7 +145,6 @@ user-obj-y += cutils.o cache-utils.o
 
 hw-obj-y =
 hw-obj-y += vl.o loader.o
-hw-obj-y += virtio.o virtio-console.o
 hw-obj-y += fw_cfg.o pci.o pci_host.o pcie_host.o
 hw-obj-y += watchdog.o
 hw-obj-$(CONFIG_ISA_MMIO) += isa_mmio.o
@@ -244,8 +243,6 @@ sound-obj-$(CONFIG_CS4231A) += cs4231a.o
 adlib.o fmopl.o: QEMU_CFLAGS += -DBUILD_Y8950=0
 hw-obj-$(CONFIG_SOUND) += $(sound-obj-y)
 
-hw-obj-$(CONFIG_VIRTFS) += virtio-9p-debug.o virtio-9p-local.o
-
 ######################################################################
 # libdis
 # NOTE: the disassembler code is only needed for debugging
diff --git a/Makefile.target b/Makefile.target
index 18826bb..4e0cf8d 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -165,11 +165,12 @@ ifdef CONFIG_SOFTMMU
 obj-y = arch_init.o cpus.o monitor.o machine.o gdbstub.o balloon.o
 # virtio has to be here due to weird dependency between PCI and virtio-net.
 # need to fix this properly
-obj-y += virtio-blk.o virtio-balloon.o virtio-net.o virtio-serial-bus.o
+obj-y += virtio.o virtio-console.o virtio-blk.o virtio-balloon.o
+obj-y += virtio-net.o virtio-serial-bus.o
 obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o
 obj-y += vhost_net.o
 obj-$(CONFIG_VHOST_NET) += vhost.o
-obj-$(CONFIG_VIRTFS) += virtio-9p.o
+obj-$(CONFIG_VIRTFS) += virtio-9p.o virtio-9p-debug.o virtio-9p-local.o
 obj-y += rwhandler.o
 obj-$(CONFIG_KVM) += kvm.o kvm-all.o
 obj-$(CONFIG_NO_KVM) += kvm-stub.o
diff --git a/hw/virtio.c b/hw/virtio.c
index 85312b3..0dcf375 100644
--- a/hw/virtio.c
+++ b/hw/virtio.c
@@ -15,6 +15,8 @@
 #include "virtio.h"
 #include "sysemu.h"
+#include "cpu.h"
+#include "exec-all.h"
 
 /* The alignment to use between consumer and producer parts of vring.
  * x86 pagesize again.
  */
@@ -61,8 +63,11 @@ typedef struct VRing
 {
     unsigned int num;
     target_phys_addr_t desc;
+    void *desc_va;
     target_phys_addr_t avail;
+    void *avail_va;
     target_phys_addr_t used;
+    void *used_va;
 } VRing;
 
 struct VirtQueue
@@ -78,107 +83,145 @@ struct VirtQueue
     EventNotifier host_notifier;
 };
 
+static void *virtqueue_map(target_phys_addr_t addr,
+                           target_phys_addr_t len, int write)
+{
+    void *map;
+    target_phys_addr_t mapped = len;
+
+    map = cpu_physical_memory_map(addr, &mapped, write);
+    if (!map) {
+        return NULL;
+    }
+    if (mapped != len) {
+        cpu_physical_memory_unmap(map, mapped, write, 0);
+        return NULL;
+    }
+    return map;
+}
+
 /* virt queue functions */
 static void virtqueue_init(VirtQueue *vq)
 {
-    target_phys_addr_t pa = vq->pa;
+    target_phys_addr_t size, pa = vq->pa;
 
     vq->vring.desc = pa;
     vq->vring.avail = pa + vq->vring.num * sizeof(VRingDesc);
     vq->vring.used = vring_align(vq->vring.avail +
                                  offsetof(VRingAvail, ring[vq->vring.num]),
                                  VIRTIO_PCI_VRING_ALIGN);
+
+    size = vq->vring.num * sizeof(VRingDesc);
+    if (!(vq->vring.desc_va = virtqueue_map(vq->vring.desc, size, 0))) {
+        fprintf(stderr, "Failed to map vring.desc\n");
+        exit(1); /* Ugh, wishing for -ENOMEM */
+    }
+    size = offsetof(VRingAvail, ring[vq->vring.num]);
+    if (!(vq->vring.avail_va = virtqueue_map(vq->vring.avail, size, 0))) {
+        fprintf(stderr, "Failed to map vring.avail\n");
+        exit(1); /* Ugh, wishing for -ENOMEM */
+    }
+    size = offsetof(VRingUsed, ring[vq->vring.num]);
+    if (!(vq->vring.used_va = virtqueue_map(vq->vring.used, size, 1))) {
+        fprintf(stderr, "Failed to map vring.used\n");
+        exit(1); /* Ugh, wishing for -ENOMEM */
+    }
 }
 
-static inline uint64_t vring_desc_addr(target_phys_addr_t desc_pa, int i)
+static inline void sync_dirty_bytes(void *ptr, int bytes)
 {
-    target_phys_addr_t pa;
-    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, addr);
-    return ldq_phys(pa);
+    if (cpu_physical_memory_get_dirty_tracking()) {
+        ram_addr_t addr = qemu_ram_addr_from_host(ptr);
+
+        if (!cpu_physical_memory_is_dirty(addr)) {
+            tb_invalidate_phys_page_range(addr, addr + bytes, 0);
+            cpu_physical_memory_set_dirty_flags(addr,
+                                                (0xff & ~CODE_DIRTY_FLAG));
+        }
+    }
 }
 
-static inline uint32_t vring_desc_len(target_phys_addr_t desc_pa, int i)
+static inline uint64_t vring_desc_addr(void *desc_va, int i)
 {
-    target_phys_addr_t pa;
-    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, len);
-    return ldl_phys(pa);
+    void *ptr = desc_va + sizeof(VRingDesc) * i + offsetof(VRingDesc, addr);
+    return ldq_p(ptr);
 }
 
-static inline uint16_t vring_desc_flags(target_phys_addr_t desc_pa, int i)
+static inline uint32_t vring_desc_len(void *desc_va, int i)
 {
-    target_phys_addr_t pa;
-    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, flags);
-    return lduw_phys(pa);
+    void *ptr = desc_va + sizeof(VRingDesc) * i + offsetof(VRingDesc, len);
+    return ldl_p(ptr);
 }
 
-static inline uint16_t vring_desc_next(target_phys_addr_t desc_pa, int i)
+static inline uint16_t vring_desc_flags(void *desc_va, int i)
 {
-    target_phys_addr_t pa;
-    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, next);
-    return lduw_phys(pa);
+    void *ptr = desc_va + sizeof(VRingDesc) * i + offsetof(VRingDesc, flags);
+    return lduw_p(ptr);
+}
+
+static inline uint16_t vring_desc_next(void *desc_va, int i)
+{
+    void *ptr = desc_va + sizeof(VRingDesc) * i + offsetof(VRingDesc, next);
+    return lduw_p(ptr);
 }
 
 static inline uint16_t vring_avail_flags(VirtQueue *vq)
 {
-    target_phys_addr_t pa;
-    pa = vq->vring.avail + offsetof(VRingAvail, flags);
-    return lduw_phys(pa);
+    void *ptr = vq->vring.avail_va + offsetof(VRingAvail, flags);
+    return lduw_p(ptr);
 }
 
 static inline uint16_t vring_avail_idx(VirtQueue *vq)
 {
-    target_phys_addr_t pa;
-    pa = vq->vring.avail + offsetof(VRingAvail, idx);
-    return lduw_phys(pa);
+    void *ptr = vq->vring.avail_va + offsetof(VRingAvail, idx);
+    return lduw_p(ptr);
 }
 
 static inline uint16_t vring_avail_ring(VirtQueue *vq, int i)
 {
-    target_phys_addr_t pa;
-    pa = vq->vring.avail + offsetof(VRingAvail, ring[i]);
-    return lduw_phys(pa);
+    void *ptr = vq->vring.avail_va + offsetof(VRingAvail, ring[i]);
+    return lduw_p(ptr);
 }
 
 static inline void vring_used_ring_id(VirtQueue *vq, int i, uint32_t val)
 {
-    target_phys_addr_t pa;
-    pa = vq->vring.used + offsetof(VRingUsed, ring[i].id);
-    stl_phys(pa, val);
+    void *ptr = vq->vring.used_va + offsetof(VRingUsed, ring[i].id);
+    stl_p(ptr, val);
+    sync_dirty_bytes(ptr, 4);
 }
 
 static inline void vring_used_ring_len(VirtQueue *vq, int i, uint32_t val)
 {
-    target_phys_addr_t pa;
-    pa = vq->vring.used + offsetof(VRingUsed, ring[i].len);
-    stl_phys(pa, val);
+    void *ptr = vq->vring.used_va + offsetof(VRingUsed, ring[i].len);
+    stl_p(ptr, val);
+    sync_dirty_bytes(ptr, 4);
 }
 
 static uint16_t vring_used_idx(VirtQueue *vq)
 {
-    target_phys_addr_t pa;
-    pa = vq->vring.used + offsetof(VRingUsed, idx);
-    return lduw_phys(pa);
+    void *ptr = vq->vring.used_va + offsetof(VRingUsed, idx);
+    return lduw_p(ptr);
 }
 
 static inline void vring_used_idx_increment(VirtQueue *vq, uint16_t val)
 {
-    target_phys_addr_t pa;
-    pa = vq->vring.used + offsetof(VRingUsed, idx);
-    stw_phys(pa, vring_used_idx(vq) + val);
+    void *ptr = vq->vring.used_va + offsetof(VRingUsed, idx);
+    stw_p(ptr, vring_used_idx(vq) + val);
+    sync_dirty_bytes(ptr, 2);
 }
 
 static inline void vring_used_flags_set_bit(VirtQueue *vq, int mask)
 {
-    target_phys_addr_t pa;
-    pa = vq->vring.used + offsetof(VRingUsed, flags);
-    stw_phys(pa, lduw_phys(pa) | mask);
+    void *ptr = vq->vring.used_va + offsetof(VRingUsed, flags);
+    stw_p(ptr, lduw_p(ptr) | mask);
+    sync_dirty_bytes(ptr, 2);
 }
 
 static inline void vring_used_flags_unset_bit(VirtQueue *vq, int mask)
 {
-    target_phys_addr_t pa;
-    pa = vq->vring.used + offsetof(VRingUsed, flags);
-    stw_phys(pa, lduw_phys(pa) & ~mask);
+    void *ptr = vq->vring.used_va + offsetof(VRingUsed, flags);
+    stw_p(ptr, lduw_p(ptr) & ~mask);
+    sync_dirty_bytes(ptr, 2);
 }
 
 void virtio_queue_set_notification(VirtQueue *vq, int enable)
@@ -274,17 +317,17 @@ static unsigned int virtqueue_get_head(VirtQueue *vq, unsigned int idx)
     return head;
 }
 
-static unsigned virtqueue_next_desc(target_phys_addr_t desc_pa,
+static unsigned virtqueue_next_desc(void *desc_va,
                                     unsigned int i, unsigned int max)
 {
     unsigned int next;
 
     /* If this descriptor says it doesn't chain, we're done. */
-    if (!(vring_desc_flags(desc_pa, i) & VRING_DESC_F_NEXT))
+    if (!(vring_desc_flags(desc_va, i) & VRING_DESC_F_NEXT))
         return max;
 
     /* Check they're not leading us off end of descriptors. */
-    next = vring_desc_next(desc_pa, i);
+    next = vring_desc_next(desc_va, i);
     /* Make sure compiler knows to grab that: we don't want it changing!
      */
     wmb();
@@ -306,16 +349,18 @@ int virtqueue_avail_bytes(VirtQueue *vq, int in_bytes, int out_bytes)
     total_bufs = in_total = out_total = 0;
     while (virtqueue_num_heads(vq, idx)) {
         unsigned int max, num_bufs, indirect = 0;
-        target_phys_addr_t desc_pa;
+        target_phys_addr_t len = 0;
+        void *desc_va;
         int i;
 
         max = vq->vring.num;
         num_bufs = total_bufs;
         i = virtqueue_get_head(vq, idx++);
-        desc_pa = vq->vring.desc;
+        desc_va = vq->vring.desc_va;
 
-        if (vring_desc_flags(desc_pa, i) & VRING_DESC_F_INDIRECT) {
-            if (vring_desc_len(desc_pa, i) % sizeof(VRingDesc)) {
+        if (vring_desc_flags(desc_va, i) & VRING_DESC_F_INDIRECT) {
+            len = vring_desc_len(desc_va, i);
+            if (len % sizeof(VRingDesc)) {
                 fprintf(stderr, "Invalid size for indirect buffer table\n");
                 exit(1);
             }
@@ -328,9 +373,13 @@ int virtqueue_avail_bytes(VirtQueue *vq, int in_bytes, int out_bytes)
 
             /* loop over the indirect descriptor table */
             indirect = 1;
-            max = vring_desc_len(desc_pa, i) / sizeof(VRingDesc);
+            max = len / sizeof(VRingDesc);
             num_bufs = i = 0;
-            desc_pa = vring_desc_addr(desc_pa, i);
+            desc_va = virtqueue_map(vring_desc_addr(desc_va, i), len, 0);
+            if (!desc_va) {
+                fprintf(stderr, "Failed to map indirect virtqueue\n");
+                exit(1);
+            }
         }
 
         do {
@@ -340,21 +389,31 @@ int virtqueue_avail_bytes(VirtQueue *vq, int in_bytes, int out_bytes)
                 exit(1);
             }
 
-            if (vring_desc_flags(desc_pa, i) & VRING_DESC_F_WRITE) {
+            if (vring_desc_flags(desc_va, i) & VRING_DESC_F_WRITE) {
                 if (in_bytes > 0 &&
-                    (in_total += vring_desc_len(desc_pa, i)) >= in_bytes)
+                    (in_total += vring_desc_len(desc_va, i)) >= in_bytes) {
+                    if (indirect) {
+                        cpu_physical_memory_unmap(desc_va, len, 0, 0);
+                    }
                     return 1;
+                }
             } else {
                 if (out_bytes > 0 &&
-                    (out_total += vring_desc_len(desc_pa, i)) >= out_bytes)
+                    (out_total += vring_desc_len(desc_va, i)) >= out_bytes) {
+                    if (indirect) {
+                        cpu_physical_memory_unmap(desc_va, len, 0, 0);
+                    }
                     return 1;
+                }
             }
-        } while ((i = virtqueue_next_desc(desc_pa, i, max)) != max);
+        } while ((i = virtqueue_next_desc(desc_va, i, max)) != max);
 
-        if (!indirect)
+        if (!indirect) {
             total_bufs = num_bufs;
-        else
+        } else {
+            cpu_physical_memory_unmap(desc_va, len, 0, 0);
             total_bufs++;
+        }
     }
 
     return 0;
@@ -379,7 +438,9 @@ void virtqueue_map_sg(struct iovec *sg, target_phys_addr_t *addr,
 int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem)
 {
     unsigned int i, head, max;
-    target_phys_addr_t desc_pa = vq->vring.desc;
+    void *desc_va = vq->vring.desc_va;
+    target_phys_addr_t len = 0;
+    int indirect = 0;
 
     if (!virtqueue_num_heads(vq, vq->last_avail_idx))
         return 0;
@@ -391,15 +452,21 @@ int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem)
 
     i = head = virtqueue_get_head(vq, vq->last_avail_idx++);
 
-    if (vring_desc_flags(desc_pa, i) & VRING_DESC_F_INDIRECT) {
-        if (vring_desc_len(desc_pa, i) % sizeof(VRingDesc)) {
+    if (vring_desc_flags(desc_va, i) & VRING_DESC_F_INDIRECT) {
+        len = vring_desc_len(desc_va, i);
+        if (len % sizeof(VRingDesc)) {
             fprintf(stderr, "Invalid size for indirect buffer table\n");
             exit(1);
         }
 
         /* loop over the indirect descriptor table */
-        max = vring_desc_len(desc_pa, i) / sizeof(VRingDesc);
-        desc_pa = vring_desc_addr(desc_pa, i);
+        max = len / sizeof(VRingDesc);
+        desc_va = virtqueue_map(vring_desc_addr(desc_va, i), len, 0);
+        if (!desc_va) {
+            fprintf(stderr, "Failed to map indirect virtqueue\n");
+            exit(1);
+        }
+        indirect = 1;
         i = 0;
     }
 
@@ -407,22 +474,26 @@ int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem)
     do {
         struct iovec *sg;
 
-        if (vring_desc_flags(desc_pa, i) & VRING_DESC_F_WRITE) {
-            elem->in_addr[elem->in_num] = vring_desc_addr(desc_pa, i);
+        if (vring_desc_flags(desc_va, i) & VRING_DESC_F_WRITE) {
+            elem->in_addr[elem->in_num] = vring_desc_addr(desc_va, i);
             sg = &elem->in_sg[elem->in_num++];
         } else {
-            elem->out_addr[elem->out_num] = vring_desc_addr(desc_pa, i);
+            elem->out_addr[elem->out_num] = vring_desc_addr(desc_va, i);
             sg = &elem->out_sg[elem->out_num++];
         }
 
-        sg->iov_len = vring_desc_len(desc_pa, i);
+        sg->iov_len = vring_desc_len(desc_va, i);
 
         /* If we've got too many, that implies a descriptor loop. */
         if ((elem->in_num + elem->out_num) > max) {
             fprintf(stderr, "Looped descriptor");
             exit(1);
         }
-    } while ((i = virtqueue_next_desc(desc_pa, i, max)) != max);
+    } while ((i = virtqueue_next_desc(desc_va, i, max)) != max);
+
+    if (indirect) {
+        cpu_physical_memory_unmap(desc_va, len, 0, 0);
+    }
 
     /* Now map what we have collected */
     virtqueue_map_sg(elem->in_sg, elem->in_addr, elem->in_num, 1);
@@ -467,6 +538,27 @@ void virtio_reset(void *opaque)
         vdev->vq[i].vring.desc = 0;
         vdev->vq[i].vring.avail = 0;
         vdev->vq[i].vring.used = 0;
+        if (vdev->vq[i].vring.desc_va) {
+            cpu_physical_memory_unmap(vdev->vq[i].vring.desc_va,
+                                      vdev->vq[i].vring.num *
+                                      sizeof(VRingDesc), 0, 0);
+            vdev->vq[i].vring.desc_va = NULL;
+        }
+        if (vdev->vq[i].vring.avail_va) {
+            cpu_physical_memory_unmap(vdev->vq[i].vring.avail_va,
+                                      offsetof(VRingAvail,
+                                               ring[vdev->vq[i].vring.num]),
+                                      0, 0);
+            vdev->vq[i].vring.avail_va = NULL;
+        }
+        if (vdev->vq[i].vring.used_va) {
+            cpu_physical_memory_unmap(vdev->vq[i].vring.used_va,
+                                      offsetof(VRingUsed,
+                                               ring[vdev->vq[i].vring.num]), 1,
+                                      offsetof(VRingUsed,
+                                               ring[vdev->vq[i].vring.num]));
+            vdev->vq[i].vring.used_va = NULL;
+        }
         vdev->vq[i].last_avail_idx = 0;
         vdev->vq[i].pa = 0;
         vdev->vq[i].vector = VIRTIO_NO_VECTOR;
--