Currently the method of dealing with an IO operation on a bus (PIO/MMIO) is to call the read or write callback for each device registered on the bus until we find a device which handles it. Since the number of devices on a bus can be significant due to ioeventfds and coalesced MMIO zones, this leads to a lot of overhead on each IO operation. Instead of registering devices, we now register ranges which points to a device. Lookup is done using an efficient bsearch instead of a linear search. Performance test was conducted by comparing exit count per second with 200 ioeventfds created on one byte and the guest is trying to access a different byte continuously (triggering usermode exits). Before the patch the guest has achieved 259k exits per second, after the patch the guest does 274k exits per second. Changes in V4: - Formatting fix. - Nicer device IO interface in PIC driver. - Don't unregister devices which may have not registered. Changes in V3: - Small fix in error path of i8259. Changes in V2: - Only register a device once. - Support for multiple devices using same range. - Added performance benchmark. - Using bsearch() and sort() found in /lib/. Cc: Avi Kivity <avi@xxxxxxxxxx> Cc: Marcelo Tosatti <mtosatti@xxxxxxxxxx> Signed-off-by: Sasha Levin <levinsasha928@xxxxxxxxx> --- arch/x86/kvm/i8254.c | 6 ++- arch/x86/kvm/i8259.c | 108 +++++++++++++++++++++++++++++++++++-------- arch/x86/kvm/irq.h | 4 +- arch/x86/kvm/x86.c | 6 ++- include/linux/kvm_host.h | 18 ++++---- virt/kvm/coalesced_mmio.c | 3 +- virt/kvm/eventfd.c | 3 +- virt/kvm/ioapic.c | 3 +- virt/kvm/kvm_main.c | 112 ++++++++++++++++++++++++++++++++++++++++----- 9 files changed, 216 insertions(+), 47 deletions(-) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index efad723..76e3f1c 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -713,14 +713,16 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); kvm_iodevice_init(&pit->dev, &pit_dev_ops); - ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS, + KVM_PIT_MEM_LENGTH, &pit->dev); if (ret < 0) goto fail; if (flags & KVM_PIT_SPEAKER_DUMMY) { kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, - &pit->speaker_dev); + KVM_SPEAKER_BASE_ADDRESS, 4, + &pit->speaker_dev); if (ret < 0) goto fail_unregister; } diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 19fe855..6b869ce 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -459,15 +459,9 @@ static int picdev_in_range(gpa_t addr) } } -static inline struct kvm_pic *to_pic(struct kvm_io_device *dev) -{ - return container_of(dev, struct kvm_pic, dev); -} - -static int picdev_write(struct kvm_io_device *this, +static int picdev_write(struct kvm_pic *s, gpa_t addr, int len, const void *val) { - struct kvm_pic *s = to_pic(this); unsigned char data = *(unsigned char *)val; if (!picdev_in_range(addr)) return -EOPNOTSUPP; @@ -494,10 +488,9 @@ static int picdev_write(struct kvm_io_device *this, return 0; } -static int picdev_read(struct kvm_io_device *this, +static int picdev_read(struct kvm_pic *s, gpa_t addr, int len, void *val) { - struct kvm_pic *s = to_pic(this); unsigned char data = 0; if (!picdev_in_range(addr)) return -EOPNOTSUPP; @@ -525,6 +518,48 @@ static int picdev_read(struct kvm_io_device *this, return 0; } +static int picdev_master_write(struct kvm_io_device *dev, + gpa_t addr, int len, const void *val) +{ + return picdev_write(container_of(dev, struct kvm_pic, dev_master), + addr, len, val); +} + +static int picdev_master_read(struct kvm_io_device *dev, + gpa_t addr, int len, void *val) +{ + return picdev_read(container_of(dev, struct kvm_pic, dev_master), + addr, len, val); +} + +static int picdev_slave_write(struct kvm_io_device *dev, + gpa_t addr, int len, const void *val) +{ + return picdev_write(container_of(dev, struct kvm_pic, dev_slave), + addr, len, val); +} + +static int picdev_slave_read(struct kvm_io_device *dev, + gpa_t addr, int len, void *val) +{ + return picdev_read(container_of(dev, struct kvm_pic, dev_slave), + addr, len, val); +} + +static int picdev_eclr_write(struct kvm_io_device *dev, + gpa_t addr, int len, const void *val) +{ + return picdev_write(container_of(dev, struct kvm_pic, dev_eclr), + addr, len, val); +} + +static int picdev_eclr_read(struct kvm_io_device *dev, + gpa_t addr, int len, void *val) +{ + return picdev_read(container_of(dev, struct kvm_pic, dev_eclr), + addr, len, val); +} + /* * callback when PIC0 irq status changed */ @@ -537,9 +572,19 @@ static void pic_irq_request(struct kvm *kvm, int level) s->output = level; } -static const struct kvm_io_device_ops picdev_ops = { - .read = picdev_read, - .write = picdev_write, +static const struct kvm_io_device_ops picdev_master_ops = { + .read = picdev_master_read, + .write = picdev_master_write, +}; + +static const struct kvm_io_device_ops picdev_slave_ops = { + .read = picdev_slave_read, + .write = picdev_slave_write, +}; + +static const struct kvm_io_device_ops picdev_eclr_ops = { + .read = picdev_eclr_read, + .write = picdev_eclr_write, }; struct kvm_pic *kvm_create_pic(struct kvm *kvm) @@ -560,16 +605,39 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) /* * Initialize PIO device */ - kvm_iodevice_init(&s->dev, &picdev_ops); + kvm_iodevice_init(&s->dev_master, &picdev_master_ops); + kvm_iodevice_init(&s->dev_slave, &picdev_slave_ops); + kvm_iodevice_init(&s->dev_eclr, &picdev_eclr_ops); mutex_lock(&kvm->slots_lock); - ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x20, 2, + &s->dev_master); + if (ret < 0) + goto fail_unlock; + + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0xa0, 2, &s->dev_slave); + if (ret < 0) + goto fail_unreg_2; + + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_eclr); + if (ret < 0) + goto fail_unreg_1; + mutex_unlock(&kvm->slots_lock); - if (ret < 0) { - kfree(s); - return NULL; - } return s; + +fail_unreg_1: + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave); + +fail_unreg_2: + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_master); + +fail_unlock: + mutex_unlock(&kvm->slots_lock); + + kfree(s); + + return NULL; } void kvm_destroy_pic(struct kvm *kvm) @@ -577,7 +645,9 @@ void kvm_destroy_pic(struct kvm *kvm) struct kvm_pic *vpic = kvm->arch.vpic; if (vpic) { - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_master); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_slave); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_eclr); kvm->arch.vpic = NULL; kfree(vpic); } diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 53e2d08..2086f2b 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -66,7 +66,9 @@ struct kvm_pic { struct kvm *kvm; struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ int output; /* intr from master PIC */ - struct kvm_io_device dev; + struct kvm_io_device dev_master; + struct kvm_io_device dev_slave; + struct kvm_io_device dev_eclr; void (*ack_notifier)(void *opaque, int irq); unsigned long irq_states[16]; }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e80f0d7..4cea9cf 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3561,7 +3561,11 @@ long kvm_arch_vm_ioctl(struct file *filp, if (r) { mutex_lock(&kvm->slots_lock); kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, - &vpic->dev); + &vpic->dev_master); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, + &vpic->dev_slave); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, + &vpic->dev_eclr); mutex_unlock(&kvm->slots_lock); kfree(vpic); goto create_irqchip_unlock; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ff4d406..d0e42f3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -55,16 +55,16 @@ struct kvm; struct kvm_vcpu; extern struct kmem_cache *kvm_vcpu_cache; -/* - * It would be nice to use something smarter than a linear search, TBD... - * Thankfully we dont expect many devices to register (famous last words :), - * so until then it will suffice. At least its abstracted so we can change - * in one place. - */ +struct kvm_io_range { + gpa_t addr; + int len; + struct kvm_io_device *dev; +}; + struct kvm_io_bus { int dev_count; #define NR_IOBUS_DEVS 300 - struct kvm_io_device *devs[NR_IOBUS_DEVS]; + struct kvm_io_range range[NR_IOBUS_DEVS]; }; enum kvm_bus { @@ -77,8 +77,8 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val); int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, void *val); -int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, - struct kvm_io_device *dev); +int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, + int len, struct kvm_io_device *dev); int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, struct kvm_io_device *dev); diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index 2316ec1..a6ec206 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c @@ -141,7 +141,8 @@ int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, dev->zone = *zone; mutex_lock(&kvm->slots_lock); - ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &dev->dev); + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, zone->addr, + zone->size, &dev->dev); if (ret < 0) goto out_free_dev; list_add_tail(&dev->list, &kvm->coalesced_zones); diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 73358d2..f59c1e8 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -586,7 +586,8 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) kvm_iodevice_init(&p->dev, &ioeventfd_ops); - ret = kvm_io_bus_register_dev(kvm, bus_idx, &p->dev); + ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length, + &p->dev); if (ret < 0) goto unlock_fail; diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 8df1ca1..3eed61e 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -394,7 +394,8 @@ int kvm_ioapic_init(struct kvm *kvm) kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops); ioapic->kvm = kvm; mutex_lock(&kvm->slots_lock); - ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, ioapic->base_address, + IOAPIC_MEM_LENGTH, &ioapic->dev); mutex_unlock(&kvm->slots_lock); if (ret < 0) { kvm->arch.vioapic = NULL; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index aefdda3..d9cfb78 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -47,6 +47,8 @@ #include <linux/srcu.h> #include <linux/hugetlb.h> #include <linux/slab.h> +#include <linux/sort.h> +#include <linux/bsearch.h> #include <asm/processor.h> #include <asm/io.h> @@ -2391,24 +2393,92 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus) int i; for (i = 0; i < bus->dev_count; i++) { - struct kvm_io_device *pos = bus->devs[i]; + struct kvm_io_device *pos = bus->range[i].dev; kvm_iodevice_destructor(pos); } kfree(bus); } +int kvm_io_bus_sort_cmp(const void *p1, const void *p2) +{ + const struct kvm_io_range *r1 = p1; + const struct kvm_io_range *r2 = p2; + + if (r1->addr < r2->addr) + return -1; + if (r1->addr + r1->len > r2->addr + r2->len) + return 1; + return 0; +} + +int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev, + gpa_t addr, int len) +{ + if (bus->dev_count == NR_IOBUS_DEVS) + return -ENOSPC; + + bus->range[bus->dev_count++] = (struct kvm_io_range) { + .addr = addr, + .len = len, + .dev = dev, + }; + + sort(bus->range, bus->dev_count, sizeof(struct kvm_io_range), + kvm_io_bus_sort_cmp, NULL); + + return 0; +} + +int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus, + gpa_t addr, int len) +{ + struct kvm_io_range *range, key; + int off; + + key = (struct kvm_io_range) { + .addr = addr, + .len = len, + }; + + range = bsearch(&key, bus->range, bus->dev_count, + sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp); + if (range == NULL) + return -ENOENT; + + off = range - bus->range; + + while (off > 0 && kvm_io_bus_sort_cmp(&key, &bus->range[off-1]) == 0) + off--; + + return off; +} + /* kvm_io_bus_write - called under kvm->slots_lock */ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val) { - int i; + int idx; struct kvm_io_bus *bus; + struct kvm_io_range range; + + range = (struct kvm_io_range) { + .addr = addr, + .len = len, + }; bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); - for (i = 0; i < bus->dev_count; i++) - if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) + idx = kvm_io_bus_get_first_dev(bus, addr, len); + if (idx < 0) + return -EOPNOTSUPP; + + while (idx < bus->dev_count && + kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) { + if (!kvm_iodevice_write(bus->range[idx].dev, addr, len, val)) return 0; + idx++; + } + return -EOPNOTSUPP; } @@ -2416,19 +2486,33 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, void *val) { - int i; + int idx; struct kvm_io_bus *bus; + struct kvm_io_range range; + + range = (struct kvm_io_range) { + .addr = addr, + .len = len, + }; bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); - for (i = 0; i < bus->dev_count; i++) - if (!kvm_iodevice_read(bus->devs[i], addr, len, val)) + idx = kvm_io_bus_get_first_dev(bus, addr, len); + if (idx < 0) + return -EOPNOTSUPP; + + while (idx < bus->dev_count && + kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) { + if (!kvm_iodevice_read(bus->range[idx].dev, addr, len, val)) return 0; + idx++; + } + return -EOPNOTSUPP; } /* Caller must hold slots_lock. */ -int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, - struct kvm_io_device *dev) +int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, + int len, struct kvm_io_device *dev) { struct kvm_io_bus *new_bus, *bus; @@ -2440,7 +2524,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, if (!new_bus) return -ENOMEM; memcpy(new_bus, bus, sizeof(struct kvm_io_bus)); - new_bus->devs[new_bus->dev_count++] = dev; + kvm_io_bus_insert_dev(new_bus, dev, addr, len); rcu_assign_pointer(kvm->buses[bus_idx], new_bus); synchronize_srcu_expedited(&kvm->srcu); kfree(bus); @@ -2464,9 +2548,13 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, r = -ENOENT; for (i = 0; i < new_bus->dev_count; i++) - if (new_bus->devs[i] == dev) { + if (new_bus->range[i].dev == dev) { r = 0; - new_bus->devs[i] = new_bus->devs[--new_bus->dev_count]; + new_bus->dev_count--; + new_bus->range[i] = new_bus->range[new_bus->dev_count]; + sort(new_bus->range, new_bus->dev_count, + sizeof(struct kvm_io_range), + kvm_io_bus_sort_cmp, NULL); break; } -- 1.7.6 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html