On Mon, Oct 15, 2012 at 02:28:15PM -0600, Alex Williamson wrote: > This makes use of the new level irqfd support enabling bypass of > qemu userspace both on INTx injection and unmask. This significantly > boosts the performance of devices making use of legacy interrupts. > > Signed-off-by: Alex Williamson <alex.williamson@xxxxxxxxxx> > --- > > My INTx routing workaround below will probably raise some eyebrows, > but I don't feel it's worth subjecting users to core dumps if they > want to try vfio-pci on new platforms. INTx routing is part of some > larger plan, but until that plan materializes we have to try to avoid > the API unless we think there's a good chance it might be there. > I'll accept the maintenance of updating a whitelist in the interim. > Thanks, > > Alex > > hw/vfio_pci.c | 224 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 224 insertions(+) > > diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c > index 639371e..777a5f8 100644 > --- a/hw/vfio_pci.c > +++ b/hw/vfio_pci.c > @@ -154,6 +154,53 @@ static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len); > static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled); > > /* > + * PCI code refuses to make it possible to probe whether the chipset > + * supports pci_device_route_intx_to_irq() and booby traps the call > + * to assert if doesn't. For us, this is just an optimization, so > + * only enable it when we know it's present. Unfortunately PCIBus is > + * private, so we can't just look at the function pointer. > + */ > +static bool vfio_pci_bus_has_intx_route(PCIDevice *pdev) > +{ > +#ifdef CONFIG_KVM > + BusState *bus = qdev_get_parent_bus(&pdev->qdev); > + DeviceState *dev; > + > + if (!kvm_irqchip_in_kernel() || > + !kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) { > + return false; > + } Shouldn't we update linux-headers/ to get KVM_CAP_IRQFD_RESAMPLE? Also for KVM_IRQFD_FLAG_RESAMPLE. > + > + for (; bus->parent; bus = qdev_get_parent_bus(dev)) { > + > + dev = bus->parent; > + > + if (!strncmp("i440FX-pcihost", object_get_typename(OBJECT(dev)), 14)) { > + return true; > + } > + } > + > + error_report("vfio-pci: VM chipset does not support INTx routing, " > + "using slow INTx mode\n"); When does this code trigger? It seems irqchip implies piix ATM - is this just dead code? > +#endif > + return false; > +} > + > +static PCIINTxRoute vfio_pci_device_route_intx_to_irq(PCIDevice *pdev, int pin) > +{ > + if (!vfio_pci_bus_has_intx_route(pdev)) { > + return (PCIINTxRoute) { .mode = PCI_INTX_DISABLED, .irq = -1 }; > + } > + > + return pci_device_route_intx_to_irq(pdev, pin); > +} > + > +static bool vfio_pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new) > +{ > + return old->mode != new->mode || old->irq != new->irq; > +} > + Didn't you add an API for this? It's on pci branch but I can drop it if not needed. > +/* > * Common VFIO interrupt disable > */ > static void vfio_disable_irqindex(VFIODevice *vdev, int index) > @@ -185,6 +232,21 @@ static void vfio_unmask_intx(VFIODevice *vdev) > ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); > } > > +#ifdef CONFIG_KVM > +static void vfio_mask_intx(VFIODevice *vdev) > +{ > + struct vfio_irq_set irq_set = { > + .argsz = sizeof(irq_set), > + .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK, > + .index = VFIO_PCI_INTX_IRQ_INDEX, > + .start = 0, > + .count = 1, > + }; > + > + ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); > +} > +#endif > + > /* > * Disabling BAR mmaping can be slow, but toggling it around INTx can > * also be a huge overhead. We try to get the best of both worlds by > @@ -248,6 +310,161 @@ static void vfio_eoi(VFIODevice *vdev) > vfio_unmask_intx(vdev); > } > > +static void vfio_enable_intx_kvm(VFIODevice *vdev) > +{ > +#ifdef CONFIG_KVM > + struct kvm_irqfd irqfd = { > + .fd = event_notifier_get_fd(&vdev->intx.interrupt), > + .gsi = vdev->intx.route.irq, > + .flags = KVM_IRQFD_FLAG_RESAMPLE, Should not kvm ioctl handling be localized in kvm-all.c? E.g. extend kvm_irqchip_add_irqfd_notifier in some way? Same question for KVM_CAP_IRQFD_RESAMPLE use above ... > + }; > + struct vfio_irq_set *irq_set; > + int ret, argsz; > + int32_t *pfd; > + > + if (!kvm_irqchip_in_kernel() || > + vdev->intx.route.mode != PCI_INTX_ENABLED || > + !kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) { > + return; > + } > + > + /* Get to a known interrupt state */ > + qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev); > + vfio_mask_intx(vdev); > + vdev->intx.pending = false; > + qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0); > + > + /* Get an eventfd for resample/unmask */ > + if (event_notifier_init(&vdev->intx.unmask, 0)) { > + error_report("vfio: Error: event_notifier_init failed eoi\n"); > + goto fail; > + } > + > + /* KVM triggers it, VFIO listens for it */ > + irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask); > + > + if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) { > + error_report("vfio: Error: Failed to setup resample irqfd: %m\n"); > + goto fail_irqfd; > + } > + > + argsz = sizeof(*irq_set) + sizeof(*pfd); > + > + irq_set = g_malloc0(argsz); > + irq_set->argsz = argsz; > + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK; > + irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; > + irq_set->start = 0; > + irq_set->count = 1; > + pfd = (int32_t *)&irq_set->data; > + > + *pfd = irqfd.resamplefd; > + > + ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set); > + g_free(irq_set); > + if (ret) { > + error_report("vfio: Error: Failed to setup INTx unmask fd: %m\n"); > + goto fail_vfio; > + } > + > + /* Let'em rip */ > + vfio_unmask_intx(vdev); > + > + vdev->intx.kvm_accel = true; > + > + DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel enabled\n", > + __func__, vdev->host.domain, vdev->host.bus, > + vdev->host.slot, vdev->host.function); > + > + return; > + > +fail_vfio: > + irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN; > + kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd); > +fail_irqfd: > + event_notifier_cleanup(&vdev->intx.unmask); > +fail: > + qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev); > + vfio_unmask_intx(vdev); > +#endif > +} > + > +static void vfio_disable_intx_kvm(VFIODevice *vdev) > +{ > +#ifdef CONFIG_KVM > + struct kvm_irqfd irqfd = { > + .fd = event_notifier_get_fd(&vdev->intx.interrupt), > + .gsi = vdev->intx.route.irq, > + .flags = KVM_IRQFD_FLAG_DEASSIGN, > + }; > + > + if (!vdev->intx.kvm_accel) { > + return; > + } > + > + /* > + * Get to a known state, hardware masked, QEMU ready to accept new > + * interrupts, QEMU IRQ de-asserted. > + */ > + vfio_mask_intx(vdev); > + vdev->intx.pending = false; > + qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0); > + > + /* Tell KVM to stop listening for an INTx irqfd */ > + if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) { > + error_report("vfio: Error: Failed to disable INTx irqfd: %m\n"); > + } > + > + /* We only need to close the eventfd for VFIO to cleanup the kernel side */ > + event_notifier_cleanup(&vdev->intx.unmask); > + > + /* QEMU starts listening for interrupt events. */ > + qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev); > + > + vdev->intx.kvm_accel = false; > + > + /* If we've missed an event, let it re-fire through QEMU */ > + vfio_unmask_intx(vdev); > + > + DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel disabled\n", > + __func__, vdev->host.domain, vdev->host.bus, > + vdev->host.slot, vdev->host.function); > +#endif > +} > + > +static void vfio_update_irq(PCIDevice *pdev) > +{ > + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); > + PCIINTxRoute route; > + > + if (vdev->interrupt != VFIO_INT_INTx) { > + return; > + } > + > + route = vfio_pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin); > + > + if (!vfio_pci_intx_route_changed(&vdev->intx.route, &route)) { > + return; /* Nothing changed */ > + } > + > + DPRINTF("%s(%04x:%02x:%02x.%x) IRQ moved %d -> %d\n", __func__, > + vdev->host.domain, vdev->host.bus, vdev->host.slot, > + vdev->host.function, vdev->intx.route.irq, route.irq); > + > + vfio_disable_intx_kvm(vdev); > + > + vdev->intx.route = route; > + > + if (route.mode != PCI_INTX_ENABLED) { > + return; > + } > + > + vfio_enable_intx_kvm(vdev); > + > + /* Re-enable the interrupt in cased we missed an EOI */ > + vfio_eoi(vdev); > +} > + > static int vfio_enable_intx(VFIODevice *vdev) > { > uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1); > @@ -262,6 +479,9 @@ static int vfio_enable_intx(VFIODevice *vdev) > vfio_disable_interrupts(vdev); > > vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */ > + vdev->intx.route = vfio_pci_device_route_intx_to_irq(&vdev->pdev, > + vdev->intx.pin); > + > ret = event_notifier_init(&vdev->intx.interrupt, 0); > if (ret) { > error_report("vfio: Error: event_notifier_init failed\n"); > @@ -290,6 +510,8 @@ static int vfio_enable_intx(VFIODevice *vdev) > return -errno; > } > > + vfio_enable_intx_kvm(vdev); > + > vdev->interrupt = VFIO_INT_INTx; > > DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain, > @@ -303,6 +525,7 @@ static void vfio_disable_intx(VFIODevice *vdev) > int fd; > > qemu_del_timer(vdev->intx.mmap_timer); > + vfio_disable_intx_kvm(vdev); > vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX); > vdev->intx.pending = false; > qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0); > @@ -1870,6 +2093,7 @@ static int vfio_initfn(PCIDevice *pdev) > if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) { > vdev->intx.mmap_timer = qemu_new_timer_ms(vm_clock, > vfio_intx_mmap_enable, vdev); > + pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq); > ret = vfio_enable_intx(vdev); > if (ret) { > goto out_teardown; -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html