Re: [PATCH] vfio-pci: Add KVM INTx acceleration

"Michael S. Tsirkin" <mst@xxxxxxxxxx> · Tue, 16 Oct 2012 08:39:05 +0200

On Mon, Oct 15, 2012 at 02:28:15PM -0600, Alex Williamson wrote:
> This makes use of the new level irqfd support enabling bypass of
> qemu userspace both on INTx injection and unmask.  This significantly
> boosts the performance of devices making use of legacy interrupts.
> 
> Signed-off-by: Alex Williamson <alex.williamson@xxxxxxxxxx>
> ---
> 
> My INTx routing workaround below will probably raise some eyebrows,
> but I don't feel it's worth subjecting users to core dumps if they
> want to try vfio-pci on new platforms.  INTx routing is part of some
> larger plan, but until that plan materializes we have to try to avoid
> the API unless we think there's a good chance it might be there.
> I'll accept the maintenance of updating a whitelist in the interim.
> Thanks,
> 
> Alex
> 
>  hw/vfio_pci.c |  224 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 224 insertions(+)
> 
> diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
> index 639371e..777a5f8 100644
> --- a/hw/vfio_pci.c
> +++ b/hw/vfio_pci.c
> @@ -154,6 +154,53 @@ static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
>  static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled);
>  
>  /*
> + * PCI code refuses to make it possible to probe whether the chipset
> + * supports pci_device_route_intx_to_irq() and booby traps the call
> + * to assert if doesn't.  For us, this is just an optimization, so
> + * only enable it when we know it's present.  Unfortunately PCIBus is
> + * private, so we can't just look at the function pointer.
> + */
> +static bool vfio_pci_bus_has_intx_route(PCIDevice *pdev)
> +{
> +#ifdef CONFIG_KVM
> +    BusState *bus = qdev_get_parent_bus(&pdev->qdev);
> +    DeviceState *dev;
> +
> +    if (!kvm_irqchip_in_kernel() ||
> +        !kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
> +	return false;
> +    }

Shouldn't we update linux-headers/ to get KVM_CAP_IRQFD_RESAMPLE?
Also for KVM_IRQFD_FLAG_RESAMPLE.

> +
> +    for (; bus->parent; bus = qdev_get_parent_bus(dev)) {
> +
> +        dev = bus->parent;
> +
> +        if (!strncmp("i440FX-pcihost", object_get_typename(OBJECT(dev)), 14)) {
> +            return true;
> +        }
> +    }
> +
> +    error_report("vfio-pci: VM chipset does not support INTx routing, "
> +                 "using slow INTx mode\n");

When does this code trigger? It seems an in-kernel irqchip implies piix at
the moment - is this just dead code?

> +#endif
> +    return false;
> +}
> +
> +static PCIINTxRoute vfio_pci_device_route_intx_to_irq(PCIDevice *pdev, int pin)
> +{
> +    if (!vfio_pci_bus_has_intx_route(pdev)) {
> +        return (PCIINTxRoute) { .mode = PCI_INTX_DISABLED, .irq = -1 };
> +    }
> +
> +    return pci_device_route_intx_to_irq(pdev, pin);
> +}
> +
> +static bool vfio_pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new)
> +{
> +    return old->mode != new->mode || old->irq != new->irq;
> +}
> +

Didn't you add an API for this? It's on the pci branch, but I can drop
it if it's not needed.

> +/*
>   * Common VFIO interrupt disable
>   */
>  static void vfio_disable_irqindex(VFIODevice *vdev, int index)
> @@ -185,6 +232,21 @@ static void vfio_unmask_intx(VFIODevice *vdev)
>      ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
>  }
>  
> +#ifdef CONFIG_KVM
> +static void vfio_mask_intx(VFIODevice *vdev)
> +{
> +    struct vfio_irq_set irq_set = {
> +        .argsz = sizeof(irq_set),
> +        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
> +        .index = VFIO_PCI_INTX_IRQ_INDEX,
> +        .start = 0,
> +        .count = 1,
> +    };
> +
> +    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
> +}
> +#endif
> +
>  /*
>   * Disabling BAR mmaping can be slow, but toggling it around INTx can
>   * also be a huge overhead.  We try to get the best of both worlds by
> @@ -248,6 +310,161 @@ static void vfio_eoi(VFIODevice *vdev)
>      vfio_unmask_intx(vdev);
>  }
>  
> +static void vfio_enable_intx_kvm(VFIODevice *vdev)
> +{
> +#ifdef CONFIG_KVM
> +    struct kvm_irqfd irqfd = {
> +        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
> +        .gsi = vdev->intx.route.irq,
> +        .flags = KVM_IRQFD_FLAG_RESAMPLE,

Shouldn't KVM ioctl handling be localized in kvm-all.c?
E.g. by extending kvm_irqchip_add_irqfd_notifier in
some way? The same question applies to the KVM_CAP_IRQFD_RESAMPLE use above.

> +    };
> +    struct vfio_irq_set *irq_set;
> +    int ret, argsz;
> +    int32_t *pfd;
> +
> +    if (!kvm_irqchip_in_kernel() ||
> +        vdev->intx.route.mode != PCI_INTX_ENABLED ||
> +        !kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
> +        return;
> +    }
> +
> +    /* Get to a known interrupt state */
> +    qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev);
> +    vfio_mask_intx(vdev);
> +    vdev->intx.pending = false;
> +    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
> +
> +    /* Get an eventfd for resample/unmask */
> +    if (event_notifier_init(&vdev->intx.unmask, 0)) {
> +        error_report("vfio: Error: event_notifier_init failed eoi\n");
> +        goto fail;
> +    }
> +
> +    /* KVM triggers it, VFIO listens for it */
> +    irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask);
> +
> +    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
> +        error_report("vfio: Error: Failed to setup resample irqfd: %m\n");
> +        goto fail_irqfd;
> +    }
> +
> +    argsz = sizeof(*irq_set) + sizeof(*pfd);
> +
> +    irq_set = g_malloc0(argsz);
> +    irq_set->argsz = argsz;
> +    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
> +    irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
> +    irq_set->start = 0;
> +    irq_set->count = 1;
> +    pfd = (int32_t *)&irq_set->data;
> +
> +    *pfd = irqfd.resamplefd;
> +
> +    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
> +    g_free(irq_set);
> +    if (ret) {
> +        error_report("vfio: Error: Failed to setup INTx unmask fd: %m\n");
> +        goto fail_vfio;
> +    }
> +
> +    /* Let'em rip */
> +    vfio_unmask_intx(vdev);
> +
> +    vdev->intx.kvm_accel = true;
> +
> +    DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel enabled\n",
> +            __func__, vdev->host.domain, vdev->host.bus,
> +            vdev->host.slot, vdev->host.function);
> +
> +    return;
> +
> +fail_vfio:
> +    irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
> +    kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);
> +fail_irqfd:
> +    event_notifier_cleanup(&vdev->intx.unmask);
> +fail:
> +    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
> +    vfio_unmask_intx(vdev);
> +#endif
> +}
> +
> +static void vfio_disable_intx_kvm(VFIODevice *vdev)
> +{
> +#ifdef CONFIG_KVM
> +    struct kvm_irqfd irqfd = {
> +        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
> +        .gsi = vdev->intx.route.irq,
> +        .flags = KVM_IRQFD_FLAG_DEASSIGN,
> +    };
> +
> +    if (!vdev->intx.kvm_accel) {
> +        return;
> +    }
> +
> +    /*
> +     * Get to a known state, hardware masked, QEMU ready to accept new
> +     * interrupts, QEMU IRQ de-asserted.
> +     */
> +    vfio_mask_intx(vdev);
> +    vdev->intx.pending = false;
> +    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
> +
> +    /* Tell KVM to stop listening for an INTx irqfd */
> +    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
> +        error_report("vfio: Error: Failed to disable INTx irqfd: %m\n");
> +    }
> +
> +    /* We only need to close the eventfd for VFIO to cleanup the kernel side */
> +    event_notifier_cleanup(&vdev->intx.unmask);
> +
> +    /* QEMU starts listening for interrupt events. */
> +    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
> +
> +    vdev->intx.kvm_accel = false;
> +
> +    /* If we've missed an event, let it re-fire through QEMU */
> +    vfio_unmask_intx(vdev);
> +
> +    DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel disabled\n",
> +            __func__, vdev->host.domain, vdev->host.bus,
> +            vdev->host.slot, vdev->host.function);
> +#endif
> +}
> +
> +static void vfio_update_irq(PCIDevice *pdev)
> +{
> +    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> +    PCIINTxRoute route;
> +
> +    if (vdev->interrupt != VFIO_INT_INTx) {
> +        return;
> +    }
> +
> +    route = vfio_pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
> +
> +    if (!vfio_pci_intx_route_changed(&vdev->intx.route, &route)) {
> +        return; /* Nothing changed */
> +    }
> +
> +    DPRINTF("%s(%04x:%02x:%02x.%x) IRQ moved %d -> %d\n", __func__,
> +            vdev->host.domain, vdev->host.bus, vdev->host.slot,
> +            vdev->host.function, vdev->intx.route.irq, route.irq);
> +
> +    vfio_disable_intx_kvm(vdev);
> +
> +    vdev->intx.route = route;
> +
> +    if (route.mode != PCI_INTX_ENABLED) {
> +        return;
> +    }
> +
> +    vfio_enable_intx_kvm(vdev);
> +
> +    /* Re-enable the interrupt in cased we missed an EOI */
> +    vfio_eoi(vdev);
> +}
> +
>  static int vfio_enable_intx(VFIODevice *vdev)
>  {
>      uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
> @@ -262,6 +479,9 @@ static int vfio_enable_intx(VFIODevice *vdev)
>      vfio_disable_interrupts(vdev);
>  
>      vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
> +    vdev->intx.route = vfio_pci_device_route_intx_to_irq(&vdev->pdev,
> +                                                         vdev->intx.pin);
> +
>      ret = event_notifier_init(&vdev->intx.interrupt, 0);
>      if (ret) {
>          error_report("vfio: Error: event_notifier_init failed\n");
> @@ -290,6 +510,8 @@ static int vfio_enable_intx(VFIODevice *vdev)
>          return -errno;
>      }
>  
> +    vfio_enable_intx_kvm(vdev);
> +
>      vdev->interrupt = VFIO_INT_INTx;
>  
>      DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
> @@ -303,6 +525,7 @@ static void vfio_disable_intx(VFIODevice *vdev)
>      int fd;
>  
>      qemu_del_timer(vdev->intx.mmap_timer);
> +    vfio_disable_intx_kvm(vdev);
>      vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
>      vdev->intx.pending = false;
>      qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
> @@ -1870,6 +2093,7 @@ static int vfio_initfn(PCIDevice *pdev)
>      if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
>          vdev->intx.mmap_timer = qemu_new_timer_ms(vm_clock,
>                                                    vfio_intx_mmap_enable, vdev);
> +        pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq);
>          ret = vfio_enable_intx(vdev);
>          if (ret) {
>              goto out_teardown;
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html