Re: [Qemu-devel] [PATCH v2 3/4] vfio/quirks: ioeventfd quirk acceleration

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Alex,

On 05/01/2018 06:43 PM, Alex Williamson wrote:
> The NVIDIA BAR0 quirks virtualize the PCI config space mirrors found
> in device MMIO space.  Normally PCI config space is considered a slow
> path and further optimization is unnecessary, however NVIDIA uses a
> register here to enable the MSI interrupt to re-trigger.  Exiting to
> QEMU for this MSI-ACK handling can therefore rate limit our interrupt
> handling.  Fortunately the MSI-ACK write is easily detected since the
> quirk MemoryRegion otherwise has very few accesses, so simply looking
> for consecutive writes with the same data is sufficient, in this case
> 10 consecutive writes with the same data and size is arbitrarily
> chosen.  We configure the KVM ioeventfd with data match, so there's
> no risk of triggering for the wrong data or size, but we do risk that
> pathological driver behavior might consume all of QEMU's file
> descriptors, so we cap ourselves to 10 ioeventfds for this purpose.
> 
> In support of the above, generic ioeventfd infrastructure is added
> for vfio quirks.  This automatically initializes an ioeventfd list
> per quirk, disables and frees ioeventfds on exit, and allows
> ioeventfds marked as dynamic to be dropped on device reset.  The
> rationale for this latter feature is that useful ioeventfds may
> depend on specific driver behavior and since we necessarily place a
> cap on our use of ioeventfds, a machine reset is a reasonable point
> at which to assume a new driver and re-profile.
> 
> Signed-off-by: Alex Williamson <alex.williamson@xxxxxxxxxx>
> ---
>  hw/vfio/pci-quirks.c |  174 +++++++++++++++++++++++++++++++++++++++++++++++++-
>  hw/vfio/pci.c        |    2 +
>  hw/vfio/pci.h        |   15 ++++
>  hw/vfio/trace-events |    3 +
>  4 files changed, 192 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c
> index f0947cbf152f..4cedc733bc0a 100644
> --- a/hw/vfio/pci-quirks.c
> +++ b/hw/vfio/pci-quirks.c
> @@ -12,6 +12,7 @@
>  
>  #include "qemu/osdep.h"
>  #include "qemu/error-report.h"
> +#include "qemu/main-loop.h"
>  #include "qemu/range.h"
>  #include "qapi/error.h"
>  #include "qapi/visitor.h"
> @@ -202,6 +203,7 @@ typedef struct VFIOConfigMirrorQuirk {
>      uint32_t offset;
>      uint8_t bar;
>      MemoryRegion *mem;
> +    uint8_t data[];
>  } VFIOConfigMirrorQuirk;
>  
>  static uint64_t vfio_generic_quirk_mirror_read(void *opaque,
> @@ -278,12 +280,98 @@ static const MemoryRegionOps vfio_ati_3c3_quirk = {
>  static VFIOQuirk *vfio_quirk_alloc(int nr_mem)
>  {
>      VFIOQuirk *quirk = g_new0(VFIOQuirk, 1);
> +    QLIST_INIT(&quirk->ioeventfds);
>      quirk->mem = g_new0(MemoryRegion, nr_mem);
>      quirk->nr_mem = nr_mem;
>  
>      return quirk;
>  }
>  
> +static void vfio_ioeventfd_exit(VFIOIOEventFD *ioeventfd)
> +{
> +    QLIST_REMOVE(ioeventfd, next);
> +    memory_region_del_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
> +                              ioeventfd->match_data, ioeventfd->data,
> +                              &ioeventfd->e);
> +    qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e), NULL, NULL, NULL);
> +    event_notifier_cleanup(&ioeventfd->e);
> +    trace_vfio_ioeventfd_exit(memory_region_name(ioeventfd->mr),
> +                              (uint64_t)ioeventfd->addr, ioeventfd->size,
> +                              ioeventfd->data);
> +    g_free(ioeventfd);
> +}
> +
> +static void vfio_drop_dynamic_eventfds(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
> +{
> +    VFIOIOEventFD *ioeventfd, *tmp;
> +
> +    QLIST_FOREACH_SAFE(ioeventfd, &quirk->ioeventfds, next, tmp) {
> +        if (ioeventfd->dynamic) {
> +            vfio_ioeventfd_exit(ioeventfd);
> +        }
> +    }
> +}
> +
> +static void vfio_ioeventfd_handler(void *opaque)
> +{
> +    VFIOIOEventFD *ioeventfd = opaque;
> +
> +    if (event_notifier_test_and_clear(&ioeventfd->e)) {
> +        vfio_region_write(ioeventfd->region, ioeventfd->region_addr,
> +                          ioeventfd->data, ioeventfd->size);
> +        trace_vfio_ioeventfd_handler(memory_region_name(ioeventfd->mr),
> +                                     (uint64_t)ioeventfd->addr, ioeventfd->size,
> +                                     ioeventfd->data);
> +    }
> +}
> +
> +static VFIOIOEventFD *vfio_ioeventfd_init(VFIOPCIDevice *vdev,
> +                                          MemoryRegion *mr, hwaddr addr,
> +                                          unsigned size, uint64_t data,
> +                                          VFIORegion *region,
> +                                          hwaddr region_addr, bool dynamic)
> +{
> +    VFIOIOEventFD *ioeventfd;
> +
> +    if (vdev->no_kvm_ioeventfd) {
> +        return NULL;
> +    }
> +
> +    ioeventfd = g_malloc0(sizeof(*ioeventfd));
> +
> +    if (event_notifier_init(&ioeventfd->e, 0)) {
> +        g_free(ioeventfd);
> +        return NULL;
> +    }
> +
> +    /*
> +     * MemoryRegion and relative offset, plus additional ioeventfd setup
> +     * parameters for configuring and later tearing down KVM ioeventfd.
> +     */
> +    ioeventfd->mr = mr;
> +    ioeventfd->addr = addr;
> +    ioeventfd->size = size;
> +    ioeventfd->data = data;
> +    ioeventfd->match_data = true;
> +    ioeventfd->dynamic = dynamic;
> +    /*
> +     * VFIORegion and relative offset for implementing the userspace
> +     * handler.  data & size fields shared for both uses.
> +     */
> +    ioeventfd->region = region;
> +    ioeventfd->region_addr = region_addr;
> +
> +    qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
> +                        vfio_ioeventfd_handler, NULL, ioeventfd);
> +    memory_region_add_eventfd(ioeventfd->mr, ioeventfd->addr,
> +                              ioeventfd->size, ioeventfd->match_data,
> +                              ioeventfd->data, &ioeventfd->e);
> +    trace_vfio_ioeventfd_init(memory_region_name(mr), (uint64_t)addr,
> +                              size, data);
> +
> +    return ioeventfd;
> +}
> +
>  static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev)
>  {
>      VFIOQuirk *quirk;
> @@ -719,6 +807,17 @@ static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice *vdev, int nr)
>      trace_vfio_quirk_nvidia_bar5_probe(vdev->vbasedev.name);
>  }
>  
> +typedef struct LastDataSet {
> +    hwaddr addr;
> +    uint64_t data;
> +    unsigned size;
> +    int hits;
> +    int added;
> +} LastDataSet;
> +
> +#define MAX_DYN_IOEVENTFD 10
> +#define HITS_FOR_IOEVENTFD 10
> +
>  /*
>   * Finally, BAR0 itself.  We want to redirect any accesses to either
>   * 0x1800 or 0x88000 through the PCI config space access functions.
> @@ -729,6 +828,7 @@ static void vfio_nvidia_quirk_mirror_write(void *opaque, hwaddr addr,
>      VFIOConfigMirrorQuirk *mirror = opaque;
>      VFIOPCIDevice *vdev = mirror->vdev;
>      PCIDevice *pdev = &vdev->pdev;
> +    LastDataSet *last = (LastDataSet *)&mirror->data;
>  
>      vfio_generic_quirk_mirror_write(opaque, addr, data, size);
>  
> @@ -743,6 +843,60 @@ static void vfio_nvidia_quirk_mirror_write(void *opaque, hwaddr addr,
>                            addr + mirror->offset, data, size);
>          trace_vfio_quirk_nvidia_bar0_msi_ack(vdev->vbasedev.name);
>      }
> +
> +    /*
> +     * Automatically add an ioeventfd to handle any repeated write with the
> +     * same data and size above the standard PCI config space header.  This is
> +     * primarily expected to accelerate the MSI-ACK behavior, such as noted
> +     * above.  Current hardware/drivers should trigger an ioeventfd at config
> +     * offset 0x704 (region offset 0x88704), with data 0x0, size 4.
> +     *
> +     * The criteria of 10 successive hits is arbitrary but reliably adds the
> +     * MSI-ACK region.  Note that as some writes are bypassed via the ioeventfd,
> +     * the remaining ones have a greater chance of being seen successively.
> +     * To avoid the pathological case of burning up all of QEMU's open file
> +     * handles, arbitrarily limit this algorithm from adding no more than 10
> +     * ioeventfds, print an error if we would have added an 11th, and then
> +     * stop counting.
> +     */
> +    if (!vdev->no_kvm_ioeventfd &&
> +        addr > PCI_STD_HEADER_SIZEOF && last->added < MAX_DYN_IOEVENTFD + 1) {
nit: <= MAX_DYN_IOEVENTFD?
> +        if (addr != last->addr || data != last->data || size != last->size) {
> +            last->addr = addr;
> +            last->data = data;
> +            last->size = size;
> +            last->hits = 1;
> +        } else if (++last->hits >= HITS_FOR_IOEVENTFD) {
> +            if (last->added < MAX_DYN_IOEVENTFD) {
> +                VFIOIOEventFD *ioeventfd;
> +                ioeventfd = vfio_ioeventfd_init(vdev, mirror->mem, addr, size,
> +                                        data, &vdev->bars[mirror->bar].region,
> +                                        mirror->offset + addr, true);
> +                if (ioeventfd) {
> +                    VFIOQuirk *quirk;
> +
> +                    QLIST_FOREACH(quirk,
> +                                  &vdev->bars[mirror->bar].quirks, next) {
> +                        if (quirk->data == mirror) {
> +                            QLIST_INSERT_HEAD(&quirk->ioeventfds,
> +                                              ioeventfd, next);
> +                            break;
> +                        }
> +                    }
> +
> +                    assert(quirk != NULL); /* Check not found */
> +
> +                    last->added++;
> +                }
> +            } else {
> +                last->added++;
> +
> +                error_report("NVIDIA ioeventfd queue full for %s, unable to "
> +                             "accelerate 0x%"HWADDR_PRIx", data 0x%"PRIx64", "
> +                             "size %u", vdev->vbasedev.name, addr, data, size);
nit: warn_report?

> +            }
> +        }
> +    }
>  }
>  
>  static const MemoryRegionOps vfio_nvidia_mirror_quirk = {
> @@ -751,6 +905,16 @@ static const MemoryRegionOps vfio_nvidia_mirror_quirk = {
>      .endianness = DEVICE_LITTLE_ENDIAN,
>  };
>  
> +static void vfio_nvidia_bar0_quirk_reset(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
> +{
> +    VFIOConfigMirrorQuirk *mirror = quirk->data;
> +    LastDataSet *last = (LastDataSet *)&mirror->data;
> +
> +    memset(last, 0, sizeof(*last));
> +
> +    vfio_drop_dynamic_eventfds(vdev, quirk);
> +}
> +
>  static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr)
>  {
>      VFIOQuirk *quirk;
> @@ -763,7 +927,8 @@ static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr)
>      }
>  
>      quirk = vfio_quirk_alloc(1);
> -    mirror = quirk->data = g_malloc0(sizeof(*mirror));
> +    quirk->reset = vfio_nvidia_bar0_quirk_reset;
> +    mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
>      mirror->mem = quirk->mem;
>      mirror->vdev = vdev;
>      mirror->offset = 0x88000;
> @@ -781,7 +946,8 @@ static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr)
>      /* The 0x1800 offset mirror only seems to get used by legacy VGA */
>      if (vdev->vga) {
>          quirk = vfio_quirk_alloc(1);
> -        mirror = quirk->data = g_malloc0(sizeof(*mirror));
> +        quirk->reset = vfio_nvidia_bar0_quirk_reset;
> +        mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
>          mirror->mem = quirk->mem;
>          mirror->vdev = vdev;
>          mirror->offset = 0x1800;
> @@ -1668,6 +1834,10 @@ void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr)
>      int i;
>  
>      QLIST_FOREACH(quirk, &bar->quirks, next) {
> +        while (!QLIST_EMPTY(&quirk->ioeventfds)) {
> +            vfio_ioeventfd_exit(QLIST_FIRST(&quirk->ioeventfds));
> +        }
> +
>          for (i = 0; i < quirk->nr_mem; i++) {
>              memory_region_del_subregion(bar->region.mem, &quirk->mem[i]);
>          }
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 65446fb42845..ba1239551115 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -3175,6 +3175,8 @@ static Property vfio_pci_dev_properties[] = {
>      DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
>      DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice,
>                       no_geforce_quirks, false),
> +    DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd,
> +                     false),
>      DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice, vendor_id, PCI_ANY_ID),
>      DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice, device_id, PCI_ANY_ID),
>      DEFINE_PROP_UINT32("x-pci-sub-vendor-id", VFIOPCIDevice,
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index 594a5bd00593..dbb3aca9b3d2 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -24,9 +24,23 @@
>  
>  struct VFIOPCIDevice;
>  
> +typedef struct VFIOIOEventFD {
> +    QLIST_ENTRY(VFIOIOEventFD) next;
> +    MemoryRegion *mr;
> +    hwaddr addr;
> +    unsigned size;
> +    uint64_t data;
> +    EventNotifier e;
> +    VFIORegion *region;
> +    hwaddr region_addr;
> +    bool match_data;
> +    bool dynamic;
maybe the "dynamic" semantics may be docuemnted in the code and not only
in the commit message.

Besides
Reviewed-by: Eric Auger <eric.auger@xxxxxxxxxx>

Thanks

Eric

> +} VFIOIOEventFD;
> +
>  typedef struct VFIOQuirk {
>      QLIST_ENTRY(VFIOQuirk) next;
>      void *data;
> +    QLIST_HEAD(, VFIOIOEventFD) ioeventfds;
>      int nr_mem;
>      MemoryRegion *mem;
>      void (*reset)(struct VFIOPCIDevice *vdev, struct VFIOQuirk *quirk);
> @@ -149,6 +163,7 @@ typedef struct VFIOPCIDevice {
>      bool no_kvm_msi;
>      bool no_kvm_msix;
>      bool no_geforce_quirks;
> +    bool no_kvm_ioeventfd;
>      VFIODisplay *dpy;
>  } VFIOPCIDevice;
>  
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index 20109cb7581f..f8f97d1ff90c 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -77,6 +77,9 @@ vfio_quirk_ati_bonaire_reset_no_smc(const char *name) "%s"
>  vfio_quirk_ati_bonaire_reset_timeout(const char *name) "%s"
>  vfio_quirk_ati_bonaire_reset_done(const char *name) "%s"
>  vfio_quirk_ati_bonaire_reset(const char *name) "%s"
> +vfio_ioeventfd_exit(const char *name, uint64_t addr, unsigned size, uint64_t data) "%s+0x%"PRIx64"[%d]:0x%"PRIx64
> +vfio_ioeventfd_handler(const char *name, uint64_t addr, unsigned size, uint64_t data) "%s+0x%"PRIx64"[%d] -> 0x%"PRIx64
> +vfio_ioeventfd_init(const char *name, uint64_t addr, unsigned size, uint64_t data) "%s+0x%"PRIx64"[%d]:0x%"PRIx64
>  vfio_pci_igd_bar4_write(const char *name, uint32_t index, uint32_t data, uint32_t base) "%s [0x%03x] 0x%08x -> 0x%08x"
>  vfio_pci_igd_bdsm_enabled(const char *name, int size) "%s %dMB"
>  vfio_pci_igd_opregion_enabled(const char *name) "%s"
> 
> 



[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux