Re: [Qemu-devel] [RFC v2 3/6] vfio: add vfio-platform support

Alex Williamson <alex.williamson@xxxxxxxxxx> · Fri, 18 Apr 2014 08:54:21 -0600

On Wed, 2014-04-09 at 16:33 +0100, Eric Auger wrote:
> From: Kim Phillips <kim.phillips@xxxxxxxxxx>
> 
> Functions for which PCI and platform device support share are moved
> into common.c.  The common vfio_{get,put}_group() get an additional
> argument, a pointer to a vfio_reset_handler(), for which to pass on to
> qemu_register_reset, but only if it exists (the platform device code
> currently passes a NULL as its reset_handler).
> 
> For the platform device code, we basically use SysBusDevice
> instead of PCIDevice.  Since realize() returns void, unlike
> PCIDevice's initfn, error codes are moved into the
> error message text with %m.
> 
> Currently only MMIO access is supported at this time.
> 
> The perceived path for future QEMU development is:
> 
> - add support for interrupts
> - verify and test platform dev unmap path
> - test existing PCI path for any regressions
> - add support for creating platform devices on the qemu command line
>   - currently device address specification is hardcoded for test
>     development on Calxeda Midway's fff51000.ethernet device
> - reset is not supported and registration of reset functions is
>   bypassed for platform devices.
>   - there is no standard means of resetting a platform device,
>     unsure if it suffices to be handled at device--VFIO binding time
> 
> Signed-off-by: Kim Phillips <kim.phillips@xxxxxxxxxx>
> 
> [1] http://www.spinics.net/lists/kvm-arm/msg08195.html
> ---
>  hw/vfio/Makefile.objs |   2 +
>  hw/vfio/common.c      | 486 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  hw/vfio/pci.c         | 480 ++-----------------------------------------------
>  hw/vfio/platform.c    | 381 +++++++++++++++++++++++++++++++++++++++
>  hw/vfio/vfio-common.h |  55 ++++++
>  5 files changed, 937 insertions(+), 467 deletions(-)
>  create mode 100644 hw/vfio/common.c
>  create mode 100644 hw/vfio/platform.c
>  create mode 100644 hw/vfio/vfio-common.h
> 
> diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs
> index 31c7dab..c5c76fe 100644
> --- a/hw/vfio/Makefile.objs
> +++ b/hw/vfio/Makefile.objs
> @@ -1,3 +1,5 @@
>  ifeq ($(CONFIG_LINUX), y)
> +obj-$(CONFIG_SOFTMMU) += common.o
>  obj-$(CONFIG_PCI) += pci.o
> +obj-$(CONFIG_SOFTMMU) += platform.o
>  endif
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> new file mode 100644
> index 0000000..9d1f723
> --- /dev/null
> +++ b/hw/vfio/common.c
> @@ -0,0 +1,486 @@
> +/*
> + * vfio based device assignment support
> + *
> + * Copyright Red Hat, Inc. 2012
> + *
> + * Authors:
> + *  Alex Williamson <alex.williamson@xxxxxxxxxx>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + * Based on qemu-kvm device-assignment:
> + *  Adapted for KVM by Qumranet.
> + *  Copyright (c) 2007, Neocleus, Alex Novik (alex@xxxxxxxxxxxx)
> + *  Copyright (c) 2007, Neocleus, Guy Zana (guy@xxxxxxxxxxxx)
> + *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@xxxxxxxxxxxx)
> + *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@xxxxxxxxxx)
> + *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@xxxxxxxxxx)
> + */
> +
> +#include <dirent.h>
> +#include <linux/vfio.h>
> +#include <sys/ioctl.h>
> +#include <sys/mman.h>
> +#include <sys/stat.h>
> +#include <sys/types.h>
> +#include <unistd.h>
> +
> +#include "config.h"
> +#include "exec/address-spaces.h"
> +#include "exec/memory.h"
> +#include "hw/pci/msi.h"
> +#include "hw/pci/msix.h"
> +#include "hw/pci/pci.h"

I expect these pci includes aren't really needed

> +#include "qemu-common.h"
> +#include "qemu/error-report.h"
> +#include "qemu/event_notifier.h"
> +#include "qemu/queue.h"
> +#include "qemu/range.h"
> +#include "sysemu/kvm.h"
> +#include "sysemu/sysemu.h"
> +
> +#include "vfio-common.h"
> +
> +#define DEBUG_VFIO
> +#ifdef DEBUG_VFIO
> +#define DPRINTF(fmt, ...) \
> +    do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0)
> +#else
> +#define DPRINTF(fmt, ...) \
> +    do { } while (0)
> +#endif

DEBUG_VFIO should probably be in vfio-common.h

> +
> +static QLIST_HEAD(, VFIOContainer)
> +    container_list = QLIST_HEAD_INITIALIZER(container_list);
> +
> +QLIST_HEAD(, VFIOGroup)
> +    group_list = QLIST_HEAD_INITIALIZER(group_list);
> +
> +
> +struct VFIODevice;

I don't see where this is needed.

> +
> +#ifdef CONFIG_KVM
> +/*
> + * We have a single VFIO pseudo device per KVM VM.  Once created it lives
> + * for the life of the VM.  Closing the file descriptor only drops our
> + * reference to it and the device's reference to kvm.  Therefore once
> + * initialized, this file descriptor is only released on QEMU exit and
> + * we'll re-use it should another vfio device be attached before then.
> + */
> +static int vfio_kvm_device_fd = -1;
> +#endif
> +
> +/*
> + * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
> + */
> +static int vfio_dma_unmap(VFIOContainer *container,
> +                          hwaddr iova, ram_addr_t size)
> +{
> +    struct vfio_iommu_type1_dma_unmap unmap = {
> +        .argsz = sizeof(unmap),
> +        .flags = 0,
> +        .iova = iova,
> +        .size = size,
> +    };
> +
> +    if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
> +        DPRINTF("VFIO_UNMAP_DMA: %d\n", -errno);
> +        return -errno;
> +    }
> +
> +    return 0;
> +}
> +
> +static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
> +                        ram_addr_t size, void *vaddr, bool readonly)
> +{
> +    struct vfio_iommu_type1_dma_map map = {
> +        .argsz = sizeof(map),
> +        .flags = VFIO_DMA_MAP_FLAG_READ,
> +        .vaddr = (__u64)(uintptr_t)vaddr,
> +        .iova = iova,
> +        .size = size,
> +    };
> +
> +    if (!readonly) {
> +        map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
> +    }
> +
> +    /*
> +     * Try the mapping, if it fails with EBUSY, unmap the region and try
> +     * again.  This shouldn't be necessary, but we sometimes see it in
> +     * the the VGA ROM space.
> +     */
> +    if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
> +        (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
> +         ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
> +        return 0;
> +    }
> +
> +    DPRINTF("VFIO_MAP_DMA: %d\n", -errno);
> +    return -errno;
> +}
> +
> +static bool vfio_listener_skipped_section(MemoryRegionSection *section)
> +{
> +    return !memory_region_is_ram(section->mr) ||
> +           /*
> +            * Sizing an enabled 64-bit BAR can cause spurious mappings to
> +            * addresses in the upper part of the 64-bit address space.  These
> +            * are never accessed by the CPU and beyond the address width of
> +            * some IOMMU hardware.  TODO: VFIO should tell us the IOMMU width.
> +            */
> +           section->offset_within_address_space & (1ULL << 63);
> +}
> +
> +static void vfio_listener_region_add(MemoryListener *listener,
> +                                     MemoryRegionSection *section)
> +{
> +    VFIOContainer *container = container_of(listener, VFIOContainer,
> +                                            iommu_data.type1.listener);
> +    hwaddr iova, end;
> +    void *vaddr;
> +    int ret;
> +
> +    assert(!memory_region_is_iommu(section->mr));
> +
> +    if (vfio_listener_skipped_section(section)) {
> +        DPRINTF("SKIPPING region_add %"HWADDR_PRIx" - %"PRIx64"\n",
> +                section->offset_within_address_space,
> +                section->offset_within_address_space +
> +                int128_get64(int128_sub(section->size, int128_one())));
> +        return;
> +    }
> +
> +    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
> +                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
> +        error_report("%s received unaligned region", __func__);
> +        return;
> +    }
> +
> +    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
> +    end = (section->offset_within_address_space + int128_get64(section->size)) &
> +          TARGET_PAGE_MASK;
> +
> +    if (iova >= end) {
> +        return;
> +    }
> +
> +    vaddr = memory_region_get_ram_ptr(section->mr) +
> +            section->offset_within_region +
> +            (iova - section->offset_within_address_space);
> +
> +    DPRINTF("region_add %"HWADDR_PRIx" - %"HWADDR_PRIx" [%p]\n",
> +            iova, end - 1, vaddr);
> +
> +    memory_region_ref(section->mr);
> +    ret = vfio_dma_map(container, iova, end - iova, vaddr, section->readonly);
> +    if (ret) {
> +        error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
> +                     "0x%"HWADDR_PRIx", %p) = %d (%m)",
> +                     container, iova, end - iova, vaddr, ret);
> +
> +        /*
> +         * On the initfn path, store the first error in the container so we
> +         * can gracefully fail.  Runtime, there's not much we can do other
> +         * than throw a hardware error.
> +         */
> +        if (!container->iommu_data.type1.initialized) {
> +            if (!container->iommu_data.type1.error) {
> +                container->iommu_data.type1.error = ret;
> +            }
> +        } else {
> +            hw_error("vfio: DMA mapping failed, unable to continue");
> +        }
> +    }
> +}
> +
> +static void vfio_listener_region_del(MemoryListener *listener,
> +                                     MemoryRegionSection *section)
> +{
> +    VFIOContainer *container = container_of(listener, VFIOContainer,
> +                                            iommu_data.type1.listener);
> +    hwaddr iova, end;
> +    int ret;
> +
> +    if (vfio_listener_skipped_section(section)) {
> +        DPRINTF("SKIPPING region_del %"HWADDR_PRIx" - %"PRIx64"\n",
> +                section->offset_within_address_space,
> +                section->offset_within_address_space +
> +                int128_get64(int128_sub(section->size, int128_one())));
> +        return;
> +    }
> +
> +    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
> +                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
> +        error_report("%s received unaligned region", __func__);
> +        return;
> +    }
> +
> +    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
> +    end = (section->offset_within_address_space + int128_get64(section->size)) &
> +          TARGET_PAGE_MASK;
> +
> +    if (iova >= end) {
> +        return;
> +    }
> +
> +    DPRINTF("region_del %"HWADDR_PRIx" - %"HWADDR_PRIx"\n",
> +            iova, end - 1);
> +
> +    ret = vfio_dma_unmap(container, iova, end - iova);
> +    memory_region_unref(section->mr);
> +    if (ret) {
> +        error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
> +                     "0x%"HWADDR_PRIx") = %d (%m)",
> +                     container, iova, end - iova, ret);
> +    }
> +}
> +
> +static MemoryListener vfio_memory_listener = {
> +    .region_add = vfio_listener_region_add,
> +    .region_del = vfio_listener_region_del,
> +};
> +
> +static void vfio_listener_release(VFIOContainer *container)
> +{
> +    memory_listener_unregister(&container->iommu_data.type1.listener);
> +}
> +
> +static void vfio_kvm_device_add_group(VFIOGroup *group)
> +{
> +#ifdef CONFIG_KVM
> +    struct kvm_device_attr attr = {
> +        .group = KVM_DEV_VFIO_GROUP,
> +        .attr = KVM_DEV_VFIO_GROUP_ADD,
> +        .addr = (uint64_t)(unsigned long)&group->fd,
> +    };
> +
> +    if (!kvm_enabled()) {
> +        return;
> +    }
> +
> +    if (vfio_kvm_device_fd < 0) {
> +        struct kvm_create_device cd = {
> +            .type = KVM_DEV_TYPE_VFIO,
> +        };
> +
> +        if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
> +            DPRINTF("KVM_CREATE_DEVICE: %m\n");
> +            return;
> +        }
> +
> +        vfio_kvm_device_fd = cd.fd;
> +    }
> +
> +    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
> +        error_report("Failed to add group %d to KVM VFIO device: %m",
> +                     group->groupid);
> +    }
> +#endif
> +}
> +
> +static void vfio_kvm_device_del_group(VFIOGroup *group)
> +{
> +#ifdef CONFIG_KVM
> +    struct kvm_device_attr attr = {
> +        .group = KVM_DEV_VFIO_GROUP,
> +        .attr = KVM_DEV_VFIO_GROUP_DEL,
> +        .addr = (uint64_t)(unsigned long)&group->fd,
> +    };
> +
> +    if (vfio_kvm_device_fd < 0) {
> +        return;
> +    }
> +
> +    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
> +        error_report("Failed to remove group %d from KVM VFIO device: %m",
> +                     group->groupid);
> +    }
> +#endif
> +}
> +
> +static int vfio_connect_container(VFIOGroup *group)
> +{
> +    VFIOContainer *container;
> +    int ret, fd;
> +
> +    if (group->container) {
> +        return 0;
> +    }
> +
> +    QLIST_FOREACH(container, &container_list, next) {
> +        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
> +            group->container = container;
> +            QLIST_INSERT_HEAD(&container->group_list, group, container_next);
> +            return 0;
> +        }
> +    }
> +
> +    fd = qemu_open("/dev/vfio/vfio", O_RDWR);
> +    if (fd < 0) {
> +        error_report("vfio: failed to open /dev/vfio/vfio: %m");
> +        return -errno;
> +    }
> +
> +    ret = ioctl(fd, VFIO_GET_API_VERSION);
> +    if (ret != VFIO_API_VERSION) {
> +        error_report("vfio: supported vfio version: %d, "
> +                     "reported version: %d", VFIO_API_VERSION, ret);
> +        close(fd);
> +        return -EINVAL;
> +    }
> +
> +    container = g_malloc0(sizeof(*container));
> +    container->fd = fd;
> +
> +    if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
> +        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
> +        if (ret) {
> +            error_report("vfio: failed to set group container: %m");
> +            g_free(container);
> +            close(fd);
> +            return -errno;
> +        }
> +
> +        ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
> +        if (ret) {
> +            error_report("vfio: failed to set iommu for container: %m");
> +            g_free(container);
> +            close(fd);
> +            return -errno;
> +        }
> +
> +        container->iommu_data.type1.listener = vfio_memory_listener;
> +        container->iommu_data.release = vfio_listener_release;
> +
> +        memory_listener_register(&container->iommu_data.type1.listener,
> +                                 &address_space_memory);
> +
> +        if (container->iommu_data.type1.error) {
> +            ret = container->iommu_data.type1.error;
> +            vfio_listener_release(container);
> +            g_free(container);
> +            close(fd);
> +            error_report("vfio: memory listener initialization failed for container");
> +            return ret;
> +        }
> +
> +        container->iommu_data.type1.initialized = true;
> +
> +    } else {
> +        error_report("vfio: No available IOMMU models");
> +        g_free(container);
> +        close(fd);
> +        return -EINVAL;
> +    }
> +
> +    QLIST_INIT(&container->group_list);
> +    QLIST_INSERT_HEAD(&container_list, container, next);
> +
> +    group->container = container;
> +    QLIST_INSERT_HEAD(&container->group_list, group, container_next);
> +
> +    return 0;
> +}
> +
> +static void vfio_disconnect_container(VFIOGroup *group)
> +{
> +    VFIOContainer *container = group->container;
> +
> +    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
> +        error_report("vfio: error disconnecting group %d from container",
> +                     group->groupid);
> +    }
> +
> +    QLIST_REMOVE(group, container_next);
> +    group->container = NULL;
> +
> +    if (QLIST_EMPTY(&container->group_list)) {
> +        if (container->iommu_data.release) {
> +            container->iommu_data.release(container);
> +        }
> +        QLIST_REMOVE(container, next);
> +        DPRINTF("vfio_disconnect_container: close container->fd\n");
> +        close(container->fd);
> +        g_free(container);
> +    }
> +}
> +
> +VFIOGroup *vfio_get_group(int groupid, QEMUResetHandler *reset_handler)
> +{
> +    VFIOGroup *group;
> +    char path[32];
> +    struct vfio_group_status status = { .argsz = sizeof(status) };
> +
> +    QLIST_FOREACH(group, &group_list, next) {
> +        if (group->groupid == groupid) {
> +            return group;
> +        }
> +    }
> +
> +    group = g_malloc0(sizeof(*group));
> +
> +    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
> +    group->fd = qemu_open(path, O_RDWR);
> +    if (group->fd < 0) {
> +        error_report("vfio: error opening %s: %m", path);
> +        g_free(group);
> +        return NULL;
> +    }
> +
> +    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
> +        error_report("vfio: error getting group status: %m");
> +        close(group->fd);
> +        g_free(group);
> +        return NULL;
> +    }
> +
> +    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
> +        error_report("vfio: error, group %d is not viable, please ensure "
> +                     "all devices within the iommu_group are bound to their "
> +                     "vfio bus driver.", groupid);
> +        close(group->fd);
> +        g_free(group);
> +        return NULL;
> +    }
> +
> +    group->groupid = groupid;
> +    QLIST_INIT(&group->device_list);
> +
> +    if (vfio_connect_container(group)) {
> +        error_report("vfio: failed to setup container for group %d", groupid);
> +        close(group->fd);
> +        g_free(group);
> +        return NULL;
> +    }
> +
> +    if (QLIST_EMPTY(&group_list) && reset_handler) {
> +        qemu_register_reset(reset_handler, NULL);
> +    }
> +
> +    QLIST_INSERT_HEAD(&group_list, group, next);
> +
> +    vfio_kvm_device_add_group(group);
> +
> +    return group;
> +}
> +
> +void vfio_put_group(VFIOGroup *group, QEMUResetHandler *reset_handler)
> +{
> +    if (!QLIST_EMPTY(&group->device_list)) {
> +        return;
> +    }
> +
> +    vfio_kvm_device_del_group(group);
> +    vfio_disconnect_container(group);
> +    QLIST_REMOVE(group, next);
> +    DPRINTF("vfio_put_group: close group->fd\n");
> +    close(group->fd);
> +    g_free(group);
> +
> +    if (QLIST_EMPTY(&group_list) && reset_handler) {
> +        qemu_unregister_reset(reset_handler, NULL);
> +    }

The reset_handler stuff needs work.  We can theoretically support both
PCI and platform devices simultaneously, but this would only allow one
to register a reset handler and one to remove it (which may not be the
same one). 

> +}
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 9cf5b84..9e70d68 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -1,5 +1,5 @@
>  /*
> - * vfio based device assignment support
> + * vfio based device assignment support - PCI devices
>   *
>   * Copyright Red Hat, Inc. 2012
>   *
> @@ -40,6 +40,8 @@
>  #include "sysemu/kvm.h"
>  #include "sysemu/sysemu.h"

There were probably some includes that could be cleaned out with the
split.

> +#include "vfio-common.h"
> +
>  /* #define DEBUG_VFIO */
>  #ifdef DEBUG_VFIO
>  #define DPRINTF(fmt, ...) \

DEBUG_VFIO is duplicated here... move to vfio-common.h

> @@ -55,6 +57,8 @@
>  #define VFIO_ALLOW_KVM_MSI 1
>  #define VFIO_ALLOW_KVM_MSIX 1
>  
> +extern QLIST_HEAD(, VFIOGroup) group_list;

Define in vfio-common.h?

> +
>  struct VFIODevice;
>  
>  typedef struct VFIOQuirk {
> @@ -135,25 +139,6 @@ enum {
>  
>  struct VFIOGroup;
>  
> -typedef struct VFIOType1 {
> -    MemoryListener listener;
> -    int error;
> -    bool initialized;
> -} VFIOType1;
> -
> -typedef struct VFIOContainer {
> -    int fd; /* /dev/vfio/vfio, empowered by the attached groups */
> -    struct {
> -        /* enable abstraction to support various iommu backends */
> -        union {
> -            VFIOType1 type1;
> -        };
> -        void (*release)(struct VFIOContainer *);
> -    } iommu_data;
> -    QLIST_HEAD(, VFIOGroup) group_list;
> -    QLIST_ENTRY(VFIOContainer) next;
> -} VFIOContainer;
> -
>  /* Cache of MSI-X setup plus extra mmap and memory region for split BAR map */
>  typedef struct VFIOMSIXInfo {
>      uint8_t table_bar;
> @@ -200,15 +185,6 @@ typedef struct VFIODevice {
>      bool rom_read_failed;
>  } VFIODevice;
>  
> -typedef struct VFIOGroup {
> -    int fd;
> -    int groupid;
> -    VFIOContainer *container;
> -    QLIST_HEAD(, VFIODevice) device_list;
> -    QLIST_ENTRY(VFIOGroup) next;
> -    QLIST_ENTRY(VFIOGroup) container_next;
> -} VFIOGroup;
> -
>  typedef struct VFIORomBlacklistEntry {
>      uint16_t vendor_id;
>      uint16_t device_id;
> @@ -234,23 +210,6 @@ static const VFIORomBlacklistEntry romblacklist[] = {
>  
>  #define MSIX_CAP_LENGTH 12
>  
> -static QLIST_HEAD(, VFIOContainer)
> -    container_list = QLIST_HEAD_INITIALIZER(container_list);
> -
> -static QLIST_HEAD(, VFIOGroup)
> -    group_list = QLIST_HEAD_INITIALIZER(group_list);
> -
> -#ifdef CONFIG_KVM
> -/*
> - * We have a single VFIO pseudo device per KVM VM.  Once created it lives
> - * for the life of the VM.  Closing the file descriptor only drops our
> - * reference to it and the device's reference to kvm.  Therefore once
> - * initialized, this file descriptor is only released on QEMU exit and
> - * we'll re-use it should another vfio device be attached before then.
> - */
> -static int vfio_kvm_device_fd = -1;
> -#endif
> -
>  static void vfio_disable_interrupts(VFIODevice *vdev);
>  static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
>  static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
> @@ -2180,183 +2139,6 @@ static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
>  }
>  
>  /*
> - * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
> - */
> -static int vfio_dma_unmap(VFIOContainer *container,
> -                          hwaddr iova, ram_addr_t size)
> -{
> -    struct vfio_iommu_type1_dma_unmap unmap = {
> -        .argsz = sizeof(unmap),
> -        .flags = 0,
> -        .iova = iova,
> -        .size = size,
> -    };
> -
> -    if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
> -        DPRINTF("VFIO_UNMAP_DMA: %d\n", -errno);
> -        return -errno;
> -    }
> -
> -    return 0;
> -}
> -
> -static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
> -                        ram_addr_t size, void *vaddr, bool readonly)
> -{
> -    struct vfio_iommu_type1_dma_map map = {
> -        .argsz = sizeof(map),
> -        .flags = VFIO_DMA_MAP_FLAG_READ,
> -        .vaddr = (__u64)(uintptr_t)vaddr,
> -        .iova = iova,
> -        .size = size,
> -    };
> -
> -    if (!readonly) {
> -        map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
> -    }
> -
> -    /*
> -     * Try the mapping, if it fails with EBUSY, unmap the region and try
> -     * again.  This shouldn't be necessary, but we sometimes see it in
> -     * the the VGA ROM space.
> -     */
> -    if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
> -        (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
> -         ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
> -        return 0;
> -    }
> -
> -    DPRINTF("VFIO_MAP_DMA: %d\n", -errno);
> -    return -errno;
> -}
> -
> -static bool vfio_listener_skipped_section(MemoryRegionSection *section)
> -{
> -    return !memory_region_is_ram(section->mr) ||
> -           /*
> -            * Sizing an enabled 64-bit BAR can cause spurious mappings to
> -            * addresses in the upper part of the 64-bit address space.  These
> -            * are never accessed by the CPU and beyond the address width of
> -            * some IOMMU hardware.  TODO: VFIO should tell us the IOMMU width.
> -            */
> -           section->offset_within_address_space & (1ULL << 63);
> -}
> -
> -static void vfio_listener_region_add(MemoryListener *listener,
> -                                     MemoryRegionSection *section)
> -{
> -    VFIOContainer *container = container_of(listener, VFIOContainer,
> -                                            iommu_data.type1.listener);
> -    hwaddr iova, end;
> -    void *vaddr;
> -    int ret;
> -
> -    assert(!memory_region_is_iommu(section->mr));
> -
> -    if (vfio_listener_skipped_section(section)) {
> -        DPRINTF("SKIPPING region_add %"HWADDR_PRIx" - %"PRIx64"\n",
> -                section->offset_within_address_space,
> -                section->offset_within_address_space +
> -                int128_get64(int128_sub(section->size, int128_one())));
> -        return;
> -    }
> -
> -    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
> -                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
> -        error_report("%s received unaligned region", __func__);
> -        return;
> -    }
> -
> -    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
> -    end = (section->offset_within_address_space + int128_get64(section->size)) &
> -          TARGET_PAGE_MASK;
> -
> -    if (iova >= end) {
> -        return;
> -    }
> -
> -    vaddr = memory_region_get_ram_ptr(section->mr) +
> -            section->offset_within_region +
> -            (iova - section->offset_within_address_space);
> -
> -    DPRINTF("region_add %"HWADDR_PRIx" - %"HWADDR_PRIx" [%p]\n",
> -            iova, end - 1, vaddr);
> -
> -    memory_region_ref(section->mr);
> -    ret = vfio_dma_map(container, iova, end - iova, vaddr, section->readonly);
> -    if (ret) {
> -        error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
> -                     "0x%"HWADDR_PRIx", %p) = %d (%m)",
> -                     container, iova, end - iova, vaddr, ret);
> -
> -        /*
> -         * On the initfn path, store the first error in the container so we
> -         * can gracefully fail.  Runtime, there's not much we can do other
> -         * than throw a hardware error.
> -         */
> -        if (!container->iommu_data.type1.initialized) {
> -            if (!container->iommu_data.type1.error) {
> -                container->iommu_data.type1.error = ret;
> -            }
> -        } else {
> -            hw_error("vfio: DMA mapping failed, unable to continue");
> -        }
> -    }
> -}
> -
> -static void vfio_listener_region_del(MemoryListener *listener,
> -                                     MemoryRegionSection *section)
> -{
> -    VFIOContainer *container = container_of(listener, VFIOContainer,
> -                                            iommu_data.type1.listener);
> -    hwaddr iova, end;
> -    int ret;
> -
> -    if (vfio_listener_skipped_section(section)) {
> -        DPRINTF("SKIPPING region_del %"HWADDR_PRIx" - %"PRIx64"\n",
> -                section->offset_within_address_space,
> -                section->offset_within_address_space +
> -                int128_get64(int128_sub(section->size, int128_one())));
> -        return;
> -    }
> -
> -    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
> -                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
> -        error_report("%s received unaligned region", __func__);
> -        return;
> -    }
> -
> -    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
> -    end = (section->offset_within_address_space + int128_get64(section->size)) &
> -          TARGET_PAGE_MASK;
> -
> -    if (iova >= end) {
> -        return;
> -    }
> -
> -    DPRINTF("region_del %"HWADDR_PRIx" - %"HWADDR_PRIx"\n",
> -            iova, end - 1);
> -
> -    ret = vfio_dma_unmap(container, iova, end - iova);
> -    memory_region_unref(section->mr);
> -    if (ret) {
> -        error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
> -                     "0x%"HWADDR_PRIx") = %d (%m)",
> -                     container, iova, end - iova, ret);
> -    }
> -}
> -
> -static MemoryListener vfio_memory_listener = {
> -    .region_add = vfio_listener_region_add,
> -    .region_del = vfio_listener_region_del,
> -};
> -
> -static void vfio_listener_release(VFIOContainer *container)
> -{
> -    memory_listener_unregister(&container->iommu_data.type1.listener);
> -}
> -
> -/*
>   * Interrupt setup
>   */
>  static void vfio_disable_interrupts(VFIODevice *vdev)
> @@ -3221,244 +3003,8 @@ static void vfio_pci_reset_handler(void *opaque)
>      }
>  }
>  
> -static void vfio_kvm_device_add_group(VFIOGroup *group)
> -{
> -#ifdef CONFIG_KVM
> -    struct kvm_device_attr attr = {
> -        .group = KVM_DEV_VFIO_GROUP,
> -        .attr = KVM_DEV_VFIO_GROUP_ADD,
> -        .addr = (uint64_t)(unsigned long)&group->fd,
> -    };
> -
> -    if (!kvm_enabled()) {
> -        return;
> -    }
> -
> -    if (vfio_kvm_device_fd < 0) {
> -        struct kvm_create_device cd = {
> -            .type = KVM_DEV_TYPE_VFIO,
> -        };
> -
> -        if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
> -            DPRINTF("KVM_CREATE_DEVICE: %m\n");
> -            return;
> -        }
> -
> -        vfio_kvm_device_fd = cd.fd;
> -    }
> -
> -    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
> -        error_report("Failed to add group %d to KVM VFIO device: %m",
> -                     group->groupid);
> -    }
> -#endif
> -}
> -
> -static void vfio_kvm_device_del_group(VFIOGroup *group)
> -{
> -#ifdef CONFIG_KVM
> -    struct kvm_device_attr attr = {
> -        .group = KVM_DEV_VFIO_GROUP,
> -        .attr = KVM_DEV_VFIO_GROUP_DEL,
> -        .addr = (uint64_t)(unsigned long)&group->fd,
> -    };
> -
> -    if (vfio_kvm_device_fd < 0) {
> -        return;
> -    }
> -
> -    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
> -        error_report("Failed to remove group %d from KVM VFIO device: %m",
> -                     group->groupid);
> -    }
> -#endif
> -}
> -
> -static int vfio_connect_container(VFIOGroup *group)
> -{
> -    VFIOContainer *container;
> -    int ret, fd;
> -
> -    if (group->container) {
> -        return 0;
> -    }
> -
> -    QLIST_FOREACH(container, &container_list, next) {
> -        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
> -            group->container = container;
> -            QLIST_INSERT_HEAD(&container->group_list, group, container_next);
> -            return 0;
> -        }
> -    }
> -
> -    fd = qemu_open("/dev/vfio/vfio", O_RDWR);
> -    if (fd < 0) {
> -        error_report("vfio: failed to open /dev/vfio/vfio: %m");
> -        return -errno;
> -    }
> -
> -    ret = ioctl(fd, VFIO_GET_API_VERSION);
> -    if (ret != VFIO_API_VERSION) {
> -        error_report("vfio: supported vfio version: %d, "
> -                     "reported version: %d", VFIO_API_VERSION, ret);
> -        close(fd);
> -        return -EINVAL;
> -    }
> -
> -    container = g_malloc0(sizeof(*container));
> -    container->fd = fd;
> -
> -    if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
> -        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
> -        if (ret) {
> -            error_report("vfio: failed to set group container: %m");
> -            g_free(container);
> -            close(fd);
> -            return -errno;
> -        }
> -
> -        ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
> -        if (ret) {
> -            error_report("vfio: failed to set iommu for container: %m");
> -            g_free(container);
> -            close(fd);
> -            return -errno;
> -        }
> -
> -        container->iommu_data.type1.listener = vfio_memory_listener;
> -        container->iommu_data.release = vfio_listener_release;
> -
> -        memory_listener_register(&container->iommu_data.type1.listener,
> -                                 &address_space_memory);
> -
> -        if (container->iommu_data.type1.error) {
> -            ret = container->iommu_data.type1.error;
> -            vfio_listener_release(container);
> -            g_free(container);
> -            close(fd);
> -            error_report("vfio: memory listener initialization failed for container");
> -            return ret;
> -        }
> -
> -        container->iommu_data.type1.initialized = true;
> -
> -    } else {
> -        error_report("vfio: No available IOMMU models");
> -        g_free(container);
> -        close(fd);
> -        return -EINVAL;
> -    }
> -
> -    QLIST_INIT(&container->group_list);
> -    QLIST_INSERT_HEAD(&container_list, container, next);
> -
> -    group->container = container;
> -    QLIST_INSERT_HEAD(&container->group_list, group, container_next);
> -
> -    return 0;
> -}
> -
> -static void vfio_disconnect_container(VFIOGroup *group)
> -{
> -    VFIOContainer *container = group->container;
> -
> -    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
> -        error_report("vfio: error disconnecting group %d from container",
> -                     group->groupid);
> -    }
> -
> -    QLIST_REMOVE(group, container_next);
> -    group->container = NULL;
> -
> -    if (QLIST_EMPTY(&container->group_list)) {
> -        if (container->iommu_data.release) {
> -            container->iommu_data.release(container);
> -        }
> -        QLIST_REMOVE(container, next);
> -        DPRINTF("vfio_disconnect_container: close container->fd\n");
> -        close(container->fd);
> -        g_free(container);
> -    }
> -}
> -
> -static VFIOGroup *vfio_get_group(int groupid)
> -{
> -    VFIOGroup *group;
> -    char path[32];
> -    struct vfio_group_status status = { .argsz = sizeof(status) };
> -
> -    QLIST_FOREACH(group, &group_list, next) {
> -        if (group->groupid == groupid) {
> -            return group;
> -        }
> -    }
> -
> -    group = g_malloc0(sizeof(*group));
> -
> -    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
> -    group->fd = qemu_open(path, O_RDWR);
> -    if (group->fd < 0) {
> -        error_report("vfio: error opening %s: %m", path);
> -        g_free(group);
> -        return NULL;
> -    }
> -
> -    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
> -        error_report("vfio: error getting group status: %m");
> -        close(group->fd);
> -        g_free(group);
> -        return NULL;
> -    }
> -
> -    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
> -        error_report("vfio: error, group %d is not viable, please ensure "
> -                     "all devices within the iommu_group are bound to their "
> -                     "vfio bus driver.", groupid);
> -        close(group->fd);
> -        g_free(group);
> -        return NULL;
> -    }
> -
> -    group->groupid = groupid;
> -    QLIST_INIT(&group->device_list);
> -
> -    if (vfio_connect_container(group)) {
> -        error_report("vfio: failed to setup container for group %d", groupid);
> -        close(group->fd);
> -        g_free(group);
> -        return NULL;
> -    }
> -
> -    if (QLIST_EMPTY(&group_list)) {
> -        qemu_register_reset(vfio_pci_reset_handler, NULL);
> -    }
> -
> -    QLIST_INSERT_HEAD(&group_list, group, next);
> -
> -    vfio_kvm_device_add_group(group);
> -
> -    return group;
> -}
> -
> -static void vfio_put_group(VFIOGroup *group)
> -{
> -    if (!QLIST_EMPTY(&group->device_list)) {
> -        return;
> -    }
> -
> -    vfio_kvm_device_del_group(group);
> -    vfio_disconnect_container(group);
> -    QLIST_REMOVE(group, next);
> -    DPRINTF("vfio_put_group: close group->fd\n");
> -    close(group->fd);
> -    g_free(group);
> -
> -    if (QLIST_EMPTY(&group_list)) {
> -        qemu_unregister_reset(vfio_pci_reset_handler, NULL);
> -    }
> -}
> -
> -static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev)
> +static int vfio_get_device(VFIOGroup *group, const char *name,
> +                           struct VFIODevice *vdev)

Why?

>  {
>      struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
>      struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
> @@ -3485,7 +3031,7 @@ static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev)
>          goto error;
>      }
>  
> -    DPRINTF("Device %s flags: %u, regions: %u, irgs: %u\n", name,
> +    DPRINTF("Device %s flags: %u, regions: %u, irqs: %u\n", name,

???

>              dev_info.flags, dev_info.num_regions, dev_info.num_irqs);
>  
>      if (!(dev_info.flags & VFIO_DEVICE_FLAGS_PCI)) {
> @@ -3768,7 +3314,7 @@ static int vfio_initfn(PCIDevice *pdev)
>      DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__, vdev->host.domain,
>              vdev->host.bus, vdev->host.slot, vdev->host.function, groupid);
>  
> -    group = vfio_get_group(groupid);
> +    group = vfio_get_group(groupid, vfio_pci_reset_handler);
>      if (!group) {
>          error_report("vfio: failed to get group %d", groupid);
>          return -ENOENT;
> @@ -3785,7 +3331,7 @@ static int vfio_initfn(PCIDevice *pdev)
>              pvdev->host.function == vdev->host.function) {
>  
>              error_report("vfio: error: device %s is already attached", path);
> -            vfio_put_group(group);
> +            vfio_put_group(group, vfio_pci_reset_handler);
>              return -EBUSY;
>          }
>      }
> @@ -3793,7 +3339,7 @@ static int vfio_initfn(PCIDevice *pdev)
>      ret = vfio_get_device(group, path, vdev);
>      if (ret) {
>          error_report("vfio: failed to get device %s", path);
> -        vfio_put_group(group);
> +        vfio_put_group(group, vfio_pci_reset_handler);
>          return ret;
>      }
>  
> @@ -3879,7 +3425,7 @@ out_teardown:
>  out_put:
>      g_free(vdev->emulated_config_bits);
>      vfio_put_device(vdev);
> -    vfio_put_group(group);
> +    vfio_put_group(group, vfio_pci_reset_handler);
>      return ret;
>  }
>  
> @@ -3899,7 +3445,7 @@ static void vfio_exitfn(PCIDevice *pdev)
>      g_free(vdev->emulated_config_bits);
>      g_free(vdev->rom);
>      vfio_put_device(vdev);
> -    vfio_put_group(group);
> +    vfio_put_group(group, vfio_pci_reset_handler);
>  }
>  
>  static void vfio_pci_reset(DeviceState *dev)
> diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
> new file mode 100644
> index 0000000..138fb13
> --- /dev/null
> +++ b/hw/vfio/platform.c
> @@ -0,0 +1,381 @@
> +/*
> + * vfio based device assignment support - platform devices
> + *
> + * Copyright Linaro Limited, 2014
> + *
> + * Authors:
> + *  Kim Phillips <kim.phillips@xxxxxxxxxx>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + * Based on vfio based PCI device assignment support:
> + *  Copyright Red Hat, Inc. 2012
> + */
> +
> +#include <dirent.h>
> +#include <linux/vfio.h>
> +#include <sys/ioctl.h>
> +#include <sys/mman.h>
> +#include <sys/stat.h>
> +#include <sys/types.h>
> +#include <unistd.h>
> +
> +#include "config.h"
> +#include "exec/address-spaces.h"
> +#include "exec/memory.h"
> +#include "qemu-common.h"
> +#include "qemu/error-report.h"
> +#include "qemu/event_notifier.h"
> +#include "qemu/queue.h"
> +#include "qemu/range.h"
> +#include "sysemu/kvm.h"
> +#include "sysemu/sysemu.h"
> +#include "hw/qdev-properties.h"
> +#include "migration/vmstate.h"
> +#include "hw/hw.h"
> +#include "hw/sysbus.h"
> +
> +#include "vfio-common.h"
> +
> +#define DEBUG_VFIO
> +#ifdef DEBUG_VFIO
> +#define DPRINTF(fmt, ...) \
> +    do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0)
> +#else
> +#define DPRINTF(fmt, ...) \
> +    do { } while (0)
> +#endif
> +
> +/* Extra debugging, trap acceleration paths for more logging */
> +#define VFIO_ALLOW_MMAP 1

Maybe there should be a common debug section in vfio-common.h?

> +
> +#define TYPE_VFIO_PLATFORM "vfio-platform"
> +
> +typedef struct VFIORegion {
> +    off_t fd_offset; /* offset of region within device fd */
> +    int fd; /* device fd, allows us to pass VFIORegion as opaque data */
> +    MemoryRegion mem; /* slow, read/write access */
> +    MemoryRegion mmap_mem; /* direct mapped access */
> +    void *mmap;
> +    size_t size;
> +    uint32_t flags; /* VFIO region flags (rd/wr/mmap) */
> +    uint8_t nr; /* cache the region number for debug */
> +} VFIORegion;
> +
> +typedef struct VFIODevice {
> +    SysBusDevice sbdev;
> +    int fd;
> +    int num_regions;
> +    VFIORegion *regions;
> +    QLIST_ENTRY(VFIODevice) next;
> +    struct VFIOGroup *group;
> +    char *name;
> +} VFIODevice;

I suspect we really need:

(in vfio-common.h)
typedef struct VFIODevice {
	QLIST_ENTRY(VFIODevice) next;
	struct VFIOGroup *group;
	enum VFIODeviceType;
	/* maybe some of the other common stuff */
} VFIODevice;

(per pci/platform)
typedef struct VFIOPlatformDevice {
	VFIODevice vdev;
	/* device specific stuff */
} VFIOPlatformDevice;

Otherwise I don't see how the device_list on VFIOGroup works.

> +
> +static int vfio_mmap_region(VFIODevice *vdev, VFIORegion *region,
> +                         MemoryRegion *mem, MemoryRegion *submem,
> +                         void **map, size_t size, off_t offset,
> +                         const char *name)
> +{
> +    int ret = 0;
> +
> +    if (VFIO_ALLOW_MMAP && size && region->flags & VFIO_REGION_INFO_FLAG_MMAP) {
> +        int prot = 0;
> +        ret = 0;
> +
> +        if (region->flags & VFIO_REGION_INFO_FLAG_READ) {
> +            prot |= PROT_READ;
> +        }
> +
> +        if (region->flags & VFIO_REGION_INFO_FLAG_WRITE) {
> +            prot |= PROT_WRITE;
> +        }
> +
> +        *map = mmap(NULL, size, prot, MAP_SHARED,
> +                    region->fd, region->fd_offset + offset);
> +        if (*map == MAP_FAILED) {
> +            ret = -errno;
> +            *map = NULL;
> +            goto error;
> +        }
> +
> +        memory_region_init_ram_ptr(submem, OBJECT(vdev), name, size, *map);
> +    }
> +
> +    memory_region_add_subregion(mem, offset, submem);
> +
> +error:
> +    return ret;
> +}
> +
> +/*
> + * IO Port/MMIO - Beware of the endians, VFIO is always little endian
> + */
> +static void vfio_region_write(void *opaque, hwaddr addr,
> +                              uint64_t data, unsigned size)
> +{
> +    VFIORegion *region = opaque;
> +    union {
> +        uint8_t byte;
> +        uint16_t word;
> +        uint32_t dword;
> +        uint64_t qword;
> +    } buf;
> +
> +    switch (size) {
> +    case 1:
> +        buf.byte = data;
> +        break;
> +    case 2:
> +        buf.word = data;
> +        break;
> +    case 4:
> +        buf.dword = data;
> +        break;
> +    default:
> +        hw_error("vfio: unsupported write size, %d bytes\n", size);
> +        break;
> +    }
> +
> +    if (pwrite(region->fd, &buf, size, region->fd_offset + addr) != size) {
> +        error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
> +                     __func__, addr, data, size);
> +    }
> +
> +    DPRINTF("%s(region %d+0x%"HWADDR_PRIx", 0x%"PRIx64", %d)\n",
> +             __func__, region->nr, addr, data, size);
> +}
> +
> +static uint64_t vfio_region_read(void *opaque, hwaddr addr, unsigned size)
> +{
> +    VFIORegion *region = opaque;
> +    union {
> +        uint8_t byte;
> +        uint16_t word;
> +        uint32_t dword;
> +        uint64_t qword;
> +    } buf;
> +    uint64_t data = 0;
> +
> +    if (pread(region->fd, &buf, size, region->fd_offset + addr) != size) {
> +        error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
> +                     __func__, addr, size);
> +        return (uint64_t)-1;
> +    }
> +
> +    switch (size) {
> +    case 1:
> +        data = buf.byte;
> +        break;
> +    case 2:
> +        data = buf.word;
> +        break;
> +    case 4:
> +        data = buf.dword;
> +        break;
> +    default:
> +        hw_error("vfio: unsupported read size, %d bytes\n", size);
> +        break;
> +    }
> +
> +    DPRINTF("%s(region %d+0x%"HWADDR_PRIx", %d) = 0x%"PRIx64"\n",
> +            __func__, region->nr, addr, size, data);
> +
> +    return data;
> +}
> +
> +static const MemoryRegionOps vfio_region_ops = {
> +    .read = vfio_region_read,
> +    .write = vfio_region_write,
> +    .endianness = DEVICE_NATIVE_ENDIAN,
> +};
> +
> +static void vfio_map_region(VFIODevice *vdev, int nr)
> +{
> +    VFIORegion *region = &vdev->regions[nr];
> +    unsigned size = region->size;
> +    char name[64];
> +
> +    snprintf(name, sizeof(name), "VFIO %s region %d", vdev->name, nr);
> +
> +    /* A "slow" read/write mapping underlies all regions  */
> +    memory_region_init_io(&region->mem, OBJECT(vdev), &vfio_region_ops,
> +                          region, name, size);
> +
> +    strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
> +    if (vfio_mmap_region(vdev, region, &region->mem,
> +                         &region->mmap_mem, &region->mmap, size, 0, name)) {
> +        error_report("%s unsupported. Performance may be slow", name);
> +    }
> +}
> +
> +static int vfio_get_device(VFIOGroup *group, const char *name,
> +                           struct VFIODevice *vdev)
> +{
> +    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
> +    struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
> +    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
> +    int ret, i;
> +
> +    ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
> +    if (ret < 0) {
> +        error_report("vfio: error getting device %s from group %d: %m",
> +                     name, group->groupid);
> +        error_printf("Verify all devices in group %d are bound to the vfio "
> +                     "platform driver and are not already in use\n",
> +                     group->groupid);
> +        return ret;
> +    }
> +
> +    vdev->fd = ret;
> +    vdev->group = group;
> +    QLIST_INSERT_HEAD(&group->device_list, vdev, next);
> +
> +    /* Sanity check device */
> +    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &dev_info);
> +    if (ret) {
> +        error_report("vfio: error getting device info: %m");
> +        goto error;
> +    }
> +
> +    DPRINTF("Device %s flags: %u, regions: %u, irqs: %u\n", name,
> +            dev_info.flags, dev_info.num_regions, dev_info.num_irqs);
> +
> +    vdev->regions = g_malloc0(sizeof(VFIORegion) * dev_info.num_regions);
> +    if (!vdev->regions) {
> +            error_report("vfio: Error allocating space for %d regions",
> +                         dev_info.num_regions);
> +            ret = -ENOMEM;
> +            goto error;
> +    }
> +
> +    vdev->num_regions = dev_info.num_regions;
> +
> +    for (i = 0; i < dev_info.num_regions; i++) {
> +        reg_info.index = i;
> +
> +        ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
> +        if (ret) {
> +            error_report("vfio: Error getting region %d info: %m", i);
> +            goto error;
> +        }
> +
> +        DPRINTF("Device %s region %d:\n", name, i);
> +        DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
> +                (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
> +                (unsigned long)reg_info.flags);
> +
> +        vdev->regions[i].flags = reg_info.flags;
> +        vdev->regions[i].size = reg_info.size;
> +        vdev->regions[i].fd_offset = reg_info.offset;
> +        vdev->regions[i].fd = vdev->fd;
> +        vdev->regions[i].nr = i;
> +    }

Perhaps more of this could be in common.c if we had a generic VFIODevice
embedded in a VFIOPlatformDevice or VFIOPCIDevice.  We could fill in
high-level things like number of regions/irqs, fd, name, etc, leaving
interpretation and probing of each region/irq to device specific code.

> +
> +error:
> +    if (ret) {
> +        g_free(vdev->regions);
> +        QLIST_REMOVE(vdev, next);
> +        vdev->group = NULL;
> +        close(vdev->fd);
> +    }
> +    return ret;
> +}
> +
> +static void vfio_platform_realize(DeviceState *dev, Error **errp)
> +{
> +    SysBusDevice *sbdev = SYS_BUS_DEVICE(dev);
> +    VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, sbdev, sbdev);
> +    VFIOGroup *group;
> +    char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
> +    ssize_t len;
> +    struct stat st;
> +    int groupid, i, ret;
> +
> +    /* TODO: pass device name on command line */
> +    vdev->name = malloc(PATH_MAX);
> +    strcpy(vdev->name, "fff51000.ethernet");
> +
> +    /* Check that the host device exists */
> +    snprintf(path, sizeof(path), "/sys/bus/platform/devices/%s/", vdev->name);
> +    if (stat(path, &st) < 0) {
> +        error_report("vfio: error: no such host device: %s", path);
> +        return;
> +    }
> +
> +    strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
> +
> +    len = readlink(path, iommu_group_path, PATH_MAX);
> +    if (len <= 0) {
> +        error_report("vfio: error no iommu_group for device");
> +        return;
> +    }
> +
> +    iommu_group_path[len] = 0;
> +    group_name = basename(iommu_group_path);
> +
> +    if (sscanf(group_name, "%d", &groupid) != 1) {
> +        error_report("vfio: error reading %s: %m", path);
> +        return;
> +    }
> +
> +    DPRINTF("%s(%s) group %d\n", __func__, vdev->name, groupid);
> +
> +    group = vfio_get_group(groupid, NULL);
> +    if (!group) {
> +        error_report("vfio: failed to get group %d", groupid);
> +        return;
> +    }
> +
> +    snprintf(path, sizeof(path), "%s", vdev->name);
> +
> +    QLIST_FOREACH(pvdev, &group->device_list, next) {
> +        if (strcmp(pvdev->name, vdev->name) == 0) {
> +            error_report("vfio: error: device %s is already attached", path);
> +            vfio_put_group(group, NULL);
> +            return;
> +        }
> +    }
> +
> +    ret = vfio_get_device(group, path, vdev);
> +    if (ret) {
> +        error_report("vfio: failed to get device %s", path);
> +        vfio_put_group(group, NULL);
> +        return;
> +    }
> +
> +    for (i = 0; i < vdev->num_regions; i++) {
> +        vfio_map_region(vdev, i);
> +        sysbus_init_mmio(sbdev, &vdev->regions[i].mem);
> +    }
> +}
> +
> +static const VMStateDescription vfio_platform_vmstate = {
> +    .name = TYPE_VFIO_PLATFORM,
> +    .unmigratable = 1,
> +};
> +
> +static void vfio_platform_dev_class_init(ObjectClass *klass, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +
> +    dc->realize = vfio_platform_realize;
> +    dc->vmsd = &vfio_platform_vmstate;
> +    dc->desc = "VFIO-based platform device assignment";
> +    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
> +}
> +
> +static const TypeInfo vfio_platform_dev_info = {
> +    .name = TYPE_VFIO_PLATFORM,
> +    .parent = TYPE_SYS_BUS_DEVICE,
> +    .instance_size = sizeof(VFIODevice),
> +    .class_init = vfio_platform_dev_class_init,
> +};
> +
> +static void register_vfio_platform_dev_type(void)
> +{
> +    type_register_static(&vfio_platform_dev_info);
> +}
> +
> +type_init(register_vfio_platform_dev_type)
> diff --git a/hw/vfio/vfio-common.h b/hw/vfio/vfio-common.h
> new file mode 100644
> index 0000000..21148ef
> --- /dev/null
> +++ b/hw/vfio/vfio-common.h
> @@ -0,0 +1,55 @@
> +/*
> + * common header for vfio based device assignment support
> + *
> + * Copyright Red Hat, Inc. 2012
> + *
> + * Authors:
> + *  Alex Williamson <alex.williamson@xxxxxxxxxx>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + * Based on qemu-kvm device-assignment:
> + *  Adapted for KVM by Qumranet.
> + *  Copyright (c) 2007, Neocleus, Alex Novik (alex@xxxxxxxxxxxx)
> + *  Copyright (c) 2007, Neocleus, Guy Zana (guy@xxxxxxxxxxxx)
> + *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@xxxxxxxxxxxx)
> + *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@xxxxxxxxxx)
> + *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@xxxxxxxxxx)
> + */
> +
> +struct VFIODevice;
> +
> +struct VFIOGroup;
> +
> +typedef struct VFIOType1 {
> +    MemoryListener listener;
> +    int error;
> +    bool initialized;
> +} VFIOType1;
> +
> +typedef struct VFIOContainer {
> +    int fd; /* /dev/vfio/vfio, empowered by the attached groups */
> +    struct {
> +        /* enable abstraction to support various iommu backends */
> +        union {
> +            VFIOType1 type1;
> +        };
> +        void (*release)(struct VFIOContainer *);
> +    } iommu_data;
> +    QLIST_HEAD(, VFIOGroup) group_list;
> +    QLIST_ENTRY(VFIOContainer) next;
> +} VFIOContainer;
> +
> +typedef struct VFIOGroup {
> +    int fd;
> +    int groupid;
> +    VFIOContainer *container;
> +    QLIST_HEAD(, VFIODevice) device_list;
> +    QLIST_ENTRY(VFIOGroup) next;
> +    QLIST_ENTRY(VFIOGroup) container_next;
> +} VFIOGroup;
> +
> +
> +VFIOGroup *vfio_get_group(int groupid, QEMUResetHandler *reset_handler);
> +void vfio_put_group(VFIOGroup *group, QEMUResetHandler *reset_handler);

_______________________________________________
kvmarm mailing list
kvmarm@xxxxxxxxxxxxxxxxxxxxx
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm