> -----Original Message----- > From: Alexander Graf [mailto:agraf@xxxxxxx] > Sent: Monday, August 11, 2014 3:06 PM > To: Eric Auger; eric.auger@xxxxxx; christoffer.dall@xxxxxxxxxx; qemu- > devel@xxxxxxxxxx; Phillips Kim-R1AAHA; a.rigo@xxxxxxxxxxxxxxxxxxxxxx > Cc: will.deacon@xxxxxxx; kvmarm@xxxxxxxxxxxxxxxxxxxxx; > alex.williamson@xxxxxxxxxx; Bhushan Bharat-R65777; peter.maydell@xxxxxxxxxx; > Yoder Stuart-B08248; a.motakis@xxxxxxxxxxxxxxxxxxxxxx; patches@xxxxxxxxxx; > joel.schopp@xxxxxxx; Kim Phillips > Subject: Re: [PATCH v5 07/10] hw/vfio/platform: add vfio-platform support > > > On 09.08.14 16:25, Eric Auger wrote: > > Minimal VFIO platform implementation supporting > > - register space user mapping, > > - IRQ assignment based on eventfds handled on qemu side. > > > > irqfd kernel acceleration comes in a subsequent patch. > > > > Signed-off-by: Kim Phillips <kim.phillips@xxxxxxxxxx> > > Signed-off-by: Eric Auger <eric.auger@xxxxxxxxxx> > > > > --- > > > > v4 -> v5: > > - vfio-plaform.h included first > > - cleanup error handling in *populate*, vfio_get_device, > > vfio_enable_intp > > - vfio_put_device not called anymore > > - add some includes to follow vfio policy > > > > v3 -> v4: > > [Eric Auger] > > - merge of "vfio: Add initial IRQ support in platform device" > > to get a full functional patch although perfs are limited. > > - removal of unrealize function since I currently understand > > it is only used with device hot-plug feature. > > > > v2 -> v3: > > [Eric Auger] > > - further factorization between PCI and platform (VFIORegion, > > VFIODevice). same level of functionality. 
> > > > <= v2: > > [Kim Philipps] > > - Initial Creation of the device supporting register space mapping > > --- > > hw/vfio/Makefile.objs | 1 + > > hw/vfio/platform.c | 517 > ++++++++++++++++++++++++++++++++++++++++ > > include/hw/vfio/vfio-platform.h | 77 ++++++ > > 3 files changed, 595 insertions(+) > > create mode 100644 hw/vfio/platform.c > > create mode 100644 include/hw/vfio/vfio-platform.h > > > > diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs > > index e31f30e..c5c76fe 100644 > > --- a/hw/vfio/Makefile.objs > > +++ b/hw/vfio/Makefile.objs > > @@ -1,4 +1,5 @@ > > ifeq ($(CONFIG_LINUX), y) > > obj-$(CONFIG_SOFTMMU) += common.o > > obj-$(CONFIG_PCI) += pci.o > > +obj-$(CONFIG_SOFTMMU) += platform.o > > endif > > diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c > > new file mode 100644 > > index 0000000..f1a1b55 > > --- /dev/null > > +++ b/hw/vfio/platform.c > > @@ -0,0 +1,517 @@ > > +/* > > + * vfio based device assignment support - platform devices > > + * > > + * Copyright Linaro Limited, 2014 > > + * > > + * Authors: > > + * Kim Phillips <kim.phillips@xxxxxxxxxx> > > + * > > + * This work is licensed under the terms of the GNU GPL, version 2. See > > + * the COPYING file in the top-level directory. > > + * > > + * Based on vfio based PCI device assignment support: > > + * Copyright Red Hat, Inc. 
2012 > > + */ > > + > > +#include <linux/vfio.h> > > +#include <sys/ioctl.h> > > + > > +#include "hw/vfio/vfio-platform.h" > > +#include "qemu/error-report.h" > > +#include "qemu/range.h" > > +#include "sysemu/sysemu.h" > > +#include "exec/memory.h" > > +#include "qemu/queue.h" > > +#include "hw/sysbus.h" > > + > > +extern const MemoryRegionOps vfio_region_ops; > > +extern const MemoryListener vfio_memory_listener; > > +extern QLIST_HEAD(, VFIOGroup) group_list; > > +extern QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces; > > +void vfio_put_device(VFIOPlatformDevice *vdev); > > + > > +/* > > + * It is mandatory to pass a VFIOPlatformDevice since VFIODevice > > + * is not a QOM Object and cannot be passed to memory region functions > > +*/ > > +static void vfio_map_region(VFIOPlatformDevice *vdev, int nr) > > +{ > > + VFIORegion *region = vdev->regions[nr]; > > + unsigned size = region->size; > > + char name[64]; > > + > > + if (!size) { > > + return; > > + } > > + > > + snprintf(name, sizeof(name), "VFIO %s region %d", > > + vdev->vbasedev.name, nr); > > + > > + /* A "slow" read/write mapping underlies all regions */ > > + memory_region_init_io(&region->mem, OBJECT(vdev), &vfio_region_ops, > > + region, name, size); > > + > > + strncat(name, " mmap", sizeof(name) - strlen(name) - 1); > > + > > + if (vfio_mmap_region(OBJECT(vdev), region, &region->mem, > > + &region->mmap_mem, &region->mmap, size, 0, name)) { > > + error_report("%s unsupported. 
Performance may be slow", name); > > + } > > +} > > + > > +static void print_regions(VFIOPlatformDevice *vdev) > > +{ > > + int i; > > + > > + DPRINTF("Device \"%s\" counts %d region(s):\n", > > + vdev->vbasedev.name, vdev->vbasedev.num_regions); > > + > > + for (i = 0; i < vdev->vbasedev.num_regions; i++) { > > + DPRINTF("- region %d flags = 0x%lx, size = 0x%lx, " > > + "fd= %d, offset = 0x%lx\n", > > + vdev->regions[i]->nr, > > + (unsigned long)vdev->regions[i]->flags, > > + (unsigned long)vdev->regions[i]->size, > > + vdev->regions[i]->vbasedev->fd, > > + (unsigned long)vdev->regions[i]->fd_offset); > > + } > > +} > > + > > +static int vfio_populate_regions(VFIODevice *vbasedev) > > +{ > > + struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) }; > > + int i, ret = 0; > > + VFIOPlatformDevice *vdev = > > + container_of(vbasedev, VFIOPlatformDevice, vbasedev); > > + > > + vdev->regions = g_malloc0(sizeof(VFIORegion *) * vbasedev->num_regions); > > + > > + for (i = 0; i < vbasedev->num_regions; i++) { > > + vdev->regions[i] = g_malloc0(sizeof(VFIORegion)); > > + reg_info.index = i; > > + ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info); > > + if (ret) { > > + error_report("vfio: Error getting region %d info: %m", i); > > + goto error; > > + } > > + > > + vdev->regions[i]->flags = reg_info.flags; > > + vdev->regions[i]->size = reg_info.size; > > + vdev->regions[i]->fd_offset = reg_info.offset; > > + vdev->regions[i]->nr = i; > > + vdev->regions[i]->vbasedev = vbasedev; > > + } > > + print_regions(vdev); > > +error: > > + return ret; > > +} > > + > > +/* not implemented yet */ > > +static int vfio_platform_check_device(VFIODevice *vdev) > > +{ > > + return 0; > > +} > > + > > +/* not implemented yet */ > > +static bool vfio_platform_compute_needs_reset(VFIODevice *vdev) > > +{ > > +return false; > > +} > > + > > +/* not implemented yet */ > > +static int vfio_platform_hot_reset_multi(VFIODevice *vdev) > > +{ > > +return 0; > > +} > > + > > 
+/* > > + * eoi function is called on the first access to any MMIO region > > + * after an IRQ was triggered. It is assumed this access corresponds > > + * to the IRQ status register reset. > > + * With such a mechanism, a single IRQ can be handled at a time since > > + * there is no way to know which IRQ was completed by the guest. > > + * (we would need additional details about the IRQ status register mask) > > + */ > > +static void vfio_platform_eoi(VFIODevice *vbasedev) > > +{ > > + VFIOINTp *intp; > > + VFIOPlatformDevice *vdev = > > + container_of(vbasedev, VFIOPlatformDevice, vbasedev); > > + > > + QLIST_FOREACH(intp, &vdev->intp_list, next) { > > + if (intp->state == VFIO_IRQ_ACTIVE) { > > + DPRINTF("EOI IRQ #%d fd=%d\n", > > + intp->pin, event_notifier_get_fd(&intp->interrupt)); > > + intp->state = VFIO_IRQ_INACTIVE; > > + > > + /* deassert the virtual IRQ and unmask physical one */ > > + qemu_set_irq(intp->qemuirq, 0); > > + vfio_unmask_irqindex(vbasedev, intp->pin); > > + > > + /* a single IRQ can be active at a time */ > > + break; > > + } > > + } > > + > > + /* in case there are pending IRQs, handle them one at a time */ > > + if (!QSIMPLEQ_EMPTY(&vdev->pending_intp_queue)) { > > + intp = QSIMPLEQ_FIRST(&vdev->pending_intp_queue); > > + vfio_intp_interrupt(intp); We are calling vfio_intp_interrupt() with the physical interrupt enabled, while a comment in vfio_intp_interrupt() says the physical interrupt is disabled by VFIO. 
> > + QSIMPLEQ_REMOVE_HEAD(&vdev->pending_intp_queue, pqnext); > > + } > > +} > > + > > +/* > > + * enable/disable the fast path mode > > + * fast path = MMIO region is mmaped (no KVM TRAP) > > + * slow path = MMIO region is trapped and region callbacks are called > > + * slow path enables to trap the IRQ status register guest reset > > +*/ > > + > > +static void vfio_mmap_set_enabled(VFIOPlatformDevice *vdev, bool enabled) > > +{ > > + VFIORegion *region; > > + int i; > > + > > + DPRINTF("fast path = %d\n", enabled); > > + > > + for (i = 0; i < vdev->vbasedev.num_regions; i++) { > > + region = vdev->regions[i]; > > + > > + /* register space is unmapped to trap EOI */ > > + memory_region_set_enabled(&region->mmap_mem, enabled); > > + } > > +} > > + > > +/* > > + * Checks whether the IRQ is still pending. In the negative > > + * the fast path mode (where reg space is mmaped) can be restored. > > + * if the IRQ is still pending, we must keep on trapping IRQ status > > + * register reset with mmap disabled (slow path). > > + * the function is called on mmap_timer event. > > + * by construction a single fd is handled at a time. See EOI comment > > + * for additional details. 
> > + */ > > +static void vfio_intp_mmap_enable(void *opaque) > > +{ > > + VFIOINTp *tmp; > > + VFIOPlatformDevice *vdev = (VFIOPlatformDevice *)opaque; > > + > > + QLIST_FOREACH(tmp, &vdev->intp_list, next) { > > + if (tmp->state == VFIO_IRQ_ACTIVE) { > > + DPRINTF("IRQ #%d still active, stay in slow path\n", > > + tmp->pin); > > + timer_mod(vdev->mmap_timer, > > + qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + > > + vdev->mmap_timeout); > > + return; > > + } > > + } > > + DPRINTF("no active IRQ, restore fast path\n"); > > + vfio_mmap_set_enabled(vdev, true); > > +} > > + > > +/* > > + * The fd handler > > + */ > > +void vfio_intp_interrupt(void *opaque) > > +{ > > + int ret; > > + VFIOINTp *tmp, *intp = (VFIOINTp *)opaque; > > + VFIOPlatformDevice *vdev = intp->vdev; > > + bool one_active_irq = false; > > + > > + /* > > + * first check whether there is a pending IRQ > > + * in the positive the new IRQ cannot be handled until the > > + * active one is not completed. > > + * by construction the same IRQ as the pending one cannot hit > > + * since the physical IRQ was disabled by the VFIO driver > > + */ Here we assume the physical interrupt is disabled. 
> > + QLIST_FOREACH(tmp, &vdev->intp_list, next) { > > + if (tmp->state == VFIO_IRQ_ACTIVE) { > > + one_active_irq = true; > > + break; > > + } > > + } > > + if (one_active_irq) { > > + /* > > + * the new IRQ gets a pending status and is pushed in > > + * the pending queue > > + */ > > + intp->state = VFIO_IRQ_PENDING; > > + QSIMPLEQ_INSERT_TAIL(&vdev->pending_intp_queue, > > + intp, pqnext); > > + return; > > + } > > + > > + /* no active IRQ, the new IRQ can be forwarded to the guest */ > > + DPRINTF("Handle IRQ #%d (fd = %d)\n", > > + intp->pin, event_notifier_get_fd(&intp->interrupt)); > > + > > + ret = event_notifier_test_and_clear(&intp->interrupt); > > + if (!ret) { > > + DPRINTF("Error when clearing fd=%d\n", > > + event_notifier_get_fd(&intp->interrupt)); > > + } > > + > > + intp->state = VFIO_IRQ_ACTIVE; > > + > > + /* sets slow path */ > > + vfio_mmap_set_enabled(vdev, false); > > + > > + /* trigger the virtual IRQ */ > > + qemu_set_irq(intp->qemuirq, 1); > > + > > + /* schedule the mmap timer which will restore mmap path after EOI*/ > > + if (vdev->mmap_timeout) { > > + timer_mod(vdev->mmap_timer, > > + qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + > > + vdev->mmap_timeout); > > + } > > +} > > + > > +static int vfio_enable_intp(VFIODevice *vbasedev, unsigned int index) > > +{ > > + struct vfio_irq_set *irq_set; > > + int32_t *pfd; > > + int ret, argsz; > > + int device = vbasedev->fd; > > + VFIOPlatformDevice *vdev = > > + container_of(vbasedev, VFIOPlatformDevice, vbasedev); > > + SysBusDevice *sbdev = SYS_BUS_DEVICE(vdev); > > + VFIOINTp *intp; > > + > > + /* allocate and populate a new VFIOINTp structure put in a queue list */ > > + intp = g_malloc0(sizeof(*intp)); > > + intp->vdev = vdev; > > + intp->pin = index; > > + intp->state = VFIO_IRQ_INACTIVE; > > + sysbus_init_irq(sbdev, &intp->qemuirq); > > + > > + ret = event_notifier_init(&intp->interrupt, 0); > > + if (ret) { > > + g_free(intp); > > + error_report("vfio: Error: event_notifier_init failed "); > 
> + return ret; > > + } > > + > > + /* build the irq_set to be passed to the vfio kernel driver */ > > + argsz = sizeof(*irq_set) + sizeof(*pfd); > > + > > + irq_set = g_malloc0(argsz); > > + irq_set->argsz = argsz; > > + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; > > + irq_set->index = index; > > + irq_set->start = 0; > > + irq_set->count = 1; > > + pfd = (int32_t *)&irq_set->data; > > + > > + *pfd = event_notifier_get_fd(&intp->interrupt); > > + > > + DPRINTF("register fd=%d/irq index=%d to kernel\n", *pfd, index); > > + > > + qemu_set_fd_handler(*pfd, vfio_intp_interrupt, NULL, intp); > > + > > + /* > > + * pass the index/fd binding to the kernel driver so that it > > + * triggers this fd on HW IRQ > > + */ > > + ret = ioctl(device, VFIO_DEVICE_SET_IRQS, irq_set); > > + g_free(irq_set); > > + if (ret) { > > + error_report("vfio: Error: Failed to pass IRQ fd to the driver: %m"); > > + qemu_set_fd_handler(*pfd, NULL, NULL, NULL); > > + event_notifier_cleanup(&intp->interrupt); > > + return -errno; > > + } > > + > > + /* store the new intp in qlist */ > > + QLIST_INSERT_HEAD(&vdev->intp_list, intp, next); > > + return 0; > > +} > > + > > +static int vfio_populate_interrupts(VFIODevice *vbasedev) > > +{ > > + struct vfio_irq_info irq = { .argsz = sizeof(irq) }; > > + int i, ret; > > + VFIOPlatformDevice *vdev = > > + container_of(vbasedev, VFIOPlatformDevice, vbasedev); > > + > > + vdev->mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, > > + vfio_intp_mmap_enable, vdev); > > + > > + QSIMPLEQ_INIT(&vdev->pending_intp_queue); > > + > > + for (i = 0; i < vbasedev->num_irqs; i++) { > > + irq.index = i; > > + > > + DPRINTF("Retrieve IRQ info from vfio platform driver ...\n"); > > + > > + ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq); > > + if (ret) { > > + /* This can fail for an old kernel or legacy PCI dev */ > > + error_printf("vfio: error getting device %s irq info", > > + vbasedev->name); > > + } else { > > + DPRINTF("- IRQ 
index %d: count %d, flags=0x%x\n", > > + irq.index, irq.count, irq.flags); > > + > > + ret = vfio_enable_intp(vbasedev, irq.index); > > + if (ret) { > > + error_report("vfio: Error setting IRQ %d up", i); > > + return ret; > > + } > > + } > > + } > > + return 0; > > +} > > + > > +static VFIODeviceOps vfio_platform_ops = { > > + .vfio_compute_needs_reset = vfio_platform_compute_needs_reset, > > + .vfio_hot_reset_multi = vfio_platform_hot_reset_multi, > > + .vfio_eoi = vfio_platform_eoi, > > + .vfio_check_device = vfio_platform_check_device, > > + .vfio_populate_regions = vfio_populate_regions, > > + .vfio_populate_interrupts = vfio_populate_interrupts, > > +}; > > + > > +static int vfio_base_device_init(VFIODevice *vbasedev) > > +{ > > + VFIOGroup *group; > > + VFIODevice *vbasedev_iter; > > + char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name; > > + ssize_t len; > > + struct stat st; > > + int groupid; > > + int ret; > > + > > + /* name must be set prior to the call */ > > + if (!vbasedev->name) { > > + return -EINVAL; > > + } > > + > > + /* Check that the host device exists */ > > + snprintf(path, sizeof(path), "/sys/bus/platform/devices/%s/", > > + vbasedev->name); > > + > > + if (stat(path, &st) < 0) { > > + error_report("vfio: error: no such host device: %s", path); > > + return -errno; > > + } > > + > > + strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1); > > + len = readlink(path, iommu_group_path, sizeof(path)); > > + if (len <= 0 || len >= sizeof(path)) { > > + error_report("vfio: error no iommu_group for device"); > > + return len < 0 ? 
-errno : ENAMETOOLONG; > > + } > > + > > + iommu_group_path[len] = 0; > > + group_name = basename(iommu_group_path); > > + > > + if (sscanf(group_name, "%d", &groupid) != 1) { > > + error_report("vfio: error reading %s: %m", path); > > + return -errno; > > + } > > + > > + DPRINTF("%s(%s) group %d\n", __func__, vbasedev->name, groupid); > > + > > + group = vfio_get_group(groupid, &address_space_memory); > > + if (!group) { > > + error_report("vfio: failed to get group %d", groupid); > > + return -ENOENT; > > + } > > + > > + snprintf(path, sizeof(path), "%s", vbasedev->name); > > + > > + QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { > > + if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) { > > + error_report("vfio: error: device %s is already attached", path); > > + vfio_put_group(group); > > + return -EBUSY; > > + } > > + } > > + ret = vfio_get_device(group, path, vbasedev); > > + if (ret) { > > + error_report("vfio: failed to get device %s", path); > > + vfio_put_group(group); > > + } > > + return ret; > > +} > > + > > +void vfio_put_device(VFIOPlatformDevice *vdev) > > +{ > > + unsigned int i; > > + VFIODevice *vbasedev = &vdev->vbasedev; > > + > > + for (i = 0; i < vbasedev->num_regions; i++) { > > + g_free(vdev->regions[i]); > > + } > > + g_free(vdev->regions); > > + g_free(vdev->vbasedev.name); > > + vfio_put_base_device(&vdev->vbasedev); > > +} > > + > > +static void vfio_platform_realize(DeviceState *dev, Error **errp) > > +{ > > + VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(dev); > > + SysBusDevice *sbdev = SYS_BUS_DEVICE(dev); > > + VFIODevice *vbasedev = &vdev->vbasedev; > > + int i, ret; > > + > > + vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM; > > + vbasedev->ops = &vfio_platform_ops; > > + > > + DPRINTF("vfio device %s, compat = %s\n", vbasedev->name, vdev->compat); > > + > > + ret = vfio_base_device_init(vbasedev); > > + if (ret) { > > + return; > > + } > > + > > + for (i = 0; i < vbasedev->num_regions; i++) { > > + 
vfio_map_region(vdev, i); > > + sysbus_init_mmio(sbdev, &vdev->regions[i]->mem); > > + } > > +} > > + > > +static const VMStateDescription vfio_platform_vmstate = { > > + .name = TYPE_VFIO_PLATFORM, > > + .unmigratable = 1, > > +}; > > + > > +static Property vfio_platform_dev_properties[] = { > > + DEFINE_PROP_STRING("vfio_device", VFIOPlatformDevice, vbasedev.name), > > + DEFINE_PROP_STRING("compat", VFIOPlatformDevice, compat), > > + DEFINE_PROP_UINT32("mmap-timeout-ms", VFIOPlatformDevice, > > + mmap_timeout, 1100), > > + DEFINE_PROP_BOOL("irqfd", VFIOPlatformDevice, irqfd_allowed, true), > > + DEFINE_PROP_END_OF_LIST(), > > +}; > > + > > +static void vfio_platform_class_init(ObjectClass *klass, void *data) > > +{ > > + DeviceClass *dc = DEVICE_CLASS(klass); > > + > > + dc->realize = vfio_platform_realize; > > + dc->props = vfio_platform_dev_properties; > > + dc->vmsd = &vfio_platform_vmstate; > > + dc->desc = "VFIO-based platform device assignment"; > > + set_bit(DEVICE_CATEGORY_MISC, dc->categories); > > +} > > + > > +static const TypeInfo vfio_platform_dev_info = { > > + .name = TYPE_VFIO_PLATFORM, > > + .parent = TYPE_SYS_BUS_DEVICE, > > + .instance_size = sizeof(VFIOPlatformDevice), > > + .class_init = vfio_platform_class_init, > > + .class_size = sizeof(VFIOPlatformDeviceClass), > > This should be an abstract class. People must never instantiate a > generic "vfio-platform" device. Only "vfio-xgmac", "vfio-etsec", etc > devices should be exposed to the user. > > > Alex _______________________________________________ kvmarm mailing list kvmarm@xxxxxxxxxxxxxxxxxxxxx https://lists.cs.columbia.edu/mailman/listinfo/kvmarm