On Mon, 2012-08-13 at 17:18 -0500, Anthony Liguori wrote: > Alex Williamson <alex.williamson@xxxxxxxxxx> writes: > > +static int vfio_load_rom(VFIODevice *vdev) > > +{ > > + uint64_t size = vdev->rom_size; > > + const VMStateDescription *vmsd; > > + char name[32]; > > + off_t off = 0, voff = vdev->rom_offset; > > + ssize_t bytes; > > + void *ptr; > > + > > + /* If loading ROM from file, pci handles it */ > > + if (vdev->pdev.romfile || !vdev->pdev.rom_bar || !size) { > > + return 0; > > + } > > + > > + DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain, > > + vdev->host.bus, vdev->host.slot, vdev->host.function); > > + > > + vmsd = qdev_get_vmsd(DEVICE(&vdev->pdev)); > > + > > + if (vmsd) { > > + snprintf(name, sizeof(name), "%s.rom", vmsd->name); > > + } else { > > + snprintf(name, sizeof(name), "%s.rom", > > + object_get_typename(OBJECT(&vdev->pdev))); > > + } > > Not sure where this came from. You can just hard code this to vfio.rom > or better yet, use the pci-host address and do vfio[%s].rom. Ok, assume you mean vfio[%04x:%02x:%02x.%x].rom or should I be calling object_property_print() to get a %s? > > + memory_region_init_ram(&vdev->pdev.rom, name, size); > > + ptr = memory_region_get_ram_ptr(&vdev->pdev.rom); > > + memset(ptr, 0xff, size); > > + > > + while (size) { > > + bytes = pread(vdev->fd, ptr + off, size, voff + off); > > + if (bytes == 0) { > > + break; /* expect that we could get back less than the ROM BAR */ > > + } else if (bytes > 0) { > > + off += bytes; > > + size -= bytes; > > + } else { > > + if (errno == EINTR || errno == EAGAIN) { > > + continue; > > + } > > + error_report("vfio: Error reading device ROM: %s\n", > > + strerror(errno)); > > + memory_region_destroy(&vdev->pdev.rom); > > + return -1; > > + } > > + } > > + > > + pci_register_bar(&vdev->pdev, PCI_ROM_SLOT, 0, &vdev->pdev.rom); > > + vdev->pdev.has_rom = true; > > + return 0; > > +} > > + > > +static int vfio_connect_container(VFIOGroup *group) > > +{ > > + VFIOContainer *container; > > + int ret, fd; > > + > > + if (group->container) { > > + return 0; > > + } > > + > > + QLIST_FOREACH(container, &container_list, next) { > > + if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { > > + group->container = container; > > + QLIST_INSERT_HEAD(&container->group_list, group, container_next); > > + return 0; > > + } > > + } > > + > > + fd = qemu_open("/dev/vfio/vfio", O_RDWR); > > + if (fd < 0) { > > + error_report("vfio: failed to open /dev/vfio/vfio: %s\n", > > + strerror(errno)); > > + return -1; > > + } > > + > > + ret = ioctl(fd, VFIO_GET_API_VERSION); > > + if (ret != VFIO_API_VERSION) { > > + error_report("vfio: supported vfio version: %d, " > > + "reported version: %d\n", VFIO_API_VERSION, ret); > > + close(fd); > > + return -1; > > + } > > + > > + container = g_malloc0(sizeof(*container)); > > + container->fd = fd; > > + > > + if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) { > > + ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd); > > + if (ret) { > > + error_report("vfio: failed to set group container: %s\n", > > + strerror(errno)); > > + g_free(container); > > + close(fd); > > + return -1; > > + } > > + > > + ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU); > > + if (ret) { > > + error_report("vfio: failed to set iommu for container: %s\n", > > + strerror(errno)); > > + g_free(container); > > + close(fd); > > + return -1; > > + } > > + > > + container->iommu_data.listener = (MemoryListener) { > > + .begin = vfio_listener_dummy1, > > + .commit = vfio_listener_dummy1, > > + .region_add = vfio_listener_region_add, > > + .region_del = vfio_listener_region_del, > > + .region_nop = vfio_listener_dummy2, > > + .log_start = vfio_listener_dummy2, > > + .log_stop = vfio_listener_dummy2, > > + .log_sync = vfio_listener_dummy2, > > + .log_global_start = vfio_listener_dummy1, > > + .log_global_stop = vfio_listener_dummy1, > > + .eventfd_add = vfio_listener_dummy3, > > + .eventfd_del = vfio_listener_dummy3, > > + }; > > It would be nicer to move this out to a static structure. Ok > > + container->iommu_data.release = vfio_listener_release; > > + > > + memory_listener_register(&container->iommu_data.listener, > > + get_system_memory()); > > + } else { > > + error_report("vfio: No available IOMMU models\n"); > > + g_free(container); > > + close(fd); > > + return -1; > > + } > > + > > + QLIST_INIT(&container->group_list); > > + QLIST_INSERT_HEAD(&container_list, container, next); > > + > > + group->container = container; > > + QLIST_INSERT_HEAD(&container->group_list, group, container_next); > > + > > + return 0; > > +} > > + > > +static void vfio_disconnect_container(VFIOGroup *group) > > +{ > > + VFIOContainer *container = group->container; > > + > > + if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) { > > + error_report("vfio: error disconnecting group %d from container\n", > > + group->groupid); > > + } > > error_report isn't terminal. Not meant to be here, the kernel side will prevent the group from being re-used. I suppose there's the question of whether we want to continue to get further out of sync with the kernel by continuing the below disconnects though. Either way seems to be a funky state and I'm not sure I have a strong preference. It seems unnecessary to kill the guest though. > > + > > + QLIST_REMOVE(group, container_next); > > + group->container = NULL; > > + > > + if (QLIST_EMPTY(&container->group_list)) { > > + if (container->iommu_data.release) { > > + container->iommu_data.release(container); > > + } > > + QLIST_REMOVE(container, next); > > + DPRINTF("vfio_disconnect_container: close container->fd\n"); > > + close(container->fd); > > + g_free(container); > > + } > > +} > > + > > +static VFIOGroup *vfio_get_group(int groupid) > > +{ > > + VFIOGroup *group; > > + char path[32]; > > + struct vfio_group_status status = { .argsz = sizeof(status) }; > > + > > + QLIST_FOREACH(group, &group_list, next) { > > + if (group->groupid == groupid) { > > + return group; > > + } > > + } > > + > > + group = g_malloc0(sizeof(*group)); > > + > > + snprintf(path, sizeof(path), "/dev/vfio/%d", groupid); > > + group->fd = qemu_open(path, O_RDWR); > > + if (group->fd < 0) { > > + error_report("vfio: error opening %s: %s", path, strerror(errno)); > > + g_free(group); > > + return NULL; > > + } > > + > > + if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) { > > + error_report("vfio: error getting group status: %s\n", > > + strerror(errno)); > > + close(group->fd); > > + g_free(group); > > + return NULL; > > + } > > + > > + if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) { > > + error_report("vfio: error, group %d is not viable, please ensure " > > + "all devices within the iommu_group are bound to their " > > + "vfio bus driver.\n", groupid); > > + close(group->fd); > > + g_free(group); > > + return NULL; > > + } > > + > > + group->groupid = groupid; > > + QLIST_INIT(&group->device_list); > > + > > + if (vfio_connect_container(group)) { > > + error_report("vfio: failed to setup container for group %d\n", groupid); > > + close(group->fd); > > + g_free(group); > > + return NULL; > > + } > > + > > + QLIST_INSERT_HEAD(&group_list, group, next); > > + > > + return group; > > +} > > + > > +static void vfio_put_group(VFIOGroup *group) > > +{ > > + if (!QLIST_EMPTY(&group->device_list)) { > > + return; > > + } > > + > > + vfio_disconnect_container(group); > > + QLIST_REMOVE(group, next); > > + DPRINTF("vfio_put_group: close group->fd\n"); > > + close(group->fd); > > + g_free(group); > > +} > > + > > +static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev) > > +{ > > + struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) }; > > + struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) }; > > + int ret, i; > > + > > + ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name); > > + if (ret < 0) { > > + error_report("vfio: error getting device %s from group %d: %s", > > + name, group->groupid, strerror(errno)); > > + error_report("Verify all devices in group %d " > > + "are bound to vfio-pci or pci-stub and not already in use", > > + group->groupid); > > + return ret; > > + } > > + > > + vdev->fd = ret; > > + vdev->group = group; > > + QLIST_INSERT_HEAD(&group->device_list, vdev, next); > > + > > + /* Sanity check device */ > > + ret = ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &dev_info); > > + if (ret) { > > + error_report("vfio: error getting device info: %s", strerror(errno)); > > + goto error; > > + } > > + > > + DPRINTF("Device %s flags: %u, regions: %u, irgs: %u\n", name, > > + dev_info.flags, dev_info.num_regions, dev_info.num_irqs); > > + > > + if (!(dev_info.flags & VFIO_DEVICE_FLAGS_PCI)) { > > + error_report("vfio: Um, this isn't a PCI device"); > > + goto error; > > + } > > + > > + vdev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET); > > + if (!vdev->reset_works) { > > + error_report("Warning, device %s does not support reset\n", name); > > + } > > + > > + if (dev_info.num_regions != VFIO_PCI_NUM_REGIONS) { > > + error_report("vfio: unexpected number of io regions %u", > > + dev_info.num_regions); > > + goto error; > > + } > > + > > + if (dev_info.num_irqs != VFIO_PCI_NUM_IRQS) { > > + error_report("vfio: unexpected number of irqs %u", dev_info.num_irqs); > > + goto error; > > + } > > + > > + for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) { > > + reg_info.index = i; > > + > > + ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, ®_info); > > + if (ret) { > > + error_report("vfio: Error getting region %d info: %s", i, > > + strerror(errno)); > > + goto error; > > + } > > + > > + DPRINTF("Device %s region %d:\n", name, i); > > + DPRINTF(" size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n", > > + (unsigned long)reg_info.size, (unsigned long)reg_info.offset, > > + (unsigned long)reg_info.flags); > > + > > + vdev->bars[i].flags = reg_info.flags; > > + vdev->bars[i].size = reg_info.size; > > + vdev->bars[i].fd_offset = reg_info.offset; > > + vdev->bars[i].fd = vdev->fd; > > + vdev->bars[i].nr = i; > > + } > > + > > + reg_info.index = VFIO_PCI_ROM_REGION_INDEX; > > + > > + ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, ®_info); > > + if (ret) { > > + error_report("vfio: Error getting ROM info: %s", strerror(errno)); > > + goto error; > > + } > > + > > + DPRINTF("Device %s ROM:\n", name); > > + DPRINTF(" size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n", > > + (unsigned long)reg_info.size, (unsigned long)reg_info.offset, > > + (unsigned long)reg_info.flags); > > + > > + vdev->rom_size = reg_info.size; > > + vdev->rom_offset = reg_info.offset; > > + > > + reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX; > > + > > + ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, ®_info); > > + if (ret) { > > + error_report("vfio: Error getting config info: %s", strerror(errno)); > > + goto error; > > + } > > + > > + DPRINTF("Device %s config:\n", name); > > + DPRINTF(" size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n", > > + (unsigned long)reg_info.size, (unsigned long)reg_info.offset, > > + (unsigned long)reg_info.flags); > > + > > + vdev->config_size = reg_info.size; > > + vdev->config_offset = reg_info.offset; > > + > > +error: > > + if (ret) { > > + QLIST_REMOVE(vdev, next); > > + vdev->group = NULL; > > + close(vdev->fd); > > + } > > + return ret; > > +} > > + > > +static void vfio_put_device(VFIODevice *vdev) > > +{ > > + QLIST_REMOVE(vdev, next); > > + vdev->group = NULL; > > + DPRINTF("vfio_put_device: close vdev->fd\n"); > > + close(vdev->fd); > > + if (vdev->msix) { > > + g_free(vdev->msix); > > + vdev->msix = NULL; > > + } > > +} > > + > > +static int vfio_initfn(struct PCIDevice *pdev) > > Why struct PCIDevice? Must have pasted it from somewhere, fixed. > > +{ > > + VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, pdev, pdev); > > PCI_DEVICE() I'm not sure what you're after here, are you perhaps mistaking pdev for qdev? > > + VFIOGroup *group; > > + char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name; > > + ssize_t len; > > + struct stat st; > > + int groupid; > > + int ret; > > + > > + /* Check that the host device exists */ > > + snprintf(path, sizeof(path), > > + "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/", > > + vdev->host.domain, vdev->host.bus, vdev->host.slot, > > + vdev->host.function); > > + if (stat(path, &st) < 0) { > > + error_report("vfio: error: no such host device: %s", path); > > + return -1; > > + } > > + > > + strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1); > > + > > + len = readlink(path, iommu_group_path, PATH_MAX); > > + if (len <= 0) { > > + error_report("vfio: error no iommu_group for device\n"); > > + return -1; > > + } > > + > > + iommu_group_path[len] = 0; > > + group_name = basename(iommu_group_path); > > + > > + if (sscanf(group_name, "%d", &groupid) != 1) { > > + error_report("vfio: error reading %s: %s", path, strerror(errno)); > > + return -1; > > + } > > + > > + DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__, vdev->host.domain, > > + vdev->host.bus, vdev->host.slot, vdev->host.function, groupid); > > + > > + group = vfio_get_group(groupid); > > + if (!group) { > > + error_report("vfio: failed to get group %d", groupid); > > + return -1; > > + } > > + > > + snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x", > > + vdev->host.domain, vdev->host.bus, vdev->host.slot, > > + vdev->host.function); > > + > > + QLIST_FOREACH(pvdev, &group->device_list, next) { > > + if (pvdev->host.domain == vdev->host.domain && > > + pvdev->host.bus == vdev->host.bus && > > + pvdev->host.slot == vdev->host.slot && > > + pvdev->host.function == vdev->host.function) { > > + > > + error_report("vfio: error: device %s is already attached\n", path); > > + vfio_put_group(group); > > + return -1; > > + } > > + } > > + > > + ret = vfio_get_device(group, path, vdev); > > + if (ret) { > > + error_report("vfio: failed to get device %s", path); > > + vfio_put_group(group); > > + return -1; > > + } > > + > > + /* Get a copy of config space */ > > + assert(pci_config_size(&vdev->pdev) <= vdev->config_size); > > + ret = pread(vdev->fd, vdev->pdev.config, > > + pci_config_size(&vdev->pdev), vdev->config_offset); > > + if (ret < (int)pci_config_size(&vdev->pdev)) { > > + error_report("vfio: Failed to read device config space\n"); > > + goto out_put; > > + } > > + > > + /* > > + * Clear host resource mapping info. If we choose not to register a > > + * BAR, such as might be the case with the option ROM, we can get > > + * confusing, unwritable, residual addresses from the host here. > > + */ > > + memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24); > > + memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4); > > + > > + vfio_load_rom(vdev); > > + > > + if (vfio_early_setup_msix(vdev)) { > > + goto out_put; > > + } > > + > > + vfio_map_bars(vdev); > > + > > + if (vfio_add_capabilities(vdev)) { > > + goto out_teardown; > > + } > > + > > + if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) { > > + pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq); > > + } > > + > > + if (vfio_enable_intx(vdev)) { > > + goto out_teardown; > > + } > > + > > + return 0; > > + > > +out_teardown: > > + pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); > > + vfio_teardown_msi(vdev); > > + vfio_unmap_bars(vdev); > > +out_put: > > + vfio_put_device(vdev); > > + vfio_put_group(group); > > + return -1; > > +} > > + > > +static void vfio_exitfn(struct PCIDevice *pdev) > > +{ > > + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); > > + VFIOGroup *group = vdev->group; > > + > > + pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); > > + vfio_disable_interrupts(vdev); > > + vfio_teardown_msi(vdev); > > + vfio_unmap_bars(vdev); > > + vfio_put_device(vdev); > > + vfio_put_group(group); > > You should move this all to the destructor (instance_finalize). Hmm, it looks like if I do that then pci will free my interrupt and config space out from under me before I get any notice we're killing the device. Being a pci device, I think I'm tied to PCIDeviceClass.exit function, right? > > +} > > + > > +static void vfio_reset(DeviceState *dev) > > +{ > > + PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev); > > + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); > > + > > + if (!vdev->reset_works) { > > + return; > > + } > > + > > + if (ioctl(vdev->fd, VFIO_DEVICE_RESET)) { > > + error_report("vfio: Error unable to reset physical device " > > + "(%04x:%02x:%02x.%x): %s\n", vdev->host.domain, > > + vdev->host.bus, vdev->host.slot, vdev->host.function, > > + strerror(errno)); > > %m is thread safe, strerror isn't. Neat. Fixed throughout. > > + } > > +} > > + > > +static Property vfio_pci_dev_properties[] = { > > + DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIODevice, host), > > + /* > > + * TODO - support passed fds... is this necessary? > > + * DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name), > > + * DEFINE_PROP_STRING("vfiogroupfd, VFIODevice, vfiogroupfd_name), > > + */ > > + DEFINE_PROP_END_OF_LIST(), > > +}; > > + > > + > > +static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) > > +{ > > + PCIDeviceClass *dc = PCI_DEVICE_CLASS(klass); > > + > > + dc->parent_class.reset = vfio_reset; > > This is definitely not right. You want to do: > > PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass); > DeviceClass *dc = DEVICE_CLASS(klass); > > dc->reset = vfio_reset; > > > + dc->init = vfio_initfn; > > + dc->exit = vfio_exitfn; > > + dc->config_read = vfio_pci_read_config; > > + dc->config_write = vfio_pci_write_config; > > + dc->parent_class.props = vfio_pci_dev_properties; > > dc->props = vfio_pci_dev_properties; Ok, fixed. > > +} > > + > > +static TypeInfo vfio_pci_dev_info = { > > + .name = "vfio-pci", > > + .parent = TYPE_PCI_DEVICE, > > + .instance_size = sizeof(VFIODevice), > > + .class_init = vfio_pci_dev_class_init, > > +}; > > + > > +static void register_vfio_pci_dev_type(void) > > +{ > > + type_register_static(&vfio_pci_dev_info); > > +} > > + > > +type_init(register_vfio_pci_dev_type) > > diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h > > new file mode 100644 > > index 0000000..0a71bce > > --- /dev/null > > +++ b/hw/vfio_pci.h > > @@ -0,0 +1,101 @@ > > copyright/license. > > > +#ifndef HW_VFIO_PCI_H > > +#define HW_VFIO_PCI_H > > + > > +#include "qemu-common.h" > > +#include "qemu-queue.h" > > +#include "pci.h" > > +#include "event_notifier.h" > > This is all private to vfio.c, right? Perhaps call it vfio_pci_int.h Yes, it is private. Ok, renamed. Thanks for the review, Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html