Just some comments, didn't look at all details. On 2012-08-02 21:17, Alex Williamson wrote: > + > +static int vfio_msix_vector_use(PCIDevice *pdev, > + unsigned int vector, MSIMessage msg) > +{ > + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); > + int ret, fd; > + > + DPRINTF("%s(%04x:%02x:%02x.%x) vector %d used\n", __func__, > + vdev->host.domain, vdev->host.bus, vdev->host.slot, > + vdev->host.function, vector); > + > + if (vdev->interrupt != INT_MSIX) { > + vfio_disable_interrupts(vdev); > + } > + > + if (!vdev->msi_vectors) { > + vdev->msi_vectors = g_malloc0(vdev->msix->entries * sizeof(MSIVector)); > + } > + > + vdev->msi_vectors[vector].vdev = vdev; > + vdev->msi_vectors[vector].vector = vector; > + vdev->msi_vectors[vector].use = true; > + > + msix_vector_use(pdev, vector); > + > + if (event_notifier_init(&vdev->msi_vectors[vector].interrupt, 0)) { > + error_report("vfio: Error: event_notifier_init failed\n"); > + } > + > + fd = event_notifier_get_fd(&vdev->msi_vectors[vector].interrupt); > + > + /* > + * Attempt to enable route through KVM irqchip, > + * default to userspace handling if unavailable. > + */ > + vdev->msi_vectors[vector].virq = kvm_irqchip_add_msi_route(kvm_state, msg); > + if (vdev->msi_vectors[vector].virq < 0 || > + kvm_irqchip_add_irqfd(kvm_state, fd, > + vdev->msi_vectors[vector].virq) < 0) { If kvm_irqchip_add_irqfd fails, you have to drop the route and set virq to -1. Otherwise, you won't match with the release logic below. > + qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, > + &vdev->msi_vectors[vector]); > + } > + > + /* > + * We don't want to have the host allocate all possible MSI vectors > + * for a device if they're not in use, so we shutdown and incrementally > + * increase them as needed. > + */ > + if (vdev->nr_vectors < vector + 1) { > + int i; > + > + vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX); > + vdev->nr_vectors = vector + 1; > + ret = vfio_enable_vectors(vdev, true); > + if (ret) { > + error_report("vfio: failed to enable vectors, %d\n", ret); > + } > + > + /* We don't know if we've missed interrupts in the interim... */ > + for (i = 0; i < vdev->msix->entries; i++) { > + if (vdev->msi_vectors[i].use) { > + msix_notify(&vdev->pdev, i); > + } > + } And it wasn't possible to provide an interface with VFIO that allows vector addition/removal on the fly? KVM has an aweful one in this regard, but that is legacy, VFIO is new. The above logic is a bit ugly IMHO. > + } else { > + struct vfio_irq_set_fd irq_set_fd = { > + .irq_set = { > + .argsz = sizeof(irq_set_fd), > + .flags = VFIO_IRQ_SET_DATA_EVENTFD | > + VFIO_IRQ_SET_ACTION_TRIGGER, > + .index = VFIO_PCI_MSIX_IRQ_INDEX, > + .start = vector, > + .count = 1, > + }, > + .fd = fd, > + }; > + ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set_fd); > + if (ret) { > + error_report("vfio: failed to modify vector, %d\n", ret); > + } > + msix_notify(&vdev->pdev, vector); That injection should no longer be needed once we bounce and record in the PBA, right? Maybe add a comment for now. > + } > + > + return 0; > +} > + > +static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int vector) > +{ > + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); > + struct vfio_irq_set_fd irq_set_fd = { > + .irq_set = { > + .argsz = sizeof(irq_set_fd), > + .flags = VFIO_IRQ_SET_DATA_EVENTFD | > + VFIO_IRQ_SET_ACTION_TRIGGER, > + .index = VFIO_PCI_MSIX_IRQ_INDEX, > + .start = vector, > + .count = 1, > + }, > + .fd = -1, > + }; > + int fd; > + > + DPRINTF("%s(%04x:%02x:%02x.%x) vector %d released\n", __func__, > + vdev->host.domain, vdev->host.bus, vdev->host.slot, > + vdev->host.function, vector); > + > + /* > + * XXX What's the right thing to do here? This turns off the interrupt > + * completely, but do we really just want to switch the interrupt to > + * bouncing through userspace and let msix.c drop it? Not sure. > + */ > + msix_vector_unuse(pdev, vector); > + ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set_fd); > + > + fd = event_notifier_get_fd(&vdev->msi_vectors[vector].interrupt); > + > + if (vdev->msi_vectors[vector].virq < 0) { > + qemu_set_fd_handler(fd, NULL, NULL, NULL); > + } else { > + kvm_irqchip_remove_irqfd(kvm_state, fd, vdev->msi_vectors[vector].virq); > + kvm_irqchip_release_virq(kvm_state, vdev->msi_vectors[vector].virq); > + vdev->msi_vectors[vector].virq = -1; > + } > + > + event_notifier_cleanup(&vdev->msi_vectors[vector].interrupt); > + vdev->msi_vectors[vector].use = false; > +} > + > +/* XXX This should move to msi.c */ Nope. We rather need notifier support for MSI. I only have an outdated patch at hand. > +static MSIMessage msi_get_msg(PCIDevice *pdev, unsigned int vector) > +{ > + uint16_t flags = pci_get_word(pdev->config + pdev->msi_cap + PCI_MSI_FLAGS); > + bool msi64bit = flags & PCI_MSI_FLAGS_64BIT; > + MSIMessage msg; > + > + if (msi64bit) { > + msg.address = pci_get_quad(pdev->config + > + pdev->msi_cap + PCI_MSI_ADDRESS_LO); > + } else { > + msg.address = pci_get_long(pdev->config + > + pdev->msi_cap + PCI_MSI_ADDRESS_LO); > + } > + > + msg.data = pci_get_word(pdev->config + pdev->msi_cap + > + (msi64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32)); > + msg.data += vector; > + > + return msg; > +} > + > +/* So should this */ > +static void msi_set_qsize(PCIDevice *pdev, uint8_t size) > +{ > + uint8_t *config = pdev->config + pdev->msi_cap; > + uint16_t flags; > + > + flags = pci_get_word(config + PCI_MSI_FLAGS); > + flags = le16_to_cpu(flags); > + flags &= ~PCI_MSI_FLAGS_QSIZE; > + flags |= (size & 0x7) << 4; > + flags = cpu_to_le16(flags); > + pci_set_word(config + PCI_MSI_FLAGS, flags); Hmm... > +} > + > +static void vfio_enable_msi(VFIODevice *vdev) > +{ > + int ret, i; > + > + vfio_disable_interrupts(vdev); > + > + vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev); > +retry: > + vdev->msi_vectors = g_malloc0(vdev->nr_vectors * sizeof(MSIVector)); > + > + for (i = 0; i < vdev->nr_vectors; i++) { > + MSIMessage msg; > + int fd; > + > + vdev->msi_vectors[i].vdev = vdev; > + vdev->msi_vectors[i].vector = i; > + vdev->msi_vectors[i].use = true; > + > + if (event_notifier_init(&vdev->msi_vectors[i].interrupt, 0)) { > + error_report("vfio: Error: event_notifier_init failed\n"); > + } > + > + fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt); > + > + msg = msi_get_msg(&vdev->pdev, i); > + > + /* > + * Attempt to enable route through KVM irqchip, > + * default to userspace handling if unavailable. > + */ > + vdev->msi_vectors[i].virq = kvm_irqchip_add_msi_route(kvm_state, msg); > + if (vdev->msi_vectors[i].virq < 0 || > + kvm_irqchip_add_irqfd(kvm_state, fd, > + vdev->msi_vectors[i].virq) < 0) { > + qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, > + &vdev->msi_vectors[i]); > + } > + } > + > + ret = vfio_enable_vectors(vdev, false); > + if (ret) { > + if (ret < 0) { > + error_report("vfio: Error: Failed to setup MSI fds: %s\n", > + strerror(errno)); > + } else if (ret != vdev->nr_vectors) { > + error_report("vfio: Error: Failed to enable %d " > + "MSI vectors, retry with %d\n", vdev->nr_vectors, ret); > + } > + > + for (i = 0; i < vdev->nr_vectors; i++) { > + int fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt); > + if (vdev->msi_vectors[i].virq >= 0) { > + kvm_irqchip_remove_irqfd(kvm_state, fd, > + vdev->msi_vectors[i].virq); > + kvm_irqchip_release_virq(kvm_state, vdev->msi_vectors[i].virq); > + vdev->msi_vectors[i].virq = -1; > + } else { > + qemu_set_fd_handler(fd, NULL, NULL, NULL); > + } > + event_notifier_cleanup(&vdev->msi_vectors[i].interrupt); > + } > + > + g_free(vdev->msi_vectors); > + > + if (ret > 0 && ret != vdev->nr_vectors) { > + vdev->nr_vectors = ret; > + goto retry; > + } > + vdev->nr_vectors = 0; > + > + return; > + } > + > + msi_set_qsize(&vdev->pdev, vdev->nr_vectors); Hmmm... Can we really patch qsize? While the guest is already running and possibly evaluated this field before? IOW: Does the spec allow it to be changed by the device and, if yes, in which states? > + > + DPRINTF("%s(%04x:%02x:%02x.%x) Enabled %d MSI vectors\n", __func__, > + vdev->host.domain, vdev->host.bus, vdev->host.slot, > + vdev->host.function, vdev->nr_vectors); > +} > + ... > + > +static int vfio_setup_msi(VFIODevice *vdev, int pos) > +{ > + uint16_t ctrl; > + bool msi_64bit, msi_maskbit; > + int ret, entries; > + > + if (!msi_supported) { Not critical, but I would prefer to keep this variable in the context of the MSI core. msi_init will return ENOTSUP, and you could handle that gracefully. > + return 0; > + } > + > + if (pread(vdev->fd, &ctrl, sizeof(ctrl), > + vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) { > + return -1; > + } > + ctrl = le16_to_cpu(ctrl); > + > + msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT); > + msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT); > + entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1); > + > + DPRINTF("%04x:%02x:%02x.%x PCI MSI CAP @0x%x\n", vdev->host.domain, > + vdev->host.bus, vdev->host.slot, vdev->host.function, pos); > + > + ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit); > + if (ret < 0) { > + error_report("vfio: msi_init failed\n"); > + return ret; > + } > + vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0); > + > + return 0; > +} > + > +/* > + * We don't have any control over how pci_add_capability() inserts > + * capabilities into the chain. What control is missing precisely? Can pci_add_capability be improved to simplify the early setup? I don't see it (msix_init requires the parameters), but the comment suggests this somehow. > In order to setup MSI-X we need a > + * MemoryRegion for the BAR. In order to setup the BAR and not > + * attempt to mmap the MSI-X table area, which VFIO won't allow, we > + * need to first look for where the MSI-X table lives. So we > + * unfortunately split MSI-X setup across two functions. > + */ > +static int vfio_early_setup_msix(VFIODevice *vdev) > +{ > + uint8_t pos; > + uint16_t ctrl; > + uint32_t table, pba; > + > + pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX); > + if (!pos) { > + return 0; > + } > + > + if (pread(vdev->fd, &ctrl, sizeof(ctrl), > + vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) { > + return -1; > + } > + > + if (pread(vdev->fd, &table, sizeof(table), > + vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) { > + return -1; > + } > + > + if (pread(vdev->fd, &pba, sizeof(pba), > + vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) { > + return -1; > + } > + > + ctrl = le16_to_cpu(ctrl); > + table = le32_to_cpu(table); > + pba = le32_to_cpu(pba); > + > + vdev->msix = g_malloc0(sizeof(*(vdev->msix))); > + vdev->msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK; > + vdev->msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK; > + vdev->msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK; > + vdev->msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK; > + vdev->msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1; > + > + DPRINTF("%04x:%02x:%02x.%x " > + "PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d\n", > + vdev->host.domain, vdev->host.bus, vdev->host.slot, > + vdev->host.function, pos, vdev->msix->table_bar, > + vdev->msix->table_offset, vdev->msix->entries); > + > + return 0; > +} > + > +static int vfio_setup_msix(VFIODevice *vdev, int pos) > +{ > + int ret; > + > + if (!msi_supported) { See above. > + return 0; > + } > + > + ret = msix_init(&vdev->pdev, vdev->msix->entries, > + &vdev->bars[vdev->msix->table_bar].mem, > + vdev->msix->table_bar, vdev->msix->table_offset, > + &vdev->bars[vdev->msix->pba_bar].mem, > + vdev->msix->pba_bar, vdev->msix->pba_offset, pos); > + if (ret < 0) { > + error_report("vfio: msix_init failed\n"); > + return ret; > + } > + > + ret = msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use, > + vfio_msix_vector_release); > + if (ret) { > + error_report("vfio: msix_set_vector_notifiers failed %d\n", ret); > + msix_uninit(&vdev->pdev, &vdev->bars[vdev->msix->table_bar].mem, > + &vdev->bars[vdev->msix->pba_bar].mem); > + return ret; > + } > + > + return 0; > +} > + > +static void vfio_teardown_msi(VFIODevice *vdev) > +{ > + msi_uninit(&vdev->pdev); > + > + if (vdev->msix) { > + /* FIXME: Why can't unset just silently do nothing?? */ Yep, that would be better. > + if (vdev->pdev.msix_vector_use_notifier && > + vdev->pdev.msix_vector_release_notifier) { > + msix_unset_vector_notifiers(&vdev->pdev); > + } > + > + msix_uninit(&vdev->pdev, &vdev->bars[vdev->msix->table_bar].mem, > + &vdev->bars[vdev->msix->pba_bar].mem); > + } > +} > + > +/* > + * Resource setup > + */ > +static void vfio_unmap_bar(VFIODevice *vdev, int nr) > +{ > + VFIOBAR *bar = &vdev->bars[nr]; > + > + if (!bar->size) { > + return; > + } > + > + memory_region_del_subregion(&bar->mem, &bar->mmap_mem); > + munmap(bar->mmap, memory_region_size(&bar->mmap_mem)); > + > + if (vdev->msix && vdev->msix->table_bar == nr) { > + memory_region_del_subregion(&bar->mem, &vdev->msix->mmap_mem); > + munmap(vdev->msix->mmap, memory_region_size(&vdev->msix->mmap_mem)); > + } > + > + memory_region_destroy(&bar->mem); > +} > + > +static int vfio_mmap_bar(VFIOBAR *bar, MemoryRegion *mem, MemoryRegion *submem, > + void **map, size_t size, off_t offset, > + const char *name) > +{ > + int ret = 0; > + > + if (size && bar->flags & VFIO_REGION_INFO_FLAG_MMAP) { > + int prot = 0; > + > + if (bar->flags & VFIO_REGION_INFO_FLAG_READ) { > + prot |= PROT_READ; > + } > + > + if (bar->flags & VFIO_REGION_INFO_FLAG_WRITE) { > + prot |= PROT_WRITE; > + } > + > + *map = mmap(NULL, size, prot, MAP_SHARED, > + bar->fd, bar->fd_offset + offset); > + if (*map == MAP_FAILED) { > + *map = NULL; > + ret = -errno; > + goto empty_region; > + } > + > + memory_region_init_ram_ptr(submem, name, size, *map); > + } else { > +empty_region: > + /* Create a zero sized sub-region to make cleanup easy. */ > + memory_region_init(submem, name, 0); > + } > + > + memory_region_add_subregion(mem, offset, submem); > + > + return ret; > +} > + > +static void vfio_map_bar(VFIODevice *vdev, int nr) > +{ > + VFIOBAR *bar = &vdev->bars[nr]; > + unsigned size = bar->size; > + char name[64]; > + uint32_t pci_bar; > + uint8_t type; > + int ret; > + > + /* Skip both unimplemented BARs and the upper half of 64bit BARS. */ > + if (!size) { > + return; > + } > + > + snprintf(name, sizeof(name), "VFIO %04x:%02x:%02x.%x BAR %d", > + vdev->host.domain, vdev->host.bus, vdev->host.slot, > + vdev->host.function, nr); > + > + /* Determine what type of BAR this is for registration */ > + ret = pread(vdev->fd, &pci_bar, sizeof(pci_bar), > + vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr)); > + if (ret != sizeof(pci_bar)) { > + error_report("vfio: Failed to read BAR %d (%s)\n", nr, strerror(errno)); > + return; > + } > + > + pci_bar = le32_to_cpu(pci_bar); > + type = pci_bar & (pci_bar & PCI_BASE_ADDRESS_SPACE_IO ? > + ~PCI_BASE_ADDRESS_IO_MASK : ~PCI_BASE_ADDRESS_MEM_MASK); > + > + /* A "slow" read/write mapping underlies all BARs */ > + memory_region_init_io(&bar->mem, &vfio_bar_ops, bar, name, size); > + pci_register_bar(&vdev->pdev, nr, type, &bar->mem); > + > + /* > + * We can't mmap areas overlapping the MSIX vector table, so we > + * potentially insert a direct-mapped subregion before and after it. > + */ > + if (vdev->msix && vdev->msix->table_bar == nr) { > + size = vdev->msix->table_offset & TARGET_PAGE_MASK; > + } > + > + strncat(name, " mmap", sizeof(name) - strlen(name) - 1); This could generate an unterminated name if we actually have to cut the appended string. You could set name[sizeof(name)-1] = 0. > + if (vfio_mmap_bar(bar, &bar->mem, > + &bar->mmap_mem, &bar->mmap, size, 0, name)) { > + error_report("%s unsupported. Performance may be slow\n", name); > + } > + > + if (vdev->msix && vdev->msix->table_bar == nr) { > + unsigned start; > + > + start = TARGET_PAGE_ALIGN(vdev->msix->table_offset + > + (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE)); > + > + size = start < bar->size ? bar->size - start : 0; > + strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1); Same here. > + /* MSIXInfo contains another MemoryRegion for this mapping */ > + if (vfio_mmap_bar(bar, &bar->mem, &vdev->msix->mmap_mem, > + &vdev->msix->mmap, size, start, name)) { > + error_report("%s unsupported. Performance may be slow\n", name); > + } > + } > + > + return; Unneeded return. > +} > + ... > + > +static int vfio_initfn(struct PCIDevice *pdev) > +{ > + VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, pdev, pdev); > + VFIOGroup *group; > + char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name; > + ssize_t len; > + struct stat st; > + int groupid; > + int ret; > + > + /* Check that the host device exists */ > + snprintf(path, sizeof(path), > + "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/", > + vdev->host.domain, vdev->host.bus, vdev->host.slot, > + vdev->host.function); > + if (stat(path, &st) < 0) { > + error_report("vfio: error: no such host device: %s", path); > + return -1; > + } > + > + strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1); See above for the termination problem. > + > + len = readlink(path, iommu_group_path, PATH_MAX); > + if (len <= 0) { > + error_report("vfio: error no iommu_group for device\n"); > + return -1; > + } > + > + iommu_group_path[len] = 0; > + group_name = basename(iommu_group_path); > + > + if (sscanf(group_name, "%d", &groupid) != 1) { > + error_report("vfio: error reading %s: %s", path, strerror(errno)); > + return -1; > + } > + > + DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__, vdev->host.domain, > + vdev->host.bus, vdev->host.slot, vdev->host.function, groupid); > + > + group = vfio_get_group(groupid); > + if (!group) { > + error_report("vfio: failed to get group %d", groupid); > + return -1; > + } > + > + snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x", > + vdev->host.domain, vdev->host.bus, vdev->host.slot, > + vdev->host.function); > + > + QLIST_FOREACH(pvdev, &group->device_list, next) { > + if (pvdev->host.domain == vdev->host.domain && > + pvdev->host.bus == vdev->host.bus && > + pvdev->host.slot == vdev->host.slot && > + pvdev->host.function == vdev->host.function) { > + > + error_report("vfio: error: device %s is already attached\n", path); > + vfio_put_group(group); > + return -1; > + } > + } > + > + ret = vfio_get_device(group, path, vdev); > + if (ret) { > + error_report("vfio: failed to get device %s", path); > + vfio_put_group(group); > + return -1; > + } > + > + /* Get a copy of config space */ > + assert(pci_config_size(&vdev->pdev) <= vdev->config_size); > + ret = pread(vdev->fd, vdev->pdev.config, > + pci_config_size(&vdev->pdev), vdev->config_offset); > + if (ret < (int)pci_config_size(&vdev->pdev)) { > + error_report("vfio: Failed to read device config space\n"); > + goto out_put; > + } > + > + /* > + * Clear host resource mapping info. If we choose not to register a > + * BAR, such as might be the case with the option ROM, we can get > + * confusing, unwritable, residual addresses from the host here. > + */ > + memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24); > + memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4); > + > + vfio_load_rom(vdev); > + > + if (vfio_early_setup_msix(vdev)) { > + goto out_put; > + } > + > + vfio_map_bars(vdev); > + > + if (vfio_add_capabilities(vdev)) { > + goto out_teardown; > + } > + > + if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) { > + pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq); > + } > + > + if (vfio_enable_intx(vdev)) { Although vfio_enable_intx also check for PCI_INTERRUPT_PIN, I would move this under the test above - more consistent when reading the code. > + goto out_teardown; > + } > + > + return 0; > + > +out_teardown: > + pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); > + vfio_teardown_msi(vdev); > + vfio_unmap_bars(vdev); > +out_put: > + vfio_put_device(vdev); > + vfio_put_group(group); > + return -1; > +} > + > +static void vfio_exitfn(struct PCIDevice *pdev) > +{ > + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); > + VFIOGroup *group = vdev->group; > + > + pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); > + vfio_disable_interrupts(vdev); > + vfio_teardown_msi(vdev); > + vfio_unmap_bars(vdev); > + vfio_put_device(vdev); > + vfio_put_group(group); > +} > + > +static void vfio_reset(DeviceState *dev) > +{ > + PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev); > + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); > + > + if (!vdev->reset_works) { > + return; > + } > + > + if (ioctl(vdev->fd, VFIO_DEVICE_RESET)) { > + error_report("vfio: Error unable to reset physical device " > + "(%04x:%02x:%02x.%x): %s\n", vdev->host.domain, > + vdev->host.bus, vdev->host.slot, vdev->host.function, > + strerror(errno)); > + } > +} > + > +static Property vfio_pci_dev_properties[] = { > + DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIODevice, host), > + /* > + * TODO - support passed fds... is this necessary? > + * DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name), > + * DEFINE_PROP_STRING("vfiogroupfd, VFIODevice, vfiogroupfd_name), > + */ > + DEFINE_PROP_END_OF_LIST(), > +}; > + > + > +static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) > +{ > + PCIDeviceClass *dc = PCI_DEVICE_CLASS(klass); > + > + dc->parent_class.reset = vfio_reset; > + dc->init = vfio_initfn; > + dc->exit = vfio_exitfn; > + dc->config_read = vfio_pci_read_config; > + dc->config_write = vfio_pci_write_config; > + dc->parent_class.props = vfio_pci_dev_properties; > +} > + > +static TypeInfo vfio_pci_dev_info = { > + .name = "vfio-pci", > + .parent = TYPE_PCI_DEVICE, > + .instance_size = sizeof(VFIODevice), > + .class_init = vfio_pci_dev_class_init, > +}; > + > +static void register_vfio_pci_dev_type(void) > +{ > + type_register_static(&vfio_pci_dev_info); > +} > + > +type_init(register_vfio_pci_dev_type) > diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h > new file mode 100644 > index 0000000..0a71bce > --- /dev/null > +++ b/hw/vfio_pci.h > @@ -0,0 +1,101 @@ > +#ifndef HW_VFIO_PCI_H > +#define HW_VFIO_PCI_H > + > +#include "qemu-common.h" > +#include "qemu-queue.h" > +#include "pci.h" > +#include "event_notifier.h" > + > +typedef struct VFIOBAR { > + off_t fd_offset; /* offset of BAR within device fd */ > + int fd; /* device fd, allows us to pass VFIOBAR as opaque data */ > + MemoryRegion mem; /* slow, read/write access */ > + MemoryRegion mmap_mem; /* direct mapped access */ > + void *mmap; > + size_t size; > + uint32_t flags; /* VFIO region flags (rd/wr/mmap) */ > + uint8_t nr; /* cache the BAR number for debug */ > +} VFIOBAR; > + > +typedef struct INTx { > + bool pending; /* interrupt pending */ > + bool kvm_accel; /* set when QEMU bypass through KVM enabled */ > + uint8_t pin; /* which pin to pull for qemu_set_irq */ > + EventNotifier interrupt; /* eventfd triggered on interrupt */ > + EventNotifier unmask; /* eventfd for unmask on QEMU bypass */ > + PCIINTxRoute route; /* routing info for QEMU bypass */ > +} INTx; Please add a VFIO prefix. > + > +struct VFIODevice; > + > +typedef struct MSIVector { > + EventNotifier interrupt; /* eventfd triggered on interrupt */ > + struct VFIODevice *vdev; /* back pointer to device */ > + int vector; /* the vector number for this element */ Could also be calculated via vector - vector->vdev->msi_vectors. But I don't mind. > + int virq; /* KVM irqchip route for QEMU bypass */ > + bool use; > +} MSIVector; Also here. Just in case we ever decide to introduce a generic structure with this name. > + > +enum { > + INT_NONE = 0, > + INT_INTx = 1, > + INT_MSI = 2, > + INT_MSIX = 3, > +}; > + > +struct VFIOGroup; > + > +typedef struct VFIOContainer { > + int fd; /* /dev/vfio/vfio, empowered by the attached groups */ > + struct { > + /* enable abstraction to support various iommu backends */ > + union { > + MemoryListener listener; /* Used by type1 iommu */ > + }; > + void (*release)(struct VFIOContainer *); > + } iommu_data; > + QLIST_HEAD(, VFIOGroup) group_list; > + QLIST_ENTRY(VFIOContainer) next; > +} VFIOContainer; > + > +/* Cache of MSI-X setup plus extra mmap and memory region for split BAR map */ > +typedef struct MSIXInfo { > + uint8_t table_bar; > + uint8_t pba_bar; > + uint16_t entries; > + uint32_t table_offset; > + uint32_t pba_offset; > + MemoryRegion mmap_mem; > + void *mmap; > +} MSIXInfo; Also a pretty generic name. > + > +typedef struct VFIODevice { > + PCIDevice pdev; > + int fd; > + INTx intx; > + unsigned int config_size; > + off_t config_offset; /* Offset of config space region within device fd */ > + unsigned int rom_size; > + off_t rom_offset; /* Offset of ROM region within device fd */ > + int msi_cap_size; > + MSIVector *msi_vectors; > + MSIXInfo *msix; > + int nr_vectors; /* Number of MSI/MSIX vectors currently in use */ > + int interrupt; /* Current interrupt type */ > + VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */ > + PCIHostDeviceAddress host; > + QLIST_ENTRY(VFIODevice) next; > + struct VFIOGroup *group; > + bool reset_works; > +} VFIODevice; > + > +typedef struct VFIOGroup { > + int fd; > + int groupid; > + VFIOContainer *container; > + QLIST_HEAD(, VFIODevice) device_list; > + QLIST_ENTRY(VFIOGroup) next; > + QLIST_ENTRY(VFIOGroup) container_next; > +} VFIOGroup; > + > +#endif /* HW_VFIO_PCI_H */ > Why do all theses structs have to go into a header file? Will there be more users than vfio_pci.c? Jan -- Siemens AG, Corporate Technology, CT RTC ITP SDP-DE Corporate Competence Center Embedded Linux -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html