Add virtual MSI-X tables for PCI devices, and create IRQFD routes to let the kernel inject MSIs from a physical device directly into the guest. Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@xxxxxxx> --- include/kvm/vfio.h | 24 ++++ vfio.c | 366 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 369 insertions(+), 21 deletions(-) diff --git a/include/kvm/vfio.h b/include/kvm/vfio.h index 6d2666b0..68535963 100644 --- a/include/kvm/vfio.h +++ b/include/kvm/vfio.h @@ -8,8 +8,32 @@ #define MAX_VFIO_GROUPS 16 +struct vfio_pci_msix_entry { + struct msix_table config; + int gsi; + int eventfd; +}; + +struct vfio_pci_msix_table { + size_t nr_entries; + size_t size; + unsigned int bar; + u32 guest_phys_addr; + struct vfio_pci_msix_entry *entries; +}; + +struct vfio_pci_msix_pba { + size_t size; + off_t offset; /* in VFIO device fd */ + unsigned int bar; + u32 guest_phys_addr; +}; + struct vfio_pci_device { struct pci_device_header hdr; + + struct vfio_pci_msix_table msix_table; + struct vfio_pci_msix_pba msix_pba; }; struct vfio_region { diff --git a/vfio.c b/vfio.c index 0f5bc3dd..85d1ea8b 100644 --- a/vfio.c +++ b/vfio.c @@ -50,6 +50,70 @@ int vfio_group_parser(const struct option *opt, const char *arg, int unset) return 0; } +static void vfio_pci_msix_pba_access(struct kvm_cpu *vcpu, u64 addr, u8 *data, + u32 len, u8 is_write, void *ptr) +{ + struct vfio_pci_device *pdev = ptr; + struct vfio_pci_msix_pba *pba = &pdev->msix_pba; + u64 offset = addr - pba->guest_phys_addr; + struct vfio_device *device = container_of(pdev, struct vfio_device, pci); + + if (is_write) + return; + + if (pread(device->fd, data, len, pba->offset + offset) != len) + pr_err("cannot access MSIX PBA\n"); +} + +static void vfio_pci_msix_table_access(struct kvm_cpu *vcpu, u64 addr, u8 *data, + u32 len, u8 is_write, void *ptr) +{ + struct kvm *kvm = vcpu->kvm; + struct vfio_pci_device *pdev = ptr; + struct vfio_pci_msix_entry *entry; + struct vfio_pci_msix_table *table 
= &pdev->msix_table; + struct vfio_device *device = container_of(pdev, struct vfio_device, pci); + + u64 offset = addr - table->guest_phys_addr; + + size_t vector = offset / PCI_MSIX_ENTRY_SIZE; + /* PCI spec says that software must use aligned 4 or 8 bytes accesses */ + off_t field = offset % PCI_MSIX_ENTRY_SIZE; + entry = &table->entries[vector]; + + if (!is_write) { + memcpy(data, (void *)&entry->config + field, len); + return; + } + + memcpy((void *)&entry->config + field, data, len); + + if (field != PCI_MSIX_ENTRY_VECTOR_CTRL) + return; + + if (entry->gsi < 0) { + int ret = irq__add_msix_route(kvm, &entry->config.msg, + device->dev_hdr.dev_num << 3); + if (ret < 0) { + pr_err("cannot create MSI-X route"); + } else { + entry->gsi = ret; + + ret = irq__add_irqfd(kvm, ret, entry->eventfd, -1); + if (ret < 0) + pr_err("Cannot setup irqfd"); + } + + if (ret < 0) + /* Not much we can do here. Mask the vector. */ + entry->config.ctrl = 1; + + return; + } + + irq__update_msix_route(kvm, entry->gsi, &entry->config.msg); +} + static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr, u8 offset, void *data, int sz) { @@ -89,17 +153,94 @@ static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hd sz, offset); } +static ssize_t vfio_pci_cap_size(struct pci_cap_hdr *cap_hdr) +{ + switch (cap_hdr->type) { + case PCI_CAP_ID_MSIX: + return PCI_CAP_MSIX_SIZEOF; + default: + pr_err("unknown PCI capability %u", cap_hdr->type); + return 0; + } +} + +/* + * Copy capability from physical header into virtual header, and add it to the + * virtual capability list. 
+ * + * @fd_offset: offset of pci header into vfio device fd + * @pos: offset of capability from start of header + */ +static int vfio_pci_add_cap(struct vfio_device *device, struct pci_cap_hdr *cap_hdr, + off_t fd_offset, off_t pos) +{ + int i; + ssize_t size = vfio_pci_cap_size(cap_hdr); + struct pci_device_header *hdr = &device->pci.hdr; + struct pci_cap_hdr *out = (void *)hdr + pos; + + if (pread(device->fd, out, size, fd_offset + pos) != size) + return -errno; + + out->next = 0; + + if (!hdr->capabilities) { + hdr->capabilities = pos; + hdr->status |= PCI_STATUS_CAP_LIST; + } else { + /* Add cap at end of list */ + struct pci_cap_hdr *last; + + pci_for_each_cap(i, last, hdr) + ; + last->next = pos; + } + + return 0; +} + static int vfio_pci_parse_caps(struct vfio_device *device) { + u8 pos; + int ret; + struct pci_cap_hdr cap; + ssize_t sz = sizeof(cap); + struct vfio_region_info *info; struct vfio_pci_device *pdev = &device->pci; if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST)) return 0; + pos = pdev->hdr.capabilities & ~3; + info = &device->regions[VFIO_PCI_CONFIG_REGION_INDEX].info; + pdev->hdr.status &= ~PCI_STATUS_CAP_LIST; pdev->hdr.capabilities = 0; - /* TODO: install virtual capabilities */ + for (; pos; pos = cap.next) { + if (pos >= PCI_DEV_CFG_SIZE) { + pr_warning("Ignoring cap outside of config space"); + return -EINVAL; + } + + if (pread(device->fd, &cap, sz, info->offset + pos) != sz) { + pr_warning("Failed to read from capabilities pointer (0x%x)", + pos); + return -EINVAL; + } + + switch (cap.type) { + case PCI_CAP_ID_MSIX: + ret = vfio_pci_add_cap(device, &cap, info->offset, pos); + if (ret) { + pr_warning("Failed to read MSI-X capability structure"); + return ret; + } + break; + + /* Any other capability is hidden */ + } + } return 0; } @@ -150,7 +291,11 @@ static int vfio_pci_parse_cfg_space(struct vfio_device *device) static int vfio_pci_fixup_cfg_space(struct vfio_device *device) { int i; + int pos; ssize_t hdr_sz; + ssize_t cap_sz; + struct 
pci_cap_hdr *cap; + struct msix_cap *msix; struct vfio_region_info *info; struct vfio_pci_device *pdev = &device->pci; @@ -183,6 +328,22 @@ static int vfio_pci_fixup_cfg_space(struct vfio_device *device) */ pdev->hdr.exp_rom_bar = 0; + /* Plumb in our fake MSI-X capability, if we have it. */ + msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX); + if (msix) { + /* Add a shortcut to the PBA region for the MMIO handler */ + int pba_index = VFIO_PCI_BAR0_REGION_INDEX + pdev->msix_pba.bar; + pdev->msix_pba.offset = device->regions[pba_index].info.offset + + (msix->pba_offset & PCI_MSIX_PBA_OFFSET); + + /* Tidy up the capability */ + msix->table_offset &= PCI_MSIX_TABLE_BIR; + msix->pba_offset &= PCI_MSIX_PBA_BIR; + if (pdev->msix_table.bar == pdev->msix_pba.bar) + msix->pba_offset |= pdev->msix_table.size & + PCI_MSIX_PBA_OFFSET; + } + /* Install our fake Configuration Space, without the caps */ info = &device->regions[VFIO_PCI_CONFIG_REGION_INDEX].info; hdr_sz = offsetof(struct pci_device_header, msix); @@ -191,7 +352,17 @@ static int vfio_pci_fixup_cfg_space(struct vfio_device *device) return -EIO; } - /* TODO: install virtual capabilities */ + /* Install the fake capability list */ + pci_for_each_cap(pos, cap, &pdev->hdr) { + cap_sz = vfio_pci_cap_size(cap); + + if (pwrite(device->fd, cap, cap_sz, info->offset + pos) != + cap_sz) { + pr_err("Failed to write capability %u", cap->type); + return -EIO; + } + } + /* Register callbacks for cfg accesses */ pdev->hdr.cfg_ops = (struct pci_config_operations) { .read = vfio_pci_cfg_read, @@ -250,16 +421,97 @@ static int vfio_pci_map_bar(struct kvm *kvm, int fd, struct vfio_region *region) return 0; } +static int vfio_pci_create_msix_table(struct kvm *kvm, + struct vfio_pci_device *pdev, + struct msix_cap *msix) +{ + int ret; + size_t i; + size_t nr_entries; + size_t table_size; + struct vfio_pci_msix_pba *pba = &pdev->msix_pba; + struct vfio_pci_msix_table *table = &pdev->msix_table; + + table->bar = msix->table_offset & 
PCI_MSIX_TABLE_BIR; + pba->bar = msix->pba_offset & PCI_MSIX_TABLE_BIR; + + /* + * KVM needs memory regions to be a multiple of, and aligned on, PAGE_SIZE. + */ + nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1; + table_size = ALIGN(nr_entries * PCI_MSIX_ENTRY_SIZE, PAGE_SIZE); + + table->entries = calloc(nr_entries, sizeof(struct vfio_pci_msix_entry)); + if (!table->entries) + return -ENOMEM; + + for (i = 0; i < nr_entries; i++) + table->entries[i].config.ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT; + + table->nr_entries = nr_entries; + table->size = table_size; + + /* + * To ease MSI-X cap configuration in case they share the same BAR, + * collapse table and pending array. According to PCI, address spaces + * must be a power of two. Since nr_entries is a power of two, and PBA + * size is less than table_size, reserve 2*table_size. + */ + table->guest_phys_addr = pci_get_io_space_block(2 * table_size); + if (!table->guest_phys_addr) { + pr_err("cannot allocate IO space"); + ret = -ENOMEM; + goto out_free; + } + pba->guest_phys_addr = table->guest_phys_addr + table->size; + + ret = kvm__register_mmio(kvm, table->guest_phys_addr, table_size, false, + vfio_pci_msix_table_access, pdev); + if (ret < 0) + goto out_free; + + /* + * We could map the physical PBA directly into the guest, but it's + * likely smaller than a page, and we can only hand full pages to the + * guest. Even though the PCI spec disallows sharing a page used for + * MSI-X with any other resource, it allows sharing the same page + * between MSI-X table and PBA. For the sake of isolation, create a + * virtual PBA.
+ */ + pba->size = nr_entries / 8; + + ret = kvm__register_mmio(kvm, pba->guest_phys_addr, pba->size, false, + vfio_pci_msix_pba_access, pdev); + if (ret < 0) + goto out_free; + + return 0; + +out_free: + free(table->entries); + + return ret; +} + static int vfio_pci_configure_dev_regions(struct kvm *kvm, struct vfio_device *device) { int ret; + struct msix_cap *msix; + struct vfio_pci_device *pdev = &device->pci; u32 i, num_regions = device->info.num_regions; ret = vfio_pci_parse_cfg_space(device); if (ret) return ret; + msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX); + if (msix) { + ret = vfio_pci_create_msix_table(kvm, pdev, msix); + if (ret) + return ret; + } + /* First of all, map the BARs directly into the guest */ for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) { struct vfio_region *region; @@ -278,6 +530,16 @@ static int vfio_pci_configure_dev_regions(struct kvm *kvm, if (!region->info.size) continue; + if (msix) { + if (i == pdev->msix_table.bar) { + region->guest_phys_addr = pdev->msix_table.guest_phys_addr; + continue; + } else if (i == pdev->msix_pba.bar) { + region->guest_phys_addr = pdev->msix_pba.guest_phys_addr; + continue; + } + } + /* * Map the BARs into the guest. We'll later need to update * configuration space to reflect our allocation. @@ -314,6 +576,64 @@ static int vfio_configure_dev_regions(struct kvm *kvm, return vfio_pci_configure_dev_regions(kvm, device); } +static int vfio_pci_init_msix_irqfd(struct kvm *kvm, + struct vfio_device *device) +{ + int ret; + size_t i; + int *eventfds; + size_t irq_set_size; + struct vfio_irq_set *irq_set; + struct vfio_pci_msix_table *table = &device->pci.msix_table; + + /* + * We likely have VFIO_IRQ_INFO_NORESIZE for MSI-X, and we don't want to + * enable/disable MSIs every time the guest requests a new one. Setup + * IRQFD for all vectors upfront. + * + * We cannot start creating the MSI-X routes in KVM just now. 
First we + * need to wait for all devices to allocate their IRQ lines, and only + * after that number is frozen will we be able to allocate MSI numbers. + * A bit unfortunate (it would be much easier to handle initialization + * errors here), but okay. Store eventfd until we're ready to create the + * routes. + */ + irq_set_size = sizeof(struct vfio_irq_set) + + table->nr_entries * sizeof(int); + irq_set = malloc(irq_set_size); + if (!irq_set) + return -ENOMEM; + + *irq_set = (struct vfio_irq_set) { + .argsz = irq_set_size, + .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER, + .index = VFIO_PCI_MSIX_IRQ_INDEX, + .start = 0, + .count = table->nr_entries, + }; + eventfds = (void *)irq_set + sizeof(struct vfio_irq_set); + + for (i = 0; i < table->nr_entries; i++) { + eventfds[i] = eventfd(0, 0); + if (eventfds[i] < 0) { + pr_err("cannot create eventfd (try to increase RLIMIT_NOFILE)"); + ret = -errno; + goto out_free; + } + + table->entries[i].gsi = -1; + table->entries[i].eventfd = eventfds[i]; + } + + ret = ioctl(device->fd, VFIO_DEVICE_SET_IRQS, irq_set); + if (ret < 0) + pr_err("Cannot register vfio_irq_set"); + +out_free: + free(irq_set); + return ret; +} + static int vfio_init_irqfd(struct kvm *kvm, int devfd, int gsi) { int ret; @@ -393,31 +713,37 @@ static int vfio_configure_dev_irqs(struct kvm *kvm, struct vfio_device *device) { int ret; struct vfio_pci_device *pdev = &device->pci; + struct msix_cap *msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX); device->irq_info = (struct vfio_irq_info) { - .argsz = sizeof(device->irq_info) + .argsz = sizeof(device->irq_info), + .index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : + VFIO_PCI_INTX_IRQ_INDEX, }; - if (pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX)) { - /* TODO: set up shadow PBA/table structures for MSI-X.
*/ + ioctl(device->fd, VFIO_DEVICE_GET_IRQ_INFO, &device->irq_info); + if (device->irq_info.count == 0) { + pr_err("No interrupt found by VFIO"); + return -ENODEV; + } + + if (!(device->irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) { + pr_err("Interrupt not EVENTFD capable"); + return -EINVAL; + } + + if (msix) { + if (device->irq_info.count != pdev->msix_table.nr_entries) { + pr_err("Invalid number of MSI-X reported by VFIO"); + return -EINVAL; + } + + ret = vfio_pci_init_msix_irqfd(kvm, device); } else { int gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET; - /* We don't have MSI-X, so fall back on INTx */ pr_info("MSI-X not available for device 0x%x, falling back to INTx", device->dev_hdr.dev_num); - device->irq_info.index = VFIO_PCI_INTX_IRQ_INDEX; - ioctl(device->fd, VFIO_DEVICE_GET_IRQ_INFO, &device->irq_info); - - if (device->irq_info.count != 1) { - pr_err("No INTx interrupts found"); - return -ENODEV; - } - - if (!(device->irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) { - pr_err("INTx interrupt not EVENTFD capable"); - return -EINVAL; - } if (!(device->irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) { pr_err("INTx interrupt not AUTOMASKED"); @@ -425,11 +751,9 @@ static int vfio_configure_dev_irqs(struct kvm *kvm, struct vfio_device *device) } ret = vfio_init_irqfd(kvm, device->fd, gsi); - if (ret) - return ret; } - return 0; + return ret; } static int vfio_configure_device(struct kvm *kvm, struct vfio_group *group, -- 2.12.1