[PATCH v5 kvmtool 00/13] Add vfio-pci support

Jean-Philippe Brucker <jean-philippe.brucker@xxxxxxx> · Thu, 15 Mar 2018 15:04:51 +0000

Add vfio-pci support to kvmtool. Devices are assigned to the guest by
passing "--vfio-pci [domain:]bus:dev.fn" to lkvm run, after having bound
the device to the VFIO driver (see Documentation/vfio.txt)

This version addresses Punit's comments on v4. Patch 7/13 is new; 8/13
and 9/13 have fixes. You can find the full diff v4..v5 at the end of
this cover letter.

I'm testing the series on arm64 and x86, by passing an Intel x540 NIC to
a guest on AMD Seattle, and an e1000e NIC on QEMU q35.

v4: https://www.spinics.net/lists/kvm/msg159223.html
v5: git://linux-arm.org/kvmtool-jpb.git vfio/v5

Jean-Philippe Brucker (13):
  pci: add config operations callbacks on the PCI header
  pci: allow to specify IRQ type for PCI devices
  irq: add irqfd helpers
  Extend memory bank API with memory types
  pci: add capability helpers
  Import VFIO headers
  Add fls_long and roundup_pow_of_two helpers
  Add PCI device passthrough using VFIO
  vfio-pci: add MSI-X support
  vfio-pci: add MSI support
  vfio: Support non-mmappable regions
  Introduce reserved memory regions
  vfio: check reserved regions before mapping DMA

 Makefile                     |    2 +
 arm/gic.c                    |   74 ++-
 arm/include/arm-common/gic.h |    6 +
 arm/kvm.c                    |    2 +-
 arm/pci.c                    |    4 +-
 builtin-run.c                |    5 +
 hw/pci-shmem.c               |   12 +-
 hw/vesa.c                    |    2 +-
 include/kvm/irq.h            |   17 +
 include/kvm/kvm-config.h     |    3 +
 include/kvm/kvm.h            |   54 +-
 include/kvm/pci.h            |  118 ++++-
 include/kvm/util.h           |   14 +
 include/kvm/vfio.h           |  127 +++++
 include/linux/vfio.h         |  719 +++++++++++++++++++++++++
 irq.c                        |   31 ++
 kvm.c                        |   99 +++-
 mips/kvm.c                   |    6 +-
 pci.c                        |  105 ++--
 powerpc/kvm.c                |    2 +-
 vfio/core.c                  |  674 ++++++++++++++++++++++++
 vfio/pci.c                   | 1193 ++++++++++++++++++++++++++++++++++++++++++
 virtio/net.c                 |    9 +-
 virtio/scsi.c                |   10 +-
 x86/kvm.c                    |    6 +-
 25 files changed, 3171 insertions(+), 123 deletions(-)
 create mode 100644 include/kvm/vfio.h
 create mode 100644 include/linux/vfio.h
 create mode 100644 vfio/core.c
 create mode 100644 vfio/pci.c

-- 
2.16.2

---

diff --git a/include/kvm/util.h b/include/kvm/util.h
index 0df9f0dfdb43..4ca7aa9392b6 100644
--- a/include/kvm/util.h
+++ b/include/kvm/util.h
@@ -90,6 +90,20 @@ static inline void msleep(unsigned int msecs)
 	usleep(MSECS_TO_USECS(msecs));
 }
 
+/*
+ * Find last (most significant) bit set. Same implementation as Linux:
+ * fls(0) = 0, fls(1) = 1, fls(1UL << 63) = 64
+ */
+static inline int fls_long(unsigned long x)
+{
+	return x ? sizeof(x) * 8 - __builtin_clzl(x) : 0;
+}
+
+static inline unsigned long roundup_pow_of_two(unsigned long x)
+{
+	return x ? 1UL << fls_long(x - 1) : 0;
+}
+
 struct kvm;
 void *mmap_hugetlbfs(struct kvm *kvm, const char *htlbfs_path, u64 size);
 void *mmap_anon_or_hugetlbfs(struct kvm *kvm, const char *hugetlbfs_path, u64 size);
diff --git a/vfio/core.c b/vfio/core.c
index be34a4b80146..248f7d0082cc 100644
--- a/vfio/core.c
+++ b/vfio/core.c
@@ -665,6 +665,10 @@ static int vfio__exit(struct kvm *kvm)
 	free(vfio_devices);
 
 	kvm__for_each_mem_bank(kvm, KVM_MEM_TYPE_RAM, vfio_unmap_mem_bank, NULL);
-	return close(vfio_container);
+	close(vfio_container);
+
+	free(kvm->cfg.vfio_devices);
+
+	return 0;
 }
 dev_base_exit(vfio__exit);
diff --git a/vfio/pci.c b/vfio/pci.c
index 9c3ef2f1a373..f7ac898ec88a 100644
--- a/vfio/pci.c
+++ b/vfio/pci.c
@@ -150,8 +150,7 @@ static int vfio_pci_disable_msis(struct kvm *kvm, struct vfio_device *vdev,
 		.count	= 0,
 	};
 
-	if (!msi_is_enabled(msis->phys_state) ||
-	    msi_is_enabled(msis->virt_state))
+	if (!msi_is_enabled(msis->phys_state))
 		return 0;
 
 	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
@@ -253,8 +252,17 @@ static void vfio_pci_msix_table_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
 	u64 offset = addr - pdev->msix_table.guest_phys_addr;
 
 	size_t vector = offset / PCI_MSIX_ENTRY_SIZE;
-	/* PCI spec says that software must use aligned 4 or 8 bytes accesses */
 	off_t field = offset % PCI_MSIX_ENTRY_SIZE;
+
+	/*
+	 * PCI spec says that software must use aligned 4 or 8 bytes accesses
+	 * for the MSI-X tables.
+	 */
+	if ((len != 4 && len != 8) || addr & (len - 1)) {
+		dev_warn(vdev, "invalid MSI-X table access");
+		return;
+	}
+
 	entry = &pdev->msix.entries[vector];
 
 	mutex_lock(&pdev->msix.mutex);
@@ -266,7 +274,11 @@ static void vfio_pci_msix_table_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
 
 	memcpy((void *)&entry->config + field, data, len);
 
-	if (field != PCI_MSIX_ENTRY_VECTOR_CTRL)
+	/*
+	 * Check if access touched the vector control register, which is at the
+	 * end of the MSI-X entry.
+	 */
+	if (field + len <= PCI_MSIX_ENTRY_VECTOR_CTRL)
 		goto out_unlock;
 
 	msi_set_masked(entry->virt_state, entry->config.ctrl &
@@ -302,12 +314,11 @@ static void vfio_pci_msix_cap_write(struct kvm *kvm,
 	/* Read byte that contains the Enable bit */
 	flags = *(u8 *)(data + enable_pos - off) << 8;
 
-	msi_set_masked(pdev->msix.virt_state, flags & PCI_MSIX_FLAGS_MASKALL);
-	msi_set_enabled(pdev->msix.virt_state, flags & PCI_MSIX_FLAGS_ENABLE);
+	mutex_lock(&pdev->msix.mutex);
 
+	msi_set_masked(pdev->msix.virt_state, flags & PCI_MSIX_FLAGS_MASKALL);
 	enable = flags & PCI_MSIX_FLAGS_ENABLE;
-
-	mutex_lock(&pdev->msix.mutex);
+	msi_set_enabled(pdev->msix.virt_state, enable);
 
 	if (enable && vfio_pci_enable_msis(kvm, vdev, true))
 		dev_err(vdev, "cannot enable MSIX");
@@ -702,8 +713,8 @@ static int vfio_pci_create_msix_table(struct kvm *kvm,
 {
 	int ret;
 	size_t i;
+	size_t mmio_size;
 	size_t nr_entries;
-	size_t table_size;
 	struct vfio_pci_msi_entry *entries;
 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
 	struct vfio_pci_msix_table *table = &pdev->msix_table;
@@ -716,7 +727,8 @@ static int vfio_pci_create_msix_table(struct kvm *kvm,
 	 * KVM needs memory regions to be multiple of and aligned on PAGE_SIZE.
 	 */
 	nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
-	table_size = ALIGN(nr_entries * PCI_MSIX_ENTRY_SIZE, PAGE_SIZE);
+	table->size = ALIGN(nr_entries * PCI_MSIX_ENTRY_SIZE, PAGE_SIZE);
+	pba->size = ALIGN(DIV_ROUND_UP(nr_entries, 64), PAGE_SIZE);
 
 	entries = calloc(nr_entries, sizeof(struct vfio_pci_msi_entry));
 	if (!entries)
@@ -727,20 +739,20 @@ static int vfio_pci_create_msix_table(struct kvm *kvm,
 
 	/*
 	 * To ease MSI-X cap configuration in case they share the same BAR,
-	 * collapse table and pending array. According to PCI, address spaces
-	 * must be power of two. Since nr_entries is a power of two, and PBA
-	 * size is less than table_size, reserve 2*table_size.
+	 * collapse table and pending array. The size of the BAR regions must be
+	 * powers of two.
 	 */
-	table->guest_phys_addr = pci_get_io_space_block(2 * table_size);
+	mmio_size = roundup_pow_of_two(table->size + pba->size);
+	table->guest_phys_addr = pci_get_io_space_block(mmio_size);
 	if (!table->guest_phys_addr) {
 		pr_err("cannot allocate IO space");
 		ret = -ENOMEM;
 		goto out_free;
 	}
-	pba->guest_phys_addr = table->guest_phys_addr + table_size;
+	pba->guest_phys_addr = table->guest_phys_addr + table->size;
 
-	ret = kvm__register_mmio(kvm, table->guest_phys_addr, table_size, false,
-				 vfio_pci_msix_table_access, pdev);
+	ret = kvm__register_mmio(kvm, table->guest_phys_addr, table->size,
+				 false, vfio_pci_msix_table_access, pdev);
 	if (ret < 0)
 		goto out_free;
 
@@ -752,8 +764,6 @@ static int vfio_pci_create_msix_table(struct kvm *kvm,
 	 * between MSI-X table and PBA. For the sake of isolation, create a
 	 * virtual PBA.
 	 */
-	pba->size = nr_entries / 8;
-
 	ret = kvm__register_mmio(kvm, pba->guest_phys_addr, pba->size, false,
 				 vfio_pci_msix_pba_access, pdev);
 	if (ret < 0)
@@ -761,7 +771,6 @@ static int vfio_pci_create_msix_table(struct kvm *kvm,
 
 	pdev->msix.entries = entries;
 	pdev->msix.nr_entries = nr_entries;
-	table->size = table_size;
 
 	return 0;
 
@@ -801,7 +810,7 @@ static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
 	region->vdev = vdev;
 	region->is_ioport = !!(bar & PCI_BASE_ADDRESS_SPACE_IO);
 	region->info = (struct vfio_region_info) {
-		.argsz = sizeof(*region),
+		.argsz = sizeof(region->info),
 		.index = nr,
 	};