[PATCH kvmtool 06/10] Add PCI device pass-through using VFIO

Jean-Philippe Brucker <jean-philippe.brucker@xxxxxxx> · Fri, 31 Mar 2017 18:54:46 +0100

Assigning devices using VFIO allows the guest to have direct access to the
device, whilst filtering accesses to sensitive areas by trapping config
space accesses and mapping DMA with an IOMMU.

This patch adds a new option to lkvm run: --vfio-group=<group_number>.
Before assigning a device to a VM, some preparation is required. As
described in Linux Documentation/vfio.txt, the device driver needs to be
changed to vfio-pci:

  $ device_num=0000:00:00.0
  $ device_id="1af4 1001"

  $ echo $device_num > /sys/bus/pci/devices/$device_num/driver/unbind
  $ echo $device_id  > /sys/bus/pci/drivers/vfio-pci/new_id
  $ readlink /sys/bus/pci/devices/$device_num/iommu_group
  ../../../kernel/iommu_groups/5

Adding --vfio[-group]=5 to lkvm-run will pass the device to the guest.
Multiple groups can be passed to the guest by adding more --vfio
parameters.

This patch only implements PCI with INTx. MSI-X routing will be added in
a subsequent patch, and at some point we might add support for
passing-through platform devices.

Signed-off-by: Will Deacon <will.deacon@xxxxxxx>
Signed-off-by: Robin Murphy <robin.murphy@xxxxxxx>
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@xxxxxxx>
---
 Makefile                 |   1 +
 arm/pci.c                |   1 +
 builtin-run.c            |   5 +
 include/kvm/kvm-config.h |   3 +
 include/kvm/pci.h        |   3 +-
 include/kvm/vfio.h       |  39 +++
 vfio.c                   | 706 +++++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 757 insertions(+), 1 deletion(-)
 create mode 100644 include/kvm/vfio.h
 create mode 100644 vfio.c

diff --git a/Makefile b/Makefile
index 57714815..6d5f5d9d 100644
--- a/Makefile
+++ b/Makefile
@@ -59,6 +59,7 @@ OBJS	+= main.o
 OBJS	+= mmio.o
 OBJS	+= pci.o
 OBJS	+= term.o
+OBJS	+= vfio.o
 OBJS	+= virtio/blk.o
 OBJS	+= virtio/scsi.o
 OBJS	+= virtio/console.o
diff --git a/arm/pci.c b/arm/pci.c
index 744b14c2..557cfa98 100644
--- a/arm/pci.c
+++ b/arm/pci.c
@@ -1,5 +1,6 @@
 #include "kvm/devices.h"
 #include "kvm/fdt.h"
+#include "kvm/kvm.h"
 #include "kvm/of_pci.h"
 #include "kvm/pci.h"
 #include "kvm/util.h"
diff --git a/builtin-run.c b/builtin-run.c
index 87acb370..b4790ebc 100644
--- a/builtin-run.c
+++ b/builtin-run.c
@@ -146,6 +146,11 @@ void kvm_run_set_wrapper_sandbox(void)
 	OPT_BOOLEAN('\0', "no-dhcp", &(cfg)->no_dhcp, "Disable kernel"	\
 			" DHCP in rootfs mode"),			\
 									\
+	OPT_GROUP("VFIO options:"),					\
+	OPT_CALLBACK('\0', "vfio-group", NULL, "group number",		\
+			"Pass through a VFIO group to the virtual "	\
+			"machine", vfio_group_parser, kvm),		\
+									\
 	OPT_GROUP("Debug options:"),					\
 	OPT_BOOLEAN('\0', "debug", &do_debug_print,			\
 			"Enable debug messages"),			\
diff --git a/include/kvm/kvm-config.h b/include/kvm/kvm-config.h
index 386fa8c5..62dc6a2f 100644
--- a/include/kvm/kvm-config.h
+++ b/include/kvm/kvm-config.h
@@ -2,6 +2,7 @@
 #define KVM_CONFIG_H_
 
 #include "kvm/disk-image.h"
+#include "kvm/vfio.h"
 #include "kvm/kvm-config-arch.h"
 
 #define DEFAULT_KVM_DEV		"/dev/kvm"
@@ -20,9 +21,11 @@
 struct kvm_config {
 	struct kvm_config_arch arch;
 	struct disk_image_params disk_image[MAX_DISK_IMAGES];
+	struct vfio_group vfio_group[MAX_VFIO_GROUPS];
 	u64 ram_size;
 	u8  image_count;
 	u8 num_net_devices;
+	u8 num_vfio_groups;
 	bool virtio_rng;
 	int active_console;
 	int debug_iodelay;
diff --git a/include/kvm/pci.h b/include/kvm/pci.h
index 2950bb10..44e5adff 100644
--- a/include/kvm/pci.h
+++ b/include/kvm/pci.h
@@ -7,7 +7,6 @@
 #include <endian.h>
 
 #include "kvm/devices.h"
-#include "kvm/kvm.h"
 #include "kvm/msi.h"
 #include "kvm/fdt.h"
 
@@ -22,6 +21,8 @@
 #define PCI_IO_SIZE		0x100
 #define PCI_CFG_SIZE		(1ULL << 24)
 
+struct kvm;
+
 union pci_config_address {
 	struct {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
diff --git a/include/kvm/vfio.h b/include/kvm/vfio.h
new file mode 100644
index 00000000..6d2666b0
--- /dev/null
+++ b/include/kvm/vfio.h
@@ -0,0 +1,39 @@
+#ifndef KVM__VFIO_H
+#define KVM__VFIO_H
+
+#include "kvm/parse-options.h"
+#include "kvm/pci.h"
+
+#include <linux/vfio.h>
+
+#define MAX_VFIO_GROUPS			16
+
+struct vfio_pci_device {
+	struct pci_device_header	hdr;	/* shadow of the device's Configuration Space header */
+};
+
+struct vfio_region {
+	struct vfio_region_info		info;		/* filled by VFIO_DEVICE_GET_REGION_INFO */
+	u32				guest_phys_addr; /* guest address of a mapped BAR, 0 if not mapped */
+	void				*host_addr;	/* mmap() of the region in this process */
+};
+
+struct vfio_device {
+	struct device_header		dev_hdr;	/* handle passed to device__register() */
+
+	int				fd;		/* from VFIO_GROUP_GET_DEVICE_FD */
+	struct vfio_device_info		info;		/* from VFIO_DEVICE_GET_INFO */
+	struct vfio_irq_info		irq_info;	/* from VFIO_DEVICE_GET_IRQ_INFO */
+	struct vfio_region		*regions;	/* calloc'd, info.num_regions entries */
+
+	struct vfio_pci_device		pci;		/* PCI-specific state */
+};
+
+struct vfio_group {
+	unsigned long			id; /* iommu_group number in sysfs */
+	int				fd; /* open /dev/vfio/<id> node */
+};
+
+int vfio_group_parser(const struct option *opt, const char *arg, int unset);
+
+#endif /* KVM__VFIO_H */
diff --git a/vfio.c b/vfio.c
new file mode 100644
index 00000000..0f5bc3dd
--- /dev/null
+++ b/vfio.c
@@ -0,0 +1,706 @@
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/pci.h"
+#include "kvm/util.h"
+#include "kvm/vfio.h"
+
+#include <linux/kvm.h>
+#include <linux/pci_regs.h>
+
+#include <sys/epoll.h>
+#include <sys/eventfd.h>
+
+#include <dirent.h>
+#include <pthread.h>
+
+#define VFIO_DEV_DIR		"/dev/vfio"
+#define VFIO_DEV_NODE		VFIO_DEV_DIR "/vfio"
+#define IOMMU_GROUP_DIR		"/sys/kernel/iommu_groups"
+
+#define VFIO_PATH_MAX_LEN	16
+
+struct vfio_irq_eventfd {
+	struct vfio_irq_set	irq;	/* header given to VFIO_DEVICE_SET_IRQS */
+	int			fd;	/* eventfd placed right after the header as its data */
+};
+
+static int vfio_container;
+
+/* --vfio-group callback: store @arg's group number in the next free slot. */
+int vfio_group_parser(const struct option *opt, const char *arg, int unset)
+{
+	static int idx = 0;
+	struct kvm *kvm = opt->ptr;
+	char *cur, *buf;
+
+	if (idx >= MAX_VFIO_GROUPS) {	/* checked before vfio_group[] is indexed */
+		if (idx++ == MAX_VFIO_GROUPS)
+			pr_warning("Too many VFIO groups");
+		return 0;
+	}
+	buf = arg ? strdup(arg) : NULL;
+	cur = buf ? strtok(buf, ",") : NULL;	/* NULL when arg holds no token */
+	if (!cur)
+		pr_warning("Ignoring unparsable VFIO group argument");
+	else	/* only the text before the first ',' is parsed */
+		kvm->cfg.vfio_group[idx].id = strtoul(cur, NULL, 0);
+	kvm->cfg.num_vfio_groups = cur ? ++idx : idx;
+	free(buf);
+	return 0;
+}
+
+/* Config read hook: issue a throwaway read of @sz bytes at @offset to VFIO. */
+static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
+			      u8 offset, void *data, int sz)
+{
+	struct vfio_pci_device *pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
+	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);
+	struct vfio_region_info *cfg = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+	char scratch[sz];	/* result is discarded; @data is left untouched */
+	ssize_t nr;
+
+	/* Dummy read in case of side-effects */
+	nr = pread(vdev->fd, scratch, sz, cfg->offset + offset);
+	if (nr == sz)
+		return;
+	pr_warning("Failed to read %d bytes from Configuration Space at 0x%x",
+		   sz, offset);
+}
+
+/* Config write hook: forward @sz bytes at @offset to VFIO, then refresh the
+ * matching bytes of the shadow header from the device. */
+static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr,
+			       u8 offset, void *data, int sz)
+{
+	struct vfio_pci_device *pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
+	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);
+	struct vfio_region_info *cfg = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+	u8 *shadow = (u8 *)pci_hdr + offset;
+	u64 pos = cfg->offset + offset;
+
+	if (pwrite(vdev->fd, data, sz, pos) != sz)
+		pr_warning("Failed to write %d bytes to Configuration Space at 0x%x",
+			   sz, offset);
+
+	/* Read back even if the write failed, so the shadow holds what the device reports */
+	if (pread(vdev->fd, shadow, sz, pos) != sz)
+		pr_warning("Failed to read %d bytes from Configuration Space at 0x%x",
+			   sz, offset);
+}
+
+/* Hide the capability list: clear the status bit and the list pointer. */
+static int vfio_pci_parse_caps(struct vfio_device *device)
+{
+	struct pci_device_header *hdr = &device->pci.hdr;
+
+	if (hdr->status & PCI_STATUS_CAP_LIST) {
+		hdr->status &= ~PCI_STATUS_CAP_LIST;
+		hdr->capabilities = 0;
+
+		/* TODO: install virtual capabilities */
+	}
+
+	return 0;	/* nothing here can fail yet */
+}
+
+/* Read the Configuration Space header into the shadow copy and require a normal
+ * header type. Returns 0 or a negative errno (-ENODEV if regions[] has no config slot). */
+static int vfio_pci_parse_cfg_space(struct vfio_device *device)
+{
+	u8 hdr_type;
+	struct vfio_region_info *info;
+	ssize_t sz = PCI_DEV_CFG_SIZE;
+	struct vfio_pci_device *pdev = &device->pci;
+
+	if (device->info.num_regions <= VFIO_PCI_CONFIG_REGION_INDEX) {
+		pr_err("Configuration Space not found");
+		return -ENODEV;
+	}
+
+	info = &device->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+	*info = (struct vfio_region_info) {
+			.argsz = sizeof(*info),
+			.index = VFIO_PCI_CONFIG_REGION_INDEX,
+	};
+
+	ioctl(device->fd, VFIO_DEVICE_GET_REGION_INFO, info);
+	if (!info->size) {
+		pr_err("Configuration Space has size zero?!");
+		return -EINVAL;
+	}
+
+	if (pread(device->fd, &pdev->hdr, sz, info->offset) != sz) {
+		pr_err("Failed to read %zd bytes of Configuration Space", sz);
+		return -EIO;
+	}
+
+	/* Strip bit 7, that indicates multifunction */
+	hdr_type = pdev->hdr.header_type & 0x7f;
+	if (hdr_type != PCI_HEADER_TYPE_NORMAL) {
+		pr_err("Unsupported header type %u", hdr_type);
+		return -EOPNOTSUPP;
+	}
+
+	if (vfio_pci_parse_caps(device))
+		pr_warning("Failed to parse device capabilities");
+	return 0;
+}
+
+/* Rewrite the shadow header to what the guest should see (memory BARs at the guest
+ * addresses we allocated, no cardbus, no ROM), write it to the device, hook cfg accesses. */
+static int vfio_pci_fixup_cfg_space(struct vfio_device *device)
+{
+	int i;
+	ssize_t hdr_sz;
+	struct vfio_region_info *info;
+	struct vfio_pci_device *pdev = &device->pci;
+
+	/* Enable exclusively MMIO and bus mastering */
+	pdev->hdr.command &= ~PCI_COMMAND_IO;
+	pdev->hdr.command |= PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER;
+
+	/* Initialise the BARs */
+	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
+		struct vfio_region *region = &device->regions[i];
+		u32 base = region->guest_phys_addr;
+
+		if (!base)	/* no guest address was assigned to this BAR */
+			continue;
+
+		pdev->hdr.bar_size[i] = region->info.size;
+
+		/* Construct a fake reg to match what we've mapped. */
+		pdev->hdr.bar[i] = (base & PCI_BASE_ADDRESS_MEM_MASK) |
+					PCI_BASE_ADDRESS_SPACE_MEMORY |
+					PCI_BASE_ADDRESS_MEM_TYPE_32;
+	}
+
+	/* Cardbus is not supported. */
+	pdev->hdr.card_bus = 0;
+
+	/* Nuke the expansion ROM for now. If we want to do this properly, we
+	 * need to save its size somewhere and map into the guest. */
+	pdev->hdr.exp_rom_bar = 0;
+
+	/* Install our fake Configuration Space, without the caps */
+	info = &device->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+	hdr_sz = offsetof(struct pci_device_header, msix);	/* header bytes up to the msix member */
+	if (pwrite(device->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
+		pr_err("Failed to write %zd bytes to Configuration Space", hdr_sz);
+		return -EIO;
+	}
+
+	/* TODO: install virtual capabilities */
+	/* Register callbacks for cfg accesses */
+	pdev->hdr.cfg_ops = (struct pci_config_operations) {
+		.read	= vfio_pci_cfg_read,
+		.write	= vfio_pci_cfg_write,
+	};
+
+	pdev->hdr.irq_type = IRQ_TYPE_LEVEL_HIGH;
+
+	return 0;
+}
+
+/* mmap one BAR of @fd and register it as guest memory; 0 (also if skipped) or -errno. */
+static int vfio_pci_map_bar(struct kvm *kvm, int fd, struct vfio_region *region)
+{
+	void *base;
+	int ret, prot = 0;
+	u64 map_size = ALIGN(region->info.size, PAGE_SIZE);
+
+	/*
+	 * We don't want to mess about trapping BAR accesses, so require
+	 * that they can be mmap'd. Note that this precludes the use of
+	 * I/O BARs in the guest (we will hide them from Configuration
+	 * Space, which is trapped).
+	 */
+	if (!(region->info.flags & VFIO_REGION_INFO_FLAG_MMAP)) {
+		pr_info("Ignoring BAR %u, as it can't be mmap'd",
+			region->info.index);
+		return 0;
+	}
+
+	if (region->info.flags & VFIO_REGION_INFO_FLAG_READ)
+		prot |= PROT_READ;
+	if (region->info.flags & VFIO_REGION_INFO_FLAG_WRITE)
+		prot |= PROT_WRITE;
+
+	base = mmap(NULL, region->info.size, prot, MAP_SHARED, fd,
+		    region->info.offset);
+	if (base == MAP_FAILED) {
+		ret = -errno;
+		pr_err("Failed to mmap BAR region %u (0x%llx bytes)",
+			region->info.index, region->info.size);
+		return ret;
+	}
+	region->host_addr = base;
+
+	/* Grab some MMIO space in the guest and register the BAR with KVM */
+	region->guest_phys_addr = pci_get_io_space_block(map_size);
+	ret = kvm__register_dev_mem(kvm, region->guest_phys_addr, map_size,
+				    region->host_addr);
+	if (ret) {
+		pr_err("Failed to register BAR as memory region with KVM");
+		munmap(base, region->info.size);	/* undo the mapping above */
+		region->host_addr = NULL;
+		region->guest_phys_addr = 0;	/* keep the BAR marked as unmapped */
+	}
+	return ret;
+}
+
+/* Parse Configuration Space, map each implemented BAR into the guest, then
+ * install the fixed-up Configuration Space. Returns 0 or a negative errno. */
+static int vfio_pci_configure_dev_regions(struct kvm *kvm,
+					  struct vfio_device *device)
+{
+	int ret;
+	u32 i, num_regions = device->info.num_regions;
+
+	ret = vfio_pci_parse_cfg_space(device);
+	if (ret)
+		return ret;
+
+	/* First of all, map the BARs directly into the guest */
+	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
+		struct vfio_region *region;
+
+		if (i >= num_regions)
+			return 0;	/* fewer regions than BARs: stop, skipping the fixup */
+
+		region = &device->regions[i];
+		region->info = (struct vfio_region_info) {
+			/* size of the ioctl argument, not of our wrapper struct */
+			.argsz = sizeof(region->info),
+			.index = i,
+		};
+		ioctl(device->fd, VFIO_DEVICE_GET_REGION_INFO, &region->info);
+		/* Ignore invalid or unimplemented regions */
+		if (!region->info.size)
+			continue;
+
+		/* Map the BAR into the guest. Configuration Space is updated
+		 * to reflect our allocation once the loop is done. */
+		ret = vfio_pci_map_bar(kvm, device->fd, region);
+		if (ret)
+			return ret;
+	}
+
+	/* We've configured the BARs, fake up a Configuration Space */
+	return vfio_pci_fixup_cfg_space(device);
+}
+
+/* Allocate device->regions and hand PCI devices on to the PCI region setup. */
+static int vfio_configure_dev_regions(struct kvm *kvm,
+				      struct vfio_device *device)
+{
+	u32 nr = device->info.num_regions;
+
+	if (device->info.flags & VFIO_DEVICE_FLAGS_PCI) {
+		device->regions = calloc(nr, sizeof(*device->regions));
+		if (device->regions)
+			return vfio_pci_configure_dev_regions(kvm, device);
+
+		pr_err("Failed to allocate %u regions for device",
+			nr);
+		return -ENOMEM;
+	}
+
+	/* We only support vfio-pci devices for the moment */
+	pr_warning("Only vfio-pci devices are supported. "
+		   "Ignoring device regions.");
+	device->info.num_regions = 0;
+	return 0;
+}
+
+/*
+ * Wire INTx of @devfd to guest interrupt @gsi. PCI IRQ is level-triggered, so
+ * we use two eventfds: trigger_fd signals an interrupt from host to guest, and
+ * unmask_fd signals the deassertion of the line from guest to host.
+ */
+static int vfio_init_irqfd(struct kvm *kvm, int devfd, int gsi)
+{
+	int ret;
+	int trigger_fd, unmask_fd;
+	struct vfio_irq_eventfd	trigger;
+	struct vfio_irq_eventfd	unmask;
+
+	trigger_fd = eventfd(0, 0);
+	if (trigger_fd < 0) {
+		pr_err("Failed to create trigger eventfd");
+		return trigger_fd;
+	}
+
+	unmask_fd = eventfd(0, 0);
+	if (unmask_fd < 0) {
+		pr_err("Failed to create unmask eventfd");
+		close(trigger_fd);
+		return unmask_fd;
+	}
+
+	ret = irq__add_irqfd(kvm, gsi, trigger_fd, unmask_fd);
+	if (ret)
+		goto err_close;
+
+	trigger.irq = (struct vfio_irq_set) {
+		.argsz	= sizeof(trigger),
+		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
+		.index	= VFIO_PCI_INTX_IRQ_INDEX,
+		.start	= 0,
+		.count	= 1,
+	};
+	trigger.fd = trigger_fd;
+
+	ret = ioctl(devfd, VFIO_DEVICE_SET_IRQS, &trigger);
+	if (ret < 0) {
+		pr_err("Failed to setup VFIO IRQ");
+		goto err_delete_line;
+	}
+
+	unmask.irq = (struct vfio_irq_set) {
+		.argsz	= sizeof(unmask),
+		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK,
+		.index	= VFIO_PCI_INTX_IRQ_INDEX,
+		.start	= 0,
+		.count	= 1,
+	};
+	unmask.fd = unmask_fd;
+
+	ret = ioctl(devfd, VFIO_DEVICE_SET_IRQS, &unmask);
+	if (ret < 0) {
+		pr_err("Failed to setup unmask IRQ");
+		goto err_remove_event;
+	}
+
+	return 0;
+
+err_remove_event:
+	/* DATA_NONE with count 0 tears the trigger down; count 1 would fire it */
+	trigger.irq.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
+	trigger.irq.count = 0;
+	ioctl(devfd, VFIO_DEVICE_SET_IRQS, &trigger.irq);
+
+err_delete_line:
+	irq__del_irqfd(kvm, gsi);
+err_close:
+	close(trigger_fd);
+	close(unmask_fd);
+	return ret;
+}
+
+/*
+ * Set up interrupt delivery for @device. Only INTx is implemented so far;
+ * devices exposing MSI-X are left without any interrupt wiring.
+ */
+static int vfio_configure_dev_irqs(struct kvm *kvm, struct vfio_device *device)
+{
+	struct vfio_irq_info *irq = &device->irq_info;
+	struct pci_device_header *hdr = &device->pci.hdr;
+	int gsi;
+
+	*irq = (struct vfio_irq_info) {
+		.argsz = sizeof(device->irq_info)
+	};
+
+	/* TODO: set up shadow PBA/table structures for MSI-X. */
+	if (pci_find_cap(hdr, PCI_CAP_ID_MSIX))
+		return 0;
+
+	/* We don't have MSI-X, so fall back on INTx */
+	gsi = hdr->irq_line - KVM_IRQ_OFFSET;
+	pr_info("MSI-X not available for device 0x%x, falling back to INTx",
+		device->dev_hdr.dev_num);
+	irq->index = VFIO_PCI_INTX_IRQ_INDEX;
+	ioctl(device->fd, VFIO_DEVICE_GET_IRQ_INFO, irq);
+
+	if (irq->count != 1) {
+		pr_err("No INTx interrupts found");
+		return -ENODEV;
+	}
+
+	if (!(irq->flags & VFIO_IRQ_INFO_EVENTFD)) {
+		pr_err("INTx interrupt not EVENTFD capable");
+		return -EINVAL;
+	}
+
+	if (!(irq->flags & VFIO_IRQ_INFO_AUTOMASKED)) {
+		pr_err("INTx interrupt not AUTOMASKED");
+		return -EINVAL;
+	}
+
+	return vfio_init_irqfd(kvm, device->fd, gsi);
+}
+
+/* Open @dirent's device in @group, map it, register it and wire its IRQ. */
+static int vfio_configure_device(struct kvm *kvm, struct vfio_group *group,
+				 struct dirent *dirent)
+{
+	int ret;
+	struct vfio_device *device;
+
+	device = calloc(1, sizeof(*device));
+	if (!device) {
+		pr_err("Failed to allocate VFIO device");
+		return -ENOMEM;
+	}
+
+	device->fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, dirent->d_name);
+	if (device->fd < 0) {
+		pr_err("Failed to get FD for device %s in group %lu",
+		       dirent->d_name, group->id);
+		free(device);
+		/* The device might be a bridge without an fd */
+		return 0;
+	}
+	device->info.argsz = sizeof(device->info);
+	if (ioctl(device->fd, VFIO_DEVICE_GET_INFO, &device->info)) {
+		ret = -errno;
+		pr_err("Failed to get info for device %s in group %lu",
+		       dirent->d_name, group->id);
+		goto err_free;
+	}
+	if (device->info.flags & VFIO_DEVICE_FLAGS_RESET &&
+	    ioctl(device->fd, VFIO_DEVICE_RESET) < 0)
+		pr_warning("Failed to reset device %s in group %lu",
+			   dirent->d_name, group->id);
+
+	ret = vfio_configure_dev_regions(kvm, device);
+	if (ret) {
+		pr_err("Failed to configure regions for device %s in group %lu",
+		       dirent->d_name, group->id);
+		goto err_free;
+	}
+	device->dev_hdr = (struct device_header) {
+		.bus_type	= DEVICE_BUS_PCI,
+		.data		= &device->pci.hdr,
+	};
+	ret = device__register(&device->dev_hdr);
+	if (ret) {
+		pr_err("Failed to register VFIO device");
+		goto err_free;
+	}
+	ret = vfio_configure_dev_irqs(kvm, device);
+	if (ret) {	/* device is registered by now, so it is not freed here */
+		pr_err("Failed to configure IRQs for device %s in group %lu",
+		       dirent->d_name, group->id);
+		return ret;
+	}
+
+	pr_info("Assigned device %s in group %lu to device number 0x%x",
+		dirent->d_name, group->id, device->dev_hdr.dev_num);
+	return 0;
+
+err_free:	/* failed before registration: release the fd and allocations */
+	close(device->fd);
+	free(device->regions);
+	free(device);
+	return ret;
+}
+
+/* Configure every device symlinked under each requested group's sysfs directory. */
+static int vfio_configure_iommu_groups(struct kvm *kvm)
+{
+	int i, ret;
+
+	for (i = 0; i < kvm->cfg.num_vfio_groups; ++i) {
+		DIR *dir;
+		struct dirent *dirent;
+		char dirpath[PATH_MAX];
+		struct vfio_group *group = &kvm->cfg.vfio_group[i];
+
+		snprintf(dirpath, PATH_MAX, IOMMU_GROUP_DIR "/%lu/devices",
+			 group->id);
+
+		dir = opendir(dirpath);
+		if (!dir) {
+			ret = -errno;
+			pr_err("Failed to open IOMMU group %s", dirpath);
+			return ret;
+		}
+
+		ret = 0;
+		while (!ret && (dirent = readdir(dir))) {	/* stop at first failure, still close dir */
+			if (dirent->d_type == DT_LNK)
+				ret = vfio_configure_device(kvm, group, dirent);
+		}
+
+		if (closedir(dir))
+			pr_warning("Failed to close IOMMU group %s", dirpath);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/* TODO: this should be an arch callback, so arm can return HYP only if vsmmu */
+static int vfio_get_iommu_type(void)
+{
+	/* Candidate IOMMU types, probed in this order; the first hit is returned */
+	static const int types[] = {
+		VFIO_TYPE1_NESTING_IOMMU, VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
+	};
+	unsigned int i;
+
+	for (i = 0; i < sizeof(types) / sizeof(types[0]); i++)
+		if (ioctl(vfio_container, VFIO_CHECK_EXTENSION, types[i]))
+			return types[i];
+	return -ENODEV;
+}
+
+/* Mem-bank iterator callback: map one guest memory bank for device DMA. */
+static int vfio_map_mem_bank(struct kvm *kvm, struct kvm_mem_bank *bank, void *data)
+{
+	struct vfio_iommu_type1_dma_map dma_map = {
+		.argsz	= sizeof(dma_map),
+		.flags	= VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
+		.vaddr	= (u64)bank->host_addr,
+		.iova	= (u64)bank->guest_phys_addr,
+		.size	= bank->size,
+	};
+	int err;
+
+	/* Map the guest memory for DMA (i.e. provide isolation) */
+	if (!ioctl(vfio_container, VFIO_IOMMU_MAP_DMA, &dma_map))
+		return 0;
+	err = -errno;
+	pr_err("Failed to map 0x%llx -> 0x%llx (%llu) for DMA",
+	       dma_map.iova, dma_map.vaddr, dma_map.size);
+	return err;
+}
+
+/* Mem-bank iterator callback: drop a bank's DMA mapping; failures are ignored. */
+static int vfio_unmap_mem_bank(struct kvm *kvm, struct kvm_mem_bank *bank, void *data)
+{
+	struct vfio_iommu_type1_dma_unmap req = {
+		.argsz = sizeof(req),
+		.iova = bank->guest_phys_addr,
+		.size = bank->size,
+	};
+
+	ioctl(vfio_container, VFIO_IOMMU_UNMAP_DMA, &req);
+	return 0;
+}
+
+/*
+ * Open /dev/vfio/<id>, check the group is viable and attach it to
+ * vfio_container. Returns 0 or a negative errno; group->fd is closed on error.
+ */
+static int vfio_group_init(struct kvm *kvm, struct vfio_group *group)
+{
+	int ret = 0;
+	char group_node[32];	/* "/dev/vfio/" + up to 20 digits + NUL */
+	struct vfio_group_status group_status = {
+		.argsz = sizeof(group_status),
+	};
+
+	snprintf(group_node, sizeof(group_node), VFIO_DEV_DIR "/%lu", group->id);
+	group->fd = open(group_node, O_RDWR);
+	if (group->fd == -1) {
+		ret = -errno;
+		pr_err("Failed to open IOMMU group %s", group_node);
+		return ret;
+	}
+
+	if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &group_status)) {
+		ret = -errno;
+		pr_err("Failed to determine status of IOMMU group %s",
+		       group_node);
+	} else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+		ret = -EINVAL;
+		pr_err("IOMMU group %s is not viable", group_node);
+	} else if (ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &vfio_container)) {
+		ret = -errno;
+		pr_err("Failed to add IOMMU group %s to VFIO container",
+		       group_node);
+	}
+
+	if (ret) {	/* don't leak the group fd when the group can't be used */
+		close(group->fd);
+		group->fd = -1;
+	}
+	return ret;
+}
+
+/* Open the VFIO container, attach every requested group, select the IOMMU
+ * type and map guest RAM for DMA. Returns 0 or a negative errno. */
+static int vfio_container_init(struct kvm *kvm)
+{
+	int api, i, ret, iommu_type;
+
+	/* Create a container for our IOMMU groups */
+	vfio_container = open(VFIO_DEV_NODE, O_RDWR);
+	if (vfio_container == -1) {
+		ret = -errno;	/* negative, like every other error return here */
+		pr_err("Failed to open %s", VFIO_DEV_NODE);
+		return ret;
+	}
+
+	api = ioctl(vfio_container, VFIO_GET_API_VERSION);
+	if (api != VFIO_API_VERSION) {
+		pr_err("Unknown VFIO API version %d", api);
+		return -ENODEV;
+	}
+	iommu_type = vfio_get_iommu_type();
+	if (iommu_type < 0) {
+		pr_err("VFIO type-1 IOMMU not supported on this platform");
+		return iommu_type;
+	}
+
+	/* Sanity check our groups and add them to the container */
+	for (i = 0; i < kvm->cfg.num_vfio_groups; ++i) {
+		ret = vfio_group_init(kvm, &kvm->cfg.vfio_group[i]);
+		if (ret)
+			return ret;
+	}
+
+	/* Finalise the container */
+	if (ioctl(vfio_container, VFIO_SET_IOMMU, iommu_type)) {
+		ret = -errno;
+		pr_err("Failed to set IOMMU type %d for VFIO container",
+		       iommu_type);
+		return ret;
+	}
+	pr_info("Using IOMMU type %d for VFIO container", iommu_type);
+
+	return kvm__for_each_mem_bank(kvm, KVM_MEM_TYPE_RAM, vfio_map_mem_bank,
+				      NULL);
+}
+
+/*
+ * Init hook: set up the VFIO container, then configure every device of the
+ * requested groups. Does nothing when no group was requested.
+ */
+static int vfio__init(struct kvm *kvm)
+{
+	int err;
+
+	if (kvm->cfg.num_vfio_groups == 0)
+		return 0;
+
+	err = vfio_container_init(kvm);
+	if (!err)
+		err = vfio_configure_iommu_groups(kvm);
+
+	return err;
+}
+dev_base_init(vfio__init);
+
+/* Exit hook: detach and close each group, unmap guest RAM, close the container. */
+static int vfio__exit(struct kvm *kvm)
+{
+	struct vfio_group *group = kvm->cfg.vfio_group;
+	struct vfio_group *end = group + kvm->cfg.num_vfio_groups;
+
+	if (group == end)
+		return 0;
+
+	for (; group < end; group++) {
+		ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER);
+		close(group->fd);
+	}
+	kvm__for_each_mem_bank(kvm, KVM_MEM_TYPE_RAM, vfio_unmap_mem_bank, NULL);
+	return close(vfio_container);
+}
+dev_base_exit(vfio__exit);
-- 
2.12.1