Re: [PATCH v5 kvmtool 08/13] Add PCI device passthrough using VFIO

Will Deacon <will.deacon@xxxxxxx> · Tue, 12 Jun 2018 14:41:00 +0100

On Thu, Mar 15, 2018 at 03:04:59PM +0000, Jean-Philippe Brucker wrote:
> Assigning devices using VFIO allows the guest to have direct access to the
> device, whilst filtering accesses to sensitive areas by trapping config
> space accesses and mapping DMA with an IOMMU.
> 
> This patch adds a new option to lkvm run: --vfio-pci=<BDF>. Before
> assigning a device to a VM, some preparation is required. As described in
> Linux Documentation/vfio.txt, the device driver needs to be changed to
> vfio-pci:
> 
>   $ dev=0000:00:00.0
> 
>   $ echo $dev > /sys/bus/pci/devices/$dev/driver/unbind
>   $ echo vfio-pci > /sys/bus/pci/devices/$dev/driver_override
>   $ echo $dev > /sys/bus/pci/drivers_probe
> 
> Adding --vfio-pci=$dev to lkvm-run will pass the device to the guest.
> Multiple devices can be passed to the guest by adding more --vfio-pci
> parameters.
> 
> This patch only implements PCI with INTx. MSI-X routing will be added in a
> subsequent patch, and at some point we might add support for passing
> platform devices to guests.
> 
> Signed-off-by: Will Deacon <will.deacon@xxxxxxx>
> Signed-off-by: Robin Murphy <robin.murphy@xxxxxxx>
> Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@xxxxxxx>

[...]

> diff --git a/include/kvm/vfio.h b/include/kvm/vfio.h
> new file mode 100644
> index 000000000000..71b012184caf
> --- /dev/null
> +++ b/include/kvm/vfio.h
> @@ -0,0 +1,71 @@
> +#ifndef KVM__VFIO_H
> +#define KVM__VFIO_H
> +
> +#include "kvm/parse-options.h"
> +#include "kvm/pci.h"
> +
> +#include <linux/vfio.h>
> +
> +#define dev_err(vdev, fmt, ...) \
> +	pr_err("%s: " fmt, (vdev)->params->name, ##__VA_ARGS__)
> +#define dev_warn(vdev, fmt, ...) \
> +	pr_warning("%s: " fmt, (vdev)->params->name, ##__VA_ARGS__)
> +#define dev_info(vdev, fmt, ...) \
> +	pr_info("%s: " fmt, (vdev)->params->name, ##__VA_ARGS__)
> +#define dev_dbg(vdev, fmt, ...) \
> +	pr_debug("%s: " fmt, (vdev)->params->name, ##__VA_ARGS__)
> +#define dev_die(vdev, fmt, ...) \
> +	die("%s: " fmt, (vdev)->params->name, ##__VA_ARGS__)

We probably want a vfio_ prefix on these macros, since they sound like
they also apply to stuff like virtio_device.

> diff --git a/vfio/core.c b/vfio/core.c
> new file mode 100644
> index 000000000000..25fae8aa783b
> --- /dev/null
> +++ b/vfio/core.c
> @@ -0,0 +1,492 @@
> +#include "kvm/kvm.h"
> +#include "kvm/vfio.h"
> +
> +#include <linux/list.h>
> +
> +#define VFIO_DEV_DIR		"/dev/vfio"
> +#define VFIO_DEV_NODE		VFIO_DEV_DIR "/vfio"
> +#define IOMMU_GROUP_DIR		"/sys/kernel/iommu_groups"
> +
> +static int vfio_container;
> +static LIST_HEAD(vfio_groups);
> +static struct vfio_device *vfio_devices;
> +
> +static int vfio_device_pci_parser(const struct option *opt, char *arg,
> +				  struct vfio_device_params *dev)
> +{
> +	unsigned int domain, bus, devnr, fn;
> +
> +	int nr = sscanf(arg, "%4x:%2x:%2x.%1x", &domain, &bus, &devnr, &fn);
> +	if (nr < 4) {
> +		domain = 0;
> +		nr = sscanf(arg, "%2x:%2x.%1x", &bus, &devnr, &fn);
> +		if (nr < 3) {
> +			pr_err("Invalid device identifier %s", arg);
> +			return -EINVAL;
> +		}
> +	}
> +
> +	dev->type = VFIO_DEVICE_PCI;
> +	dev->bus = "pci";
> +	dev->name = malloc(13);

/* Unlucky for us */

Urgh. I hate string processing in C.

> +	if (!dev->name)
> +		return -ENOMEM;
> +
> +	snprintf(dev->name, 13, "%04x:%02x:%02x.%x", domain, bus, devnr, fn);
> +
> +	return 0;
> +}
> +
> +int vfio_device_parser(const struct option *opt, const char *arg, int unset)
> +{
> +	int ret = -EINVAL;
> +	static int idx = 0;
> +	struct kvm *kvm = opt->ptr;
> +	struct vfio_device_params *dev, *devs;
> +	char *cur, *buf = strdup(arg);
> +
> +	if (!buf)
> +		return -ENOMEM;
> +
> +	if (idx >= MAX_VFIO_DEVICES) {
> +		pr_warning("Too many VFIO devices");
> +		goto out_free_buf;
> +	}
> +
> +	devs = realloc(kvm->cfg.vfio_devices, sizeof(*dev) * (idx + 1));
> +	if (!devs) {
> +		ret = -ENOMEM;
> +		goto out_free_buf;
> +	}
> +
> +	kvm->cfg.vfio_devices = devs;
> +	dev = &devs[idx];
> +
> +	cur = strtok(buf, ",");
> +
> +	if (!strcmp(opt->long_name, "vfio-pci"))
> +		ret = vfio_device_pci_parser(opt, cur, dev);

Do you need to avoid passing cur == NULL to sscanf here?

> +	else
> +		ret = -EINVAL;
> +
> +	if (!ret)
> +		kvm->cfg.num_vfio_devices = ++idx;
> +
> +out_free_buf:
> +	free(buf);
> +
> +	return ret;
> +}
> +
> +int vfio_map_region(struct kvm *kvm, struct vfio_device *vdev,
> +		    struct vfio_region *region)
> +{
> +	void *base;
> +	int ret, prot = 0;
> +	/* KVM needs page-aligned regions */
> +	u64 map_size = ALIGN(region->info.size, PAGE_SIZE);
> +
> +	/*
> +	 * We don't want to mess about trapping config accesses, so require that
> +	 * they can be mmap'd. Note that for PCI, this precludes the use of I/O
> +	 * BARs in the guest (we will hide them from Configuration Space, which
> +	 * is trapped).
> +	 */
> +	if (!(region->info.flags & VFIO_REGION_INFO_FLAG_MMAP)) {
> +		dev_info(vdev, "ignoring region %u, as it can't be mmap'd",
> +			 region->info.index);
> +		return 0;
> +	}
> +
> +	if (region->info.flags & VFIO_REGION_INFO_FLAG_READ)
> +		prot |= PROT_READ;
> +	if (region->info.flags & VFIO_REGION_INFO_FLAG_WRITE)
> +		prot |= PROT_WRITE;
> +
> +	base = mmap(NULL, region->info.size, prot, MAP_SHARED, vdev->fd,
> +		    region->info.offset);
> +	if (base == MAP_FAILED) {
> +		ret = -errno;
> +		dev_err(vdev, "failed to mmap region %u (0x%llx bytes)",
> +			region->info.index, region->info.size);
> +		return ret;
> +	}
> +	region->host_addr = base;
> +
> +	ret = kvm__register_dev_mem(kvm, region->guest_phys_addr, map_size,
> +				    region->host_addr);
> +	if (ret) {
> +		dev_err(vdev, "failed to register region with KVM");
> +		return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +void vfio_unmap_region(struct kvm *kvm, struct vfio_region *region)
> +{
> +	munmap(region->host_addr, region->info.size);
> +}
> +
> +static int vfio_configure_device(struct kvm *kvm, struct vfio_device *vdev)
> +{
> +	int ret;
> +	struct vfio_group *group = vdev->group;
> +
> +	vdev->fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD,
> +			 vdev->params->name);
> +	if (vdev->fd < 0) {
> +		dev_warn(vdev, "failed to get fd");
> +
> +		/* The device might be a bridge without an fd */
> +		return 0;
> +	}
> +
> +	vdev->info.argsz = sizeof(vdev->info);
> +	if (ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &vdev->info)) {
> +		ret = -errno;
> +		dev_err(vdev, "failed to get info");
> +		goto err_close_device;
> +	}
> +
> +	if (vdev->info.flags & VFIO_DEVICE_FLAGS_RESET &&
> +	    ioctl(vdev->fd, VFIO_DEVICE_RESET) < 0)
> +		dev_warn(vdev, "failed to reset device");
> +
> +	vdev->regions = calloc(vdev->info.num_regions, sizeof(*vdev->regions));
> +	if (!vdev->regions) {
> +		ret = -ENOMEM;
> +		goto err_close_device;
> +	}
> +
> +	/* Now for the bus-specific initialization... */
> +	switch (vdev->params->type) {
> +	case VFIO_DEVICE_PCI:
> +		BUG_ON(!(vdev->info.flags & VFIO_DEVICE_FLAGS_PCI));
> +		ret = vfio_pci_setup_device(kvm, vdev);
> +		break;
> +	default:
> +		BUG_ON(1);
> +		ret = -EINVAL;
> +	}
> +
> +	if (ret)
> +		goto err_free_regions;
> +
> +	dev_info(vdev, "assigned to device number 0x%x in group %lu",
> +		 vdev->dev_hdr.dev_num, group->id);
> +
> +	return 0;
> +
> +err_free_regions:
> +	free(vdev->regions);
> +err_close_device:
> +	close(vdev->fd);
> +
> +	return ret;
> +}
> +
> +static int vfio_configure_devices(struct kvm *kvm)
> +{
> +	int i, ret;
> +
> +	for (i = 0; i < kvm->cfg.num_vfio_devices; ++i) {
> +		ret = vfio_configure_device(kvm, &vfio_devices[i]);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +static int vfio_get_iommu_type(void)
> +{
> +	if (ioctl(vfio_container, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU))
> +		return VFIO_TYPE1v2_IOMMU;
> +
> +	if (ioctl(vfio_container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
> +		return VFIO_TYPE1_IOMMU;
> +
> +	return -ENODEV;
> +}
> +
> +static int vfio_map_mem_bank(struct kvm *kvm, struct kvm_mem_bank *bank, void *data)
> +{
> +	int ret = 0;
> +	struct vfio_iommu_type1_dma_map dma_map = {
> +		.argsz	= sizeof(dma_map),
> +		.flags	= VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
> +		.vaddr	= (unsigned long)bank->host_addr,
> +		.iova	= (u64)bank->guest_phys_addr,
> +		.size	= bank->size,
> +	};
> +
> +	/* Map the guest memory for DMA (i.e. provide isolation) */
> +	if (ioctl(vfio_container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
> +		ret = -errno;
> +		pr_err("Failed to map 0x%llx -> 0x%llx (%llu) for DMA",
> +		       dma_map.iova, dma_map.vaddr, dma_map.size);
> +	}
> +
> +	return ret;
> +}
> +
> +static int vfio_unmap_mem_bank(struct kvm *kvm, struct kvm_mem_bank *bank, void *data)
> +{
> +	struct vfio_iommu_type1_dma_unmap dma_unmap = {
> +		.argsz = sizeof(dma_unmap),
> +		.size = bank->size,
> +		.iova = bank->guest_phys_addr,
> +	};
> +
> +	ioctl(vfio_container, VFIO_IOMMU_UNMAP_DMA, &dma_unmap);
> +
> +	return 0;
> +}
> +
> +static struct vfio_group *vfio_group_create(struct kvm *kvm, unsigned long id)
> +{
> +	int ret;
> +	struct vfio_group *group;
> +	char group_node[PATH_MAX];
> +	struct vfio_group_status group_status = {
> +		.argsz = sizeof(group_status),
> +	};
> +
> +	group = calloc(1, sizeof(*group));
> +	if (!group)
> +		return NULL;
> +
> +	group->id	= id;
> +	group->refs	= 1;
> +
> +	ret = snprintf(group_node, PATH_MAX, VFIO_DEV_DIR "/%lu", id);
> +	if (ret < 0 || ret == PATH_MAX)
> +		return NULL;
> +
> +	group->fd = open(group_node, O_RDWR);
> +	if (group->fd < 0) {
> +		pr_err("Failed to open IOMMU group %s", group_node);
> +		goto err_free_group;
> +	}
> +
> +	if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &group_status)) {
> +		pr_err("Failed to determine status of IOMMU group %lu", id);
> +		goto err_close_group;
> +	}
> +
> +	if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
> +		pr_err("IOMMU group %lu is not viable", id);
> +		goto err_close_group;
> +	}
> +
> +	if (ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &vfio_container)) {
> +		pr_err("Failed to add IOMMU group %lu to VFIO container", id);
> +		goto err_close_group;
> +	}
> +
> +	list_add(&group->list, &vfio_groups);
> +
> +	return group;
> +
> +err_close_group:
> +	close(group->fd);
> +err_free_group:
> +	free(group);
> +
> +	return NULL;
> +}
> +
> +static void vfio_group_exit(struct kvm *kvm, struct vfio_group *group)
> +{
> +	if (--group->refs != 0)
> +		return;
> +
> +	ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER);
> +
> +	list_del(&group->list);
> +	close(group->fd);
> +	free(group);
> +}
> +
> +static struct vfio_group *
> +vfio_group_get_for_dev(struct kvm *kvm, struct vfio_device *vdev)
> +{
> +	int dirfd;
> +	ssize_t ret;
> +	char *group_name;
> +	unsigned long group_id;
> +	char group_path[PATH_MAX];
> +	struct vfio_group *group = NULL;
> +
> +	/* Find IOMMU group for this device */
> +	dirfd = open(vdev->sysfs_path, O_DIRECTORY | O_PATH | O_RDONLY);
> +	if (dirfd < 0) {
> +		dev_err(vdev, "failed to open '%s'", vdev->sysfs_path);
> +		return NULL;
> +	}
> +
> +	ret = readlinkat(dirfd, "iommu_group", group_path, PATH_MAX);
> +	if (ret < 0) {
> +		dev_err(vdev, "no iommu_group");
> +		goto out_close;
> +	}
> +	if (ret == PATH_MAX)
> +		goto out_close;
> +
> +	group_path[ret] = '\0';
> +
> +	group_name = basename(group_path);
> +	errno = 0;
> +	group_id = strtoul(group_name, NULL, 10);
> +	if (errno)
> +		goto out_close;
> +
> +	list_for_each_entry(group, &vfio_groups, list) {
> +		if (group->id == group_id) {
> +			group->refs++;
> +			return group;
> +		}
> +	}
> +
> +	group = vfio_group_create(kvm, group_id);
> +
> +out_close:
> +	close(dirfd);
> +	return group;
> +}
> +
> +static int vfio_device_init(struct kvm *kvm, struct vfio_device *vdev)
> +{
> +	int ret;
> +	char dev_path[PATH_MAX];
> +	struct vfio_group *group;
> +
> +	ret = snprintf(dev_path, PATH_MAX, "/sys/bus/%s/devices/%s",
> +		       vdev->params->bus, vdev->params->name);
> +	if (ret < 0 || ret == PATH_MAX)
> +		return -EINVAL;
> +
> +	vdev->sysfs_path = strndup(dev_path, PATH_MAX);
> +	if (!vdev->sysfs_path)
> +		return -errno;
> +
> +	group = vfio_group_get_for_dev(kvm, vdev);

Hmm, so if I have a number of devices that share a group and I specify the
BDF for one of those devices to lkvm, then will it pass through the entire
group without warning? Is that desired behaviour?

Will