On Thu, Mar 15, 2018 at 03:04:59PM +0000, Jean-Philippe Brucker wrote: > Assigning devices using VFIO allows the guest to have direct access to the > device, whilst filtering accesses to sensitive areas by trapping config > space accesses and mapping DMA with an IOMMU. > > This patch adds a new option to lkvm run: --vfio-pci=<BDF>. Before > assigning a device to a VM, some preparation is required. As described in > Linux Documentation/vfio.txt, the device driver needs to be changed to > vfio-pci: > > $ dev=0000:00:00.0 > > $ echo $dev > /sys/bus/pci/devices/$dev/driver/unbind > $ echo vfio-pci > /sys/bus/pci/devices/$dev/driver_override > $ echo $dev > /sys/bus/pci/drivers_probe > > Adding --vfio-pci=$dev to lkvm-run will pass the device to the guest. > Multiple devices can be passed to the guest by adding more --vfio-pci > parameters. > > This patch only implements PCI with INTx. MSI-X routing will be added in a > subsequent patch, and at some point we might add support for passing > platform devices to guests. > > Signed-off-by: Will Deacon <will.deacon@xxxxxxx> > Signed-off-by: Robin Murphy <robin.murphy@xxxxxxx> > Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@xxxxxxx> [...] > diff --git a/include/kvm/vfio.h b/include/kvm/vfio.h > new file mode 100644 > index 000000000000..71b012184caf > --- /dev/null > +++ b/include/kvm/vfio.h > @@ -0,0 +1,71 @@ > +#ifndef KVM__VFIO_H > +#define KVM__VFIO_H > + > +#include "kvm/parse-options.h" > +#include "kvm/pci.h" > + > +#include <linux/vfio.h> > + > +#define dev_err(vdev, fmt, ...) \ > + pr_err("%s: " fmt, (vdev)->params->name, ##__VA_ARGS__) > +#define dev_warn(vdev, fmt, ...) \ > + pr_warning("%s: " fmt, (vdev)->params->name, ##__VA_ARGS__) > +#define dev_info(vdev, fmt, ...) \ > + pr_info("%s: " fmt, (vdev)->params->name, ##__VA_ARGS__) > +#define dev_dbg(vdev, fmt, ...) \ > + pr_debug("%s: " fmt, (vdev)->params->name, ##__VA_ARGS__) > +#define dev_die(vdev, fmt, ...) \ > + die("%s: " fmt, (vdev)->params->name, ##__VA_ARGS__) We probably want a vfio_ prefix on these macros, since they sound like they also apply to stuff like virtio_device. > diff --git a/vfio/core.c b/vfio/core.c > new file mode 100644 > index 000000000000..25fae8aa783b > --- /dev/null > +++ b/vfio/core.c > @@ -0,0 +1,492 @@ > +#include "kvm/kvm.h" > +#include "kvm/vfio.h" > + > +#include <linux/list.h> > + > +#define VFIO_DEV_DIR "/dev/vfio" > +#define VFIO_DEV_NODE VFIO_DEV_DIR "/vfio" > +#define IOMMU_GROUP_DIR "/sys/kernel/iommu_groups" > + > +static int vfio_container; > +static LIST_HEAD(vfio_groups); > +static struct vfio_device *vfio_devices; > + > +static int vfio_device_pci_parser(const struct option *opt, char *arg, > + struct vfio_device_params *dev) > +{ > + unsigned int domain, bus, devnr, fn; > + > + int nr = sscanf(arg, "%4x:%2x:%2x.%1x", &domain, &bus, &devnr, &fn); > + if (nr < 4) { > + domain = 0; > + nr = sscanf(arg, "%2x:%2x.%1x", &bus, &devnr, &fn); > + if (nr < 3) { > + pr_err("Invalid device identifier %s", arg); > + return -EINVAL; > + } > + } > + > + dev->type = VFIO_DEVICE_PCI; > + dev->bus = "pci"; > + dev->name = malloc(13); /* Unlucky for us */ Urgh. I hate string processing in C. > + if (!dev->name) > + return -ENOMEM; > + > + snprintf(dev->name, 13, "%04x:%02x:%02x.%x", domain, bus, devnr, fn); > + > + return 0; > +} > + > +int vfio_device_parser(const struct option *opt, const char *arg, int unset) > +{ > + int ret = -EINVAL; > + static int idx = 0; > + struct kvm *kvm = opt->ptr; > + struct vfio_device_params *dev, *devs; > + char *cur, *buf = strdup(arg); > + > + if (!buf) > + return -ENOMEM; > + > + if (idx >= MAX_VFIO_DEVICES) { > + pr_warning("Too many VFIO devices"); > + goto out_free_buf; > + } > + > + devs = realloc(kvm->cfg.vfio_devices, sizeof(*dev) * (idx + 1)); > + if (!devs) { > + ret = -ENOMEM; > + goto out_free_buf; > + } > + > + kvm->cfg.vfio_devices = devs; > + dev = &devs[idx]; > + > + cur = strtok(buf, ","); > + > + if (!strcmp(opt->long_name, "vfio-pci")) > + ret = vfio_device_pci_parser(opt, cur, dev); Do you need to avoid passing cur == NULL to sscanf here? > + else > + ret = -EINVAL; > + > + if (!ret) > + kvm->cfg.num_vfio_devices = ++idx; > + > +out_free_buf: > + free(buf); > + > + return ret; > +} > + > +int vfio_map_region(struct kvm *kvm, struct vfio_device *vdev, > + struct vfio_region *region) > +{ > + void *base; > + int ret, prot = 0; > + /* KVM needs page-aligned regions */ > + u64 map_size = ALIGN(region->info.size, PAGE_SIZE); > + > + /* > + * We don't want to mess about trapping config accesses, so require that > + * they can be mmap'd. Note that for PCI, this precludes the use of I/O > + * BARs in the guest (we will hide them from Configuration Space, which > + * is trapped). > + */ > + if (!(region->info.flags & VFIO_REGION_INFO_FLAG_MMAP)) { > + dev_info(vdev, "ignoring region %u, as it can't be mmap'd", > + region->info.index); > + return 0; > + } > + > + if (region->info.flags & VFIO_REGION_INFO_FLAG_READ) > + prot |= PROT_READ; > + if (region->info.flags & VFIO_REGION_INFO_FLAG_WRITE) > + prot |= PROT_WRITE; > + > + base = mmap(NULL, region->info.size, prot, MAP_SHARED, vdev->fd, > + region->info.offset); > + if (base == MAP_FAILED) { > + ret = -errno; > + dev_err(vdev, "failed to mmap region %u (0x%llx bytes)", > + region->info.index, region->info.size); > + return ret; > + } > + region->host_addr = base; > + > + ret = kvm__register_dev_mem(kvm, region->guest_phys_addr, map_size, > + region->host_addr); > + if (ret) { > + dev_err(vdev, "failed to register region with KVM"); > + return ret; > + } > + > + return 0; > +} > + > +void vfio_unmap_region(struct kvm *kvm, struct vfio_region *region) > +{ > + munmap(region->host_addr, region->info.size); > +} > + > +static int vfio_configure_device(struct kvm *kvm, struct vfio_device *vdev) > +{ > + int ret; > + struct vfio_group *group = vdev->group; > + > + vdev->fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, > + vdev->params->name); > + if (vdev->fd < 0) { > + dev_warn(vdev, "failed to get fd"); > + > + /* The device might be a bridge without an fd */ > + return 0; > + } > + > + vdev->info.argsz = sizeof(vdev->info); > + if (ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &vdev->info)) { > + ret = -errno; > + dev_err(vdev, "failed to get info"); > + goto err_close_device; > + } > + > + if (vdev->info.flags & VFIO_DEVICE_FLAGS_RESET && > + ioctl(vdev->fd, VFIO_DEVICE_RESET) < 0) > + dev_warn(vdev, "failed to reset device"); > + > + vdev->regions = calloc(vdev->info.num_regions, sizeof(*vdev->regions)); > + if (!vdev->regions) { > + ret = -ENOMEM; > + goto err_close_device; > + } > + > + /* Now for the bus-specific initialization... */ > + switch (vdev->params->type) { > + case VFIO_DEVICE_PCI: > + BUG_ON(!(vdev->info.flags & VFIO_DEVICE_FLAGS_PCI)); > + ret = vfio_pci_setup_device(kvm, vdev); > + break; > + default: > + BUG_ON(1); > + ret = -EINVAL; > + } > + > + if (ret) > + goto err_free_regions; > + > + dev_info(vdev, "assigned to device number 0x%x in group %lu", > + vdev->dev_hdr.dev_num, group->id); > + > + return 0; > + > +err_free_regions: > + free(vdev->regions); > +err_close_device: > + close(vdev->fd); > + > + return ret; > +} > + > +static int vfio_configure_devices(struct kvm *kvm) > +{ > + int i, ret; > + > + for (i = 0; i < kvm->cfg.num_vfio_devices; ++i) { > + ret = vfio_configure_device(kvm, &vfio_devices[i]); > + if (ret) > + return ret; > + } > + > + return 0; > +} > + > +static int vfio_get_iommu_type(void) > +{ > + if (ioctl(vfio_container, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU)) > + return VFIO_TYPE1v2_IOMMU; > + > + if (ioctl(vfio_container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) > + return VFIO_TYPE1_IOMMU; > + > + return -ENODEV; > +} > + > +static int vfio_map_mem_bank(struct kvm *kvm, struct kvm_mem_bank *bank, void *data) > +{ > + int ret = 0; > + struct vfio_iommu_type1_dma_map dma_map = { > + .argsz = sizeof(dma_map), > + .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE, > + .vaddr = (unsigned long)bank->host_addr, > + .iova = (u64)bank->guest_phys_addr, > + .size = bank->size, > + }; > + > + /* Map the guest memory for DMA (i.e. provide isolation) */ > + if (ioctl(vfio_container, VFIO_IOMMU_MAP_DMA, &dma_map)) { > + ret = -errno; > + pr_err("Failed to map 0x%llx -> 0x%llx (%llu) for DMA", > + dma_map.iova, dma_map.vaddr, dma_map.size); > + } > + > + return ret; > +} > + > +static int vfio_unmap_mem_bank(struct kvm *kvm, struct kvm_mem_bank *bank, void *data) > +{ > + struct vfio_iommu_type1_dma_unmap dma_unmap = { > + .argsz = sizeof(dma_unmap), > + .size = bank->size, > + .iova = bank->guest_phys_addr, > + }; > + > + ioctl(vfio_container, VFIO_IOMMU_UNMAP_DMA, &dma_unmap); > + > + return 0; > +} > + > +static struct vfio_group *vfio_group_create(struct kvm *kvm, unsigned long id) > +{ > + int ret; > + struct vfio_group *group; > + char group_node[PATH_MAX]; > + struct vfio_group_status group_status = { > + .argsz = sizeof(group_status), > + }; > + > + group = calloc(1, sizeof(*group)); > + if (!group) > + return NULL; > + > + group->id = id; > + group->refs = 1; > + > + ret = snprintf(group_node, PATH_MAX, VFIO_DEV_DIR "/%lu", id); > + if (ret < 0 || ret == PATH_MAX) > + return NULL; > + > + group->fd = open(group_node, O_RDWR); > + if (group->fd < 0) { > + pr_err("Failed to open IOMMU group %s", group_node); > + goto err_free_group; > + } > + > + if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &group_status)) { > + pr_err("Failed to determine status of IOMMU group %lu", id); > + goto err_close_group; > + } > + > + if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) { > + pr_err("IOMMU group %lu is not viable", id); > + goto err_close_group; > + } > + > + if (ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &vfio_container)) { > + pr_err("Failed to add IOMMU group %lu to VFIO container", id); > + goto err_close_group; > + } > + > + list_add(&group->list, &vfio_groups); > + > + return group; > + > +err_close_group: > + close(group->fd); > +err_free_group: > + free(group); > + > + return NULL; > +} > + > +static void vfio_group_exit(struct kvm *kvm, struct vfio_group *group) > +{ > + if (--group->refs != 0) > + return; > + > + ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER); > + > + list_del(&group->list); > + close(group->fd); > + free(group); > +} > + > +static struct vfio_group * > +vfio_group_get_for_dev(struct kvm *kvm, struct vfio_device *vdev) > +{ > + int dirfd; > + ssize_t ret; > + char *group_name; > + unsigned long group_id; > + char group_path[PATH_MAX]; > + struct vfio_group *group = NULL; > + > + /* Find IOMMU group for this device */ > + dirfd = open(vdev->sysfs_path, O_DIRECTORY | O_PATH | O_RDONLY); > + if (dirfd < 0) { > + dev_err(vdev, "failed to open '%s'", vdev->sysfs_path); > + return NULL; > + } > + > + ret = readlinkat(dirfd, "iommu_group", group_path, PATH_MAX); > + if (ret < 0) { > + dev_err(vdev, "no iommu_group"); > + goto out_close; > + } > + if (ret == PATH_MAX) > + goto out_close; > + > + group_path[ret] = '\0'; > + > + group_name = basename(group_path); > + errno = 0; > + group_id = strtoul(group_name, NULL, 10); > + if (errno) > + goto out_close; > + > + list_for_each_entry(group, &vfio_groups, list) { > + if (group->id == group_id) { > + group->refs++; > + return group; > + } > + } > + > + group = vfio_group_create(kvm, group_id); > + > +out_close: > + close(dirfd); > + return group; > +} > + > +static int vfio_device_init(struct kvm *kvm, struct vfio_device *vdev) > +{ > + int ret; > + char dev_path[PATH_MAX]; > + struct vfio_group *group; > + > + ret = snprintf(dev_path, PATH_MAX, "/sys/bus/%s/devices/%s", > + vdev->params->bus, vdev->params->name); > + if (ret < 0 || ret == PATH_MAX) > + return -EINVAL; > + > + vdev->sysfs_path = strndup(dev_path, PATH_MAX); > + if (!vdev->sysfs_path) > + return -errno; > + > + group = vfio_group_get_for_dev(kvm, vdev); Hmm, so if I have a number of devices that share a group and I specify the BDF for one of those devices to lkvm, then will it pass through the entire group without warning? Is that desired behaviour? Will