On Tue, 12 Dec 2017 17:32:18 +1100 Alexey Kardashevskiy <aik@xxxxxxxxx> wrote: > By default VFIO disables mapping of MSIX BAR to the userspace as > the userspace may program it in a way allowing spurious interrupts; > instead the userspace uses the VFIO_DEVICE_SET_IRQS ioctl. > In order to eliminate guessing from the userspace about what is > mmapable, VFIO also advertises a sparse list of regions allowed to mmap. > > This works fine as long as the system page size equals to the MSIX > alignment requirement which is 4KB. However with a bigger page size > the existing code prohibits mapping non-MSIX parts of a page with MSIX > structures so these parts have to be emulated via slow reads/writes on > a VFIO device fd. If these emulated bits are accessed often, this has > serious impact on performance. > > This allows mmap of the entire BAR containing MSIX vector table. > > This removes the sparse capability for PCI devices as it becomes useless. > > As the userspace needs to know for sure whether mmapping of the MSIX > vector containing data can succeed, this adds a new capability - > VFIO_REGION_INFO_CAP_MSIX_MAPPABLE - which explicitly tells the userspace > that the entire BAR can be mmapped. > > This does not touch the MSIX mangling in the BAR read/write handlers as > we are doing this just to enable direct access to non MSIX registers. > > Signed-off-by: Alexey Kardashevskiy <aik@xxxxxxxxx> > --- > Changes: > v2: > * rewritten vfio_info_add_capability() to provide same interface for > VFIO_REGION_INFO_CAP_MSIX_MAPPABLE as for other caps. > --- > include/uapi/linux/vfio.h | 10 ++++++++ > drivers/vfio/pci/vfio_pci.c | 62 +++++---------------------------------------- > drivers/vfio/vfio.c | 37 +++++++++++++++++++++++---- > 3 files changed, 48 insertions(+), 61 deletions(-) > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > index ae46105..0fb4407 100644 > --- a/include/uapi/linux/vfio.h > +++ b/include/uapi/linux/vfio.h > @@ -300,6 +300,16 @@ struct vfio_region_info_cap_type { > #define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2) > #define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) > > +/* > + * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped > + * which allows direct access to non-MSIX registers which happened to be within > + * the same system page. > + * > + * Even though the userspace gets direct access to the MSIX data, the existing > + * VFIO_DEVICE_SET_IRQS interface must still be used for MSIX configuration. > + */ > +#define VFIO_REGION_INFO_CAP_MSIX_MAPPABLE 3 > + > /** > * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9, > * struct vfio_irq_info) > diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c > index a98681d..8daa5de 100644 > --- a/drivers/vfio/pci/vfio_pci.c > +++ b/drivers/vfio/pci/vfio_pci.c > @@ -565,46 +565,11 @@ static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev, > return walk.ret; > } > > -static int msix_sparse_mmap_cap(struct vfio_pci_device *vdev, > - struct vfio_info_cap *caps) > +static int msix_sparse_msix_mmappable_cap(struct vfio_pci_device *vdev, > + struct vfio_info_cap *caps) > { > - struct vfio_region_info_cap_sparse_mmap *sparse; > - size_t end, size; > - int nr_areas = 2, i = 0, ret; > - > - end = pci_resource_len(vdev->pdev, vdev->msix_bar); > - > - /* If MSI-X table is aligned to the start or end, only one area */ > - if (((vdev->msix_offset & PAGE_MASK) == 0) || > - (PAGE_ALIGN(vdev->msix_offset + vdev->msix_size) >= end)) > - nr_areas = 1; > - > - size = sizeof(*sparse) + (nr_areas * sizeof(*sparse->areas)); > - > - sparse = kzalloc(size, GFP_KERNEL); > - if (!sparse) > - return -ENOMEM; > - > - sparse->nr_areas = nr_areas; > - > - if (vdev->msix_offset & PAGE_MASK) { > - sparse->areas[i].offset = 0; > - sparse->areas[i].size = vdev->msix_offset & PAGE_MASK; > - i++; > - } > - > - if (PAGE_ALIGN(vdev->msix_offset + vdev->msix_size) < end) { > - sparse->areas[i].offset = PAGE_ALIGN(vdev->msix_offset + > - vdev->msix_size); > - sparse->areas[i].size = end - sparse->areas[i].offset; > - i++; > - } > - > - ret = vfio_info_add_capability(caps, VFIO_REGION_INFO_CAP_SPARSE_MMAP, > - sparse); > - kfree(sparse); > - > - return ret; > + return vfio_info_add_capability(caps, > + VFIO_REGION_INFO_CAP_MSIX_MAPPABLE, NULL); > } > > int vfio_pci_register_dev_region(struct vfio_pci_device *vdev, > @@ -695,7 +660,8 @@ static long vfio_pci_ioctl(void *device_data, > if (vdev->bar_mmap_supported[info.index]) { > info.flags |= VFIO_REGION_INFO_FLAG_MMAP; > if (info.index == vdev->msix_bar) { > - ret = msix_sparse_mmap_cap(vdev, &caps); > + ret = msix_sparse_msix_mmappable_cap( > + vdev, &caps); > if (ret) > return ret; > } > @@ -1125,22 +1091,6 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) > if (req_start + req_len > phys_len) > return -EINVAL; > > - if (index == vdev->msix_bar) { > - /* > - * Disallow mmaps overlapping the MSI-X table; users don't > - * get to touch this directly. We could find somewhere > - * else to map the overlap, but page granularity is only > - * a recommendation, not a requirement, so the user needs > - * to know which bits are real. Requiring them to mmap > - * around the table makes that clear. > - */ > - > - /* If neither entirely above nor below, then it overlaps */ > - if (!(req_start >= vdev->msix_offset + vdev->msix_size || > - req_start + req_len <= vdev->msix_offset)) > - return -EINVAL; > - } > - > /* > * Even though we don't make use of the barmap for the mmap, > * we need to request the region and the barmap tracks that. > diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c > index f5a86f6..2381834 100644 > --- a/drivers/vfio/vfio.c > +++ b/drivers/vfio/vfio.c > @@ -1894,21 +1894,48 @@ static int region_type_cap(struct vfio_info_cap *caps, void *cap_type) > return 0; > } > > +static int header_cap(struct vfio_info_cap *caps, int cap_type_id, > + int cap_type_ver) > +{ > + struct vfio_info_cap_header *header; > + > + header = vfio_info_cap_add(caps, sizeof(*header), cap_type_id, > + cap_type_ver); > + > + if (IS_ERR(header)) > + return PTR_ERR(header); > + > + return 0; > +} > + > int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id, > void *cap_type) > { > int ret = -EINVAL; > > - if (!cap_type) > - return 0; > - > switch (cap_type_id) { > case VFIO_REGION_INFO_CAP_SPARSE_MMAP: > - ret = sparse_mmap_cap(caps, cap_type); > + if (!cap_type) > + ret = 0; > + else > + ret = sparse_mmap_cap(caps, cap_type); > break; > > case VFIO_REGION_INFO_CAP_TYPE: > - ret = region_type_cap(caps, cap_type); > + if (!cap_type) > + ret = 0; > + else > + ret = region_type_cap(caps, cap_type); > + break; > + > + case VFIO_REGION_INFO_CAP_MSIX_MAPPABLE: > + if (!cap_type) > + ret = header_cap(caps, cap_type_id, 1); > + break; > + > + default: > + if (!cap_type) > + ret = 0; > break; > } > Gack, this is even uglier. Does this help: https://lkml.org/lkml/2017/12/12/1083