On Mon, 30 May 2016 21:06:37 +0800 Yongji Xie <xyjxie@xxxxxxxxxxxxxxxxxx> wrote: > Current vfio-pci implementation disallows to mmap > sub-page(size < PAGE_SIZE) MMIO BARs because these BARs' mmio > page may be shared with other BARs. This will cause some > performance issues when we passthrough a PCI device with > this kind of BARs. Guest will be not able to handle the mmio > accesses to the BARs which leads to mmio emulations in host. > > However, not all sub-page BARs will share page with other BARs. > We should allow to mmap the sub-page MMIO BARs which we can > make sure will not share page with other BARs. > > This patch adds support for this case. And we try to add a > dummy resource to reserve the remainder of the page which > hot-add device's BAR might be assigned into. But it's not > necessary to handle the case when the BAR is not page aligned. > Because we can't expect the BAR will be assigned into the same > location in a page in guest when we passthrough the BAR. And > it's hard to access this BAR in userspace because we have > no way to get the BAR's location in a page. > > Signed-off-by: Yongji Xie <xyjxie@xxxxxxxxxxxxxxxxxx> > --- Hi Yongji, On 5/22, message-id <201605230345.u4N3dJip043323@xxxxxxxxxxxxxxxxxxxxxxxxxx> you indicated you'd post the QEMU code which is enabled by this patch "soon". Have I missed that? I'm still waiting to see it. Thanks, Alex > drivers/vfio/pci/vfio_pci.c | 87 ++++++++++++++++++++++++++++++++--- > drivers/vfio/pci/vfio_pci_private.h | 8 ++++ > 2 files changed, 89 insertions(+), 6 deletions(-) > > diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c > index 188b1ff..3cca2a7 100644 > --- a/drivers/vfio/pci/vfio_pci.c > +++ b/drivers/vfio/pci/vfio_pci.c > @@ -110,6 +110,73 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev) > return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; > } > > +static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev) > +{ > + struct resource *res; > + int bar; > + struct vfio_pci_dummy_resource *dummy_res; > + > + INIT_LIST_HEAD(&vdev->dummy_resources_list); > + > + for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) { > + res = vdev->pdev->resource + bar; > + > + if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP)) > + goto no_mmap; > + > + if (!(res->flags & IORESOURCE_MEM)) > + goto no_mmap; > + > + /* > + * The PCI core shouldn't set up a resource with a > + * type but zero size. But there may be bugs that > + * cause us to do that. > + */ > + if (!resource_size(res)) > + goto no_mmap; > + > + if (resource_size(res) >= PAGE_SIZE) { > + vdev->bar_mmap_supported[bar] = true; > + continue; > + } > + > + if (!(res->start & ~PAGE_MASK)) { > + /* > + * Add a dummy resource to reserve the remainder > + * of the exclusive page in case that hot-add > + * device's bar is assigned into it. > + */ > + dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL); > + if (dummy_res == NULL) > + goto no_mmap; > + > + dummy_res->resource.start = res->end + 1; > + dummy_res->resource.end = res->start + PAGE_SIZE - 1; > + dummy_res->resource.flags = res->flags; > + if (request_resource(res->parent, > + &dummy_res->resource)) { > + kfree(dummy_res); > + goto no_mmap; > + } > + dummy_res->index = bar; > + list_add(&dummy_res->res_next, > + &vdev->dummy_resources_list); > + vdev->bar_mmap_supported[bar] = true; > + continue; > + } > + /* > + * Here we don't handle the case when the BAR is not page > + * aligned because we can't expect the BAR will be > + * assigned into the same location in a page in guest > + * when we passthrough the BAR. And it's hard to access > + * this BAR in userspace because we have no way to get > + * the BAR's location in a page. > + */ > +no_mmap: > + vdev->bar_mmap_supported[bar] = false; > + } > +} > + > static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev); > static void vfio_pci_disable(struct vfio_pci_device *vdev); > > @@ -218,12 +285,15 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev) > } > } > > + vfio_pci_probe_mmaps(vdev); > + > return 0; > } > > static void vfio_pci_disable(struct vfio_pci_device *vdev) > { > struct pci_dev *pdev = vdev->pdev; > + struct vfio_pci_dummy_resource *dummy_res, *tmp; > int i, bar; > > /* Stop the device from further DMA */ > @@ -252,6 +322,13 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) > vdev->barmap[bar] = NULL; > } > > + list_for_each_entry_safe(dummy_res, tmp, > + &vdev->dummy_resources_list, res_next) { > + list_del(&dummy_res->res_next); > + release_resource(&dummy_res->resource); > + kfree(dummy_res); > + } > + > vdev->needs_reset = true; > > /* > @@ -623,9 +700,7 @@ static long vfio_pci_ioctl(void *device_data, > > info.flags = VFIO_REGION_INFO_FLAG_READ | > VFIO_REGION_INFO_FLAG_WRITE; > - if (IS_ENABLED(CONFIG_VFIO_PCI_MMAP) && > - pci_resource_flags(pdev, info.index) & > - IORESOURCE_MEM && info.size >= PAGE_SIZE) { > + if (vdev->bar_mmap_supported[info.index]) { > info.flags |= VFIO_REGION_INFO_FLAG_MMAP; > if (info.index == vdev->msix_bar) { > ret = msix_sparse_mmap_cap(vdev, &caps); > @@ -1049,16 +1124,16 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) > return -EINVAL; > if (index >= VFIO_PCI_ROM_REGION_INDEX) > return -EINVAL; > - if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM)) > + if (!vdev->bar_mmap_supported[index]) > return -EINVAL; > > - phys_len = pci_resource_len(pdev, index); > + phys_len = PAGE_ALIGN(pci_resource_len(pdev, index)); > req_len = vma->vm_end - vma->vm_start; > pgoff = vma->vm_pgoff & > ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); > req_start = pgoff << PAGE_SHIFT; > > - if (phys_len < PAGE_SIZE || req_start + req_len > phys_len) > + if (req_start + req_len > phys_len) > return -EINVAL; > > if (index == vdev->msix_bar) { > diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h > index 016c14a..2128de8 100644 > --- a/drivers/vfio/pci/vfio_pci_private.h > +++ b/drivers/vfio/pci/vfio_pci_private.h > @@ -57,9 +57,16 @@ struct vfio_pci_region { > u32 flags; > }; > > +struct vfio_pci_dummy_resource { > + struct resource resource; > + int index; > + struct list_head res_next; > +}; > + > struct vfio_pci_device { > struct pci_dev *pdev; > void __iomem *barmap[PCI_STD_RESOURCE_END + 1]; > + bool bar_mmap_supported[PCI_STD_RESOURCE_END + 1]; > u8 *pci_config_map; > u8 *vconfig; > struct perm_bits *msi_perm; > @@ -88,6 +95,7 @@ struct vfio_pci_device { > int refcnt; > struct eventfd_ctx *err_trigger; > struct eventfd_ctx *req_trigger; > + struct list_head dummy_resources_list; > }; > > #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html