On Wed, 26 Oct 2016 16:56:13 +0800 Yongji Xie <xyjxie@xxxxxxxxxxxxxxxxxx> wrote: > Now the kernel commit 05f0c03fbac1 ("vfio-pci: Allow to mmap > sub-page MMIO BARs if the mmio page is exclusive") allows VFIO > to mmap sub-page BARs. This is the corresponding QEMU patch. > With those patches applied, we could passthrough sub-page BARs > to guest, which can help to improve IO performance for some devices. > > In this patch, we expand MemoryRegions of these sub-page > MMIO BARs to PAGE_SIZE in vfio_pci_write_config(), so that > the BARs could be passed to KVM ioctl KVM_SET_USER_MEMORY_REGION > with a valid size. The expanding size will be recovered when > the base address of sub-page BAR is changed and not page aligned > any more in guest. And we also set the priority of these BARs' > memory regions to zero in case of overlap with BARs which share > the same page with sub-page BARs in guest. > > Signed-off-by: Yongji Xie <xyjxie@xxxxxxxxxxxxxxxxxx> > --- > hw/vfio/common.c | 3 +-- > hw/vfio/pci.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 78 insertions(+), 2 deletions(-) > > diff --git a/hw/vfio/common.c b/hw/vfio/common.c > index 9505fb3..6e48f98 100644 > --- a/hw/vfio/common.c > +++ b/hw/vfio/common.c > @@ -662,8 +662,7 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region, > region, name, region->size); > > if (!vbasedev->no_mmap && > - region->flags & VFIO_REGION_INFO_FLAG_MMAP && > - !(region->size & ~qemu_real_host_page_mask)) { > + region->flags & VFIO_REGION_INFO_FLAG_MMAP) { > > vfio_setup_region_sparse_mmaps(region, info); > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > index 65d30fd..0516e62 100644 > --- a/hw/vfio/pci.c > +++ b/hw/vfio/pci.c > @@ -1071,6 +1071,65 @@ static const MemoryRegionOps vfio_vga_ops = { > }; > > /* > + * Expand memory region of sub-page(size < PAGE_SIZE) MMIO BAR to page > + * size if the BAR is in an exclusive page in host so that we could map > + * this BAR to guest. But this sub-page BAR may not occupy an exclusive > + * page in guest. So we should set the priority of the expanded memory > + * region to zero in case of overlap with BARs which share the same page > + * with the sub-page BAR in guest. Besides, we should also recover the > + * size of this sub-page BAR when its base address is changed in guest > + * and not page aligned any more. > + */ > +static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar) > +{ > + VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); > + VFIORegion *region = &vdev->bars[bar].region; > + MemoryRegion *mmap_mr, *mr; > + PCIIORegion *r; > + pcibus_t bar_addr; > + > + /* Make sure that the whole region is allowed to be mmapped */ > + if (!(region->nr_mmaps == 1 && > + region->mmaps[0].size == region->size)) { Isn't region->size equal to the BAR size, which is less than PAGE_SIZE? Doesn't that mean that the mmap(2) was performed with a length val less than a page size? Isn't that going to fail? We could also test mmaps[0].mmap here instead of burying it below. > + return; > + } > + > + r = &pdev->io_regions[bar]; > + bar_addr = r->addr; > + if (bar_addr == PCI_BAR_UNMAPPED) { > + return; > + } See below, why not reset sizes on this case? > + > + mr = region->mem; > + mmap_mr = ®ion->mmaps[0].mem; > + memory_region_transaction_begin(); > + if (memory_region_size(mr) < qemu_real_host_page_size) { > + if (!(bar_addr & ~qemu_real_host_page_mask) && > + memory_region_is_mapped(mr) && region->mmaps[0].mmap) { > + /* Expand memory region to page size and set priority */ > + memory_region_del_subregion(r->address_space, mr); > + memory_region_set_size(mr, qemu_real_host_page_size); > + memory_region_set_size(mmap_mr, qemu_real_host_page_size); > + memory_region_add_subregion_overlap(r->address_space, > + bar_addr, mr, 0); > + } > + } else { > + /* This case would happen when guest rescan one PCI device */ > + if (bar_addr & ~qemu_real_host_page_mask) { > + /* Recover the size of memory region */ > + memory_region_set_size(mr, r->size); > + memory_region_set_size(mmap_mr, r->size); > + } else if (memory_region_is_mapped(mr)) { > + /* Set the priority of memory region to zero */ > + memory_region_del_subregion(r->address_space, mr); > + memory_region_add_subregion_overlap(r->address_space, > + bar_addr, mr, 0); > + } > + } > + memory_region_transaction_commit(); The logic flow here is confusing to me, too many cases that presume the previous state of the device. Can't we *always* set the memory region sizes based on the value written to the BAR? And can't we *always* re-prioritize regions when we have a mapped MemoryRegion for which we've modified the size? Something like below (untested) makes a lot more sense to me: diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 03e3c94..d7dbe0e 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -1087,45 +1087,35 @@ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar) MemoryRegion *mmap_mr, *mr; PCIIORegion *r; pcibus_t bar_addr; + uint64_t size = region->size; /* Make sure that the whole region is allowed to be mmapped */ - if (!(region->nr_mmaps == 1 && - region->mmaps[0].size == region->size)) { + if (region->nr_mmaps != 1 || !region->mmaps[0].mmap || + region->mmaps[0].size != region->size) { return; } r = &pdev->io_regions[bar]; bar_addr = r->addr; - if (bar_addr == PCI_BAR_UNMAPPED) { - return; - } - mr = region->mem; mmap_mr = ®ion->mmaps[0].mem; + + /* If BAR is mapped and page aligned, update to fill PAGE_SIZE */ + if (bar_addr != PCI_BAR_UNMAPPED && + !(bar_addr & ~qemu_real_host_page_mask)) { + size = qemu_real_host_page_size; + } + memory_region_transaction_begin(); - if (memory_region_size(mr) < qemu_real_host_page_size) { - if (!(bar_addr & ~qemu_real_host_page_mask) && - memory_region_is_mapped(mr) && region->mmaps[0].mmap) { - /* Expand memory region to page size and set priority */ - memory_region_del_subregion(r->address_space, mr); - memory_region_set_size(mr, qemu_real_host_page_size); - memory_region_set_size(mmap_mr, qemu_real_host_page_size); - memory_region_add_subregion_overlap(r->address_space, - bar_addr, mr, 0); - } - } else { - /* This case would happen when guest rescan one PCI device */ - if (bar_addr & ~qemu_real_host_page_mask) { - /* Recover the size of memory region */ - memory_region_set_size(mr, r->size); - memory_region_set_size(mmap_mr, r->size); - } else if (memory_region_is_mapped(mr)) { - /* Set the priority of memory region to zero */ - memory_region_del_subregion(r->address_space, mr); - memory_region_add_subregion_overlap(r->address_space, - bar_addr, mr, 0); - } + + memory_region_set_size(mr, size); + memory_region_set_size(mmap_mr, size); + if (size != region->size && memory_region_is_mapped(mr)) { + memory_region_del_subregion(r->address_space, mr); + memory_region_add_subregion_overlap(r->address_space, + bar_addr, mr, 0); } + memory_region_transaction_commit(); } Please note that QEMU 2.8 goes into freeze on Nov-1, we need to rev quickly to get this in. Thanks, Alex > +} > + > +/* > * PCI config space > */ > uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) > @@ -1153,6 +1212,24 @@ void vfio_pci_write_config(PCIDevice *pdev, > } else if (was_enabled && !is_enabled) { > vfio_msix_disable(vdev); > } > + } else if (ranges_overlap(addr, len, PCI_BASE_ADDRESS_0, 24) || > + range_covers_byte(addr, len, PCI_COMMAND)) { > + pcibus_t old_addr[PCI_NUM_REGIONS - 1]; > + int bar; > + > + for (bar = 0; bar < PCI_ROM_SLOT; bar++) { > + old_addr[bar] = pdev->io_regions[bar].addr; > + } > + > + pci_default_write_config(pdev, addr, val, len); > + > + for (bar = 0; bar < PCI_ROM_SLOT; bar++) { > + if (old_addr[bar] != pdev->io_regions[bar].addr && > + pdev->io_regions[bar].size > 0 && > + pdev->io_regions[bar].size < qemu_real_host_page_size) { > + vfio_sub_page_bar_update_mapping(pdev, bar); > + } > + } > } else { > /* Write everything to QEMU to keep emulated bits correct */ > pci_default_write_config(pdev, addr, val, len); -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html