> -----Original Message----- > From: Alex Williamson [mailto:alex.williamson@xxxxxxxxxx] > Sent: Saturday, October 03, 2015 4:16 AM > To: Bhushan Bharat-R65777 <Bharat.Bhushan@xxxxxxxxxxxxx> > Cc: kvmarm@xxxxxxxxxxxxxxxxxxxxx; kvm@xxxxxxxxxxxxxxx; > christoffer.dall@xxxxxxxxxx; eric.auger@xxxxxxxxxx; pranavkumar@xxxxxxxxxx; > marc.zyngier@xxxxxxx; will.deacon@xxxxxxx > Subject: Re: [RFC PATCH 4/6] vfio: Add interface to iommu-map/unmap MSI > pages > > On Wed, 2015-09-30 at 20:26 +0530, Bharat Bhushan wrote: > > For MSI interrupts to work for a pass-through devices we need to have > > mapping of msi-pages in iommu. Now on some platforms (like x86) does > > this msi-pages mapping happens magically and in these case they > > chooses an iova which they somehow know that it will never overlap > > with guest memory. But this magic iova selection may not be always > > true for all platform (like PowerPC and ARM64). > > > > Also on x86 platform, there is no problem as long as running a > > x86-guest on x86-host but there can be issues when running a non-x86 > > guest on > > x86 host or other userspace applications like (I think ODP/DPDK). > > As in these cases there can be chances that it overlaps with guest > > memory mapping. > > Wow, it's amazing anything works... smoke and mirrors. > > > This patch add interface to iommu-map and iommu-unmap msi-pages at > > reserved iova chosen by userspace. > > > > Signed-off-by: Bharat Bhushan <Bharat.Bhushan@xxxxxxxxxxxxx> > > --- > > drivers/vfio/vfio.c | 52 +++++++++++++++++++ > > drivers/vfio/vfio_iommu_type1.c | 111 > ++++++++++++++++++++++++++++++++++++++++ > > include/linux/vfio.h | 9 +++- > > 3 files changed, 171 insertions(+), 1 deletion(-) > > > > diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index > > 2fb29df..a817d2d 100644 > > --- a/drivers/vfio/vfio.c > > +++ b/drivers/vfio/vfio.c > > @@ -605,6 +605,58 @@ static int vfio_iommu_group_notifier(struct > notifier_block *nb, > > return NOTIFY_OK; > > } > > > > +int vfio_device_map_msi(struct vfio_device *device, uint64_t msi_addr, > > + uint32_t size, uint64_t *msi_iova) { > > + struct vfio_container *container = device->group->container; > > + struct vfio_iommu_driver *driver; > > + int ret; > > + > > + /* Validate address and size */ > > + if (!msi_addr || !size || !msi_iova) > > + return -EINVAL; > > + > > + down_read(&container->group_lock); > > + > > + driver = container->iommu_driver; > > + if (!driver || !driver->ops || !driver->ops->msi_map) { > > + up_read(&container->group_lock); > > + return -EINVAL; > > + } > > + > > + ret = driver->ops->msi_map(container->iommu_data, > > + msi_addr, size, msi_iova); > > + > > + up_read(&container->group_lock); > > + return ret; > > +} > > + > > +int vfio_device_unmap_msi(struct vfio_device *device, uint64_t > msi_iova, > > + uint64_t size) > > +{ > > + struct vfio_container *container = device->group->container; > > + struct vfio_iommu_driver *driver; > > + int ret; > > + > > + /* Validate address and size */ > > + if (!msi_iova || !size) > > + return -EINVAL; > > + > > + down_read(&container->group_lock); > > + > > + driver = container->iommu_driver; > > + if (!driver || !driver->ops || !driver->ops->msi_unmap) { > > + up_read(&container->group_lock); > > + return -EINVAL; > > + } > > + > > + ret = driver->ops->msi_unmap(container->iommu_data, > > + msi_iova, size); > > + > > + up_read(&container->group_lock); > > + return ret; > > +} > > + > > /** > > * VFIO driver API > > */ > > diff --git a/drivers/vfio/vfio_iommu_type1.c > > b/drivers/vfio/vfio_iommu_type1.c index 3315fb6..ab376c2 100644 > > --- a/drivers/vfio/vfio_iommu_type1.c > > +++ b/drivers/vfio/vfio_iommu_type1.c > > @@ -1003,12 +1003,34 @@ out_free: > > return ret; > > } > > > > +static void vfio_iommu_unmap_all_reserved_regions(struct vfio_iommu > > +*iommu) { > > + struct vfio_resvd_region *region; > > + struct vfio_domain *d; > > + > > + list_for_each_entry(region, &iommu->reserved_iova_list, next) { > > + list_for_each_entry(d, &iommu->domain_list, next) { > > + if (!region->map_paddr) > > + continue; > > + > > + if (!iommu_iova_to_phys(d->domain, region->iova)) > > + continue; > > + > > + iommu_unmap(d->domain, region->iova, > PAGE_SIZE); > > PAGE_SIZE? Why not region->size? Yes, this should be region->size. > > > + region->map_paddr = 0; > > + cond_resched(); > > + } > > + } > > +} > > + > > static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu) { > > struct rb_node *node; > > > > while ((node = rb_first(&iommu->dma_list))) > > vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, > node)); > > + > > + vfio_iommu_unmap_all_reserved_regions(iommu); > > } > > > > static void vfio_iommu_type1_detach_group(void *iommu_data, @@ > > -1048,6 +1070,93 @@ done: > > mutex_unlock(&iommu->lock); > > } > > > > +static int vfio_iommu_type1_msi_map(void *iommu_data, uint64_t > msi_addr, > > + uint64_t size, uint64_t *msi_iova) { > > + struct vfio_iommu *iommu = iommu_data; > > + struct vfio_resvd_region *region; > > + int ret; > > + > > + mutex_lock(&iommu->lock); > > + > > + /* Do not try ceate iommu-mapping if msi reconfig not allowed */ > > + if (!iommu->allow_msi_reconfig) { > > + mutex_unlock(&iommu->lock); > > + return 0; > > + } > > + > > + /* Check if there is already region mapping the msi page */ > > + list_for_each_entry(region, &iommu->reserved_iova_list, next) { > > + if (region->map_paddr == msi_addr) { > > + *msi_iova = region->iova; > > + region->refcount++; > > + mutex_unlock(&iommu->lock); > > + return 0; > > + } > > + } > > + > > + /* Get a unmapped reserved region */ > > + list_for_each_entry(region, &iommu->reserved_iova_list, next) { > > + if (!region->map_paddr) > > + break; > > + } > > + > > + if (region == NULL) { > > + mutex_unlock(&iommu->lock); > > + return -ENODEV; > > + } > > + > > + ret = vfio_iommu_map(iommu, region->iova, msi_addr >> > PAGE_SHIFT, > > + size >> PAGE_SHIFT, region->prot); > > So the reserved region has a size and the msi mapping has a size and we > arbitrarily decide to use the msi mapping size here? Reserved region interface is generic and user can set reserved region of any size (multiple of page-size). But we do not want to create MSI address mapping beyond the MSI-page otherwise this can be security issue. But I think I am not tracking how much reserved iova region is mapped, so unmap is called for same size. > The overlap checks we've done for the reserved region are meaningless then. No wonder > you're unmapping with PAGE_SIZE, we have no idea. Do you think we should divide the reserved region in pages and track map/unmap per page? > > > + if (ret) { > > + mutex_unlock(&iommu->lock); > > + return ret; > > + } > > + > > + region->map_paddr = msi_addr; > > Is there some sort of implied page alignment with msi_addr? I could pass 0x0 > for one call, 0x1 for another and due to the mapping shift, get two reserved > IOVAs pointing at the same msi page. Page size alignment. Will have a check for alignment. > > > + *msi_iova = region->iova; > > + region->refcount++; > > + > > + mutex_unlock(&iommu->lock); > > + > > + return 0; > > +} > > + > > +static int vfio_iommu_type1_msi_unmap(void *iommu_data, uint64_t > iova, > > + uint64_t size) > > +{ > > + struct vfio_iommu *iommu = iommu_data; > > + struct vfio_resvd_region *region; > > + struct vfio_domain *d; > > + > > + mutex_lock(&iommu->lock); > > + > > + /* find the region mapping the msi page */ > > + list_for_each_entry(region, &iommu->reserved_iova_list, next) > > + if (region->iova == iova) > > + break; > > + > > + if (region == NULL || region->refcount <= 0) { > > + mutex_unlock(&iommu->lock); > > + return -EINVAL; > > + } > > + > > + region->refcount--; > > + if (!region->refcount) { > > + list_for_each_entry(d, &iommu->domain_list, next) { > > + if (!iommu_iova_to_phys(d->domain, iova)) > > + continue; > > + > > + iommu_unmap(d->domain, iova, size); > > And here we're just trusting that the unmap was the same size as the map? > > > + cond_resched(); > > + } > > + } > > + region->map_paddr = 0; > > + > > + mutex_unlock(&iommu->lock); > > + return 0; > > +} > > + > > static void *vfio_iommu_type1_open(unsigned long arg) { > > struct vfio_iommu *iommu; > > @@ -1264,6 +1373,8 @@ static const struct vfio_iommu_driver_ops > vfio_iommu_driver_ops_type1 = { > > .ioctl = vfio_iommu_type1_ioctl, > > .attach_group = vfio_iommu_type1_attach_group, > > .detach_group = vfio_iommu_type1_detach_group, > > + .msi_map = vfio_iommu_type1_msi_map, > > + .msi_unmap = vfio_iommu_type1_msi_unmap, > > }; > > > > static int __init vfio_iommu_type1_init(void) diff --git > > a/include/linux/vfio.h b/include/linux/vfio.h index ddb4409..b91085d > > 100644 > > --- a/include/linux/vfio.h > > +++ b/include/linux/vfio.h > > @@ -52,6 +52,10 @@ extern void *vfio_del_group_dev(struct device > > *dev); extern struct vfio_device *vfio_device_get_from_dev(struct > > device *dev); extern void vfio_device_put(struct vfio_device > > *device); extern void *vfio_device_data(struct vfio_device *device); > > +extern int vfio_device_map_msi(struct vfio_device *device, uint64_t > msi_addr, > > + uint32_t size, uint64_t *msi_iova); int > > +vfio_device_unmap_msi(struct vfio_device *device, uint64_t msi_iova, > > + uint64_t size); > > > > /** > > * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks @@ > > -72,7 +76,10 @@ struct vfio_iommu_driver_ops { > > struct iommu_group *group); > > void (*detach_group)(void *iommu_data, > > struct iommu_group *group); > > - > > + int (*msi_map)(void *iommu_data, uint64_t msi_addr, > > + uint64_t size, uint64_t *msi_iova); > > + int (*msi_unmap)(void *iommu_data, uint64_t > msi_iova, > > + uint64_t size); > > }; > > > > extern int vfio_register_iommu_driver(const struct > > vfio_iommu_driver_ops *ops); > > How did this patch solve any of the problems outlined in the commit log? Problem outlined in the commit log is not solved by this patch but I think we want to solve this by the patch series. I will more the problem to cover-letter. Thanks -Bharat ��.n��������+%������w��{.n�����o�^n�r������&��z�ޗ�zf���h���~����������_��+v���)ߣ�