On Tue, 2015-10-06 at 09:39 +0000, Bhushan Bharat wrote: > > > > -----Original Message----- > > From: Alex Williamson [mailto:alex.williamson@xxxxxxxxxx] > > Sent: Tuesday, October 06, 2015 4:15 AM > > To: Bhushan Bharat-R65777 <Bharat.Bhushan@xxxxxxxxxxxxx> > > Cc: kvmarm@xxxxxxxxxxxxxxxxxxxxx; kvm@xxxxxxxxxxxxxxx; > > christoffer.dall@xxxxxxxxxx; eric.auger@xxxxxxxxxx; pranavkumar@xxxxxxxxxx; > > marc.zyngier@xxxxxxx; will.deacon@xxxxxxx > > Subject: Re: [RFC PATCH 1/6] vfio: Add interface for add/del reserved iova > > region > > > > On Mon, 2015-10-05 at 04:55 +0000, Bhushan Bharat wrote: > > > Hi Alex, > > > > > > > -----Original Message----- > > > > From: Alex Williamson [mailto:alex.williamson@xxxxxxxxxx] > > > > Sent: Saturday, October 03, 2015 4:16 AM > > > > To: Bhushan Bharat-R65777 <Bharat.Bhushan@xxxxxxxxxxxxx> > > > > Cc: kvmarm@xxxxxxxxxxxxxxxxxxxxx; kvm@xxxxxxxxxxxxxxx; > > > > christoffer.dall@xxxxxxxxxx; eric.auger@xxxxxxxxxx; > > > > pranavkumar@xxxxxxxxxx; marc.zyngier@xxxxxxx; will.deacon@xxxxxxx > > > > Subject: Re: [RFC PATCH 1/6] vfio: Add interface for add/del > > > > reserved iova region > > > > > > > > On Wed, 2015-09-30 at 20:26 +0530, Bharat Bhushan wrote: > > > > > This Patch adds the VFIO APIs to add and remove reserved iova regions. > > > > > The reserved iova region can be used for mapping some specific > > > > > physical address in iommu. > > > > > > > > > > Currently we are planning to use this interface for adding iova > > > > > regions for creating iommu of msi-pages. But the API are designed > > > > > for future extension where some other physical address can be > > mapped. > > > > > > > > > > Signed-off-by: Bharat Bhushan <Bharat.Bhushan@xxxxxxxxxxxxx> > > > > > --- > > > > > drivers/vfio/vfio_iommu_type1.c | 201 > > > > +++++++++++++++++++++++++++++++++++++++- > > > > > include/uapi/linux/vfio.h | 43 +++++++++ > > > > > 2 files changed, 243 insertions(+), 1 deletion(-) > > > > > > > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c > > > > > b/drivers/vfio/vfio_iommu_type1.c index 57d8c37..fa5d3e4 100644 > > > > > --- a/drivers/vfio/vfio_iommu_type1.c > > > > > +++ b/drivers/vfio/vfio_iommu_type1.c > > > > > @@ -59,6 +59,7 @@ struct vfio_iommu { > > > > > struct rb_root dma_list; > > > > > bool v2; > > > > > bool nesting; > > > > > + struct list_head reserved_iova_list; > > > > > > > > This alignment leads to poor packing in the structure, put it above the > > bools. > > > > > > ok > > > > > > > > > > > > }; > > > > > > > > > > struct vfio_domain { > > > > > @@ -77,6 +78,15 @@ struct vfio_dma { > > > > > int prot; /* IOMMU_READ/WRITE */ > > > > > }; > > > > > > > > > > +struct vfio_resvd_region { > > > > > + dma_addr_t iova; > > > > > + size_t size; > > > > > + int prot; /* IOMMU_READ/WRITE */ > > > > > + int refcount; /* ref count of mappings */ > > > > > + uint64_t map_paddr; /* Mapped Physical Address > > > > */ > > > > > > > > phys_addr_t > > > > > > Ok, > > > > > > > > > > > > + struct list_head next; > > > > > +}; > > > > > + > > > > > struct vfio_group { > > > > > struct iommu_group *iommu_group; > > > > > struct list_head next; > > > > > @@ -106,6 +116,38 @@ static struct vfio_dma *vfio_find_dma(struct > > > > vfio_iommu *iommu, > > > > > return NULL; > > > > > } > > > > > > > > > > +/* This function must be called with iommu->lock held */ static > > > > > +bool vfio_overlap_with_resvd_region(struct vfio_iommu *iommu, > > > > > + dma_addr_t start, size_t size) { > > > > > + struct vfio_resvd_region *region; > > > > > + > > > > > + list_for_each_entry(region, &iommu->reserved_iova_list, next) { > > > > > + if (region->iova < start) > > > > > + return (start - region->iova < region->size); > > > > > + else if (start < region->iova) > > > > > + return (region->iova - start < size); > > > > > > > > <= on both of the return lines? > > > > > > I think is should be "<" and not "=<", no ? > > > > Yep, looks like you're right. Maybe there's a more straightforward way to do > > this. > > > > > > > > > > > + > > > > > + return (region->size > 0 && size > 0); > > > > > + } > > > > > + > > > > > + return false; > > > > > +} > > > > > + > > > > > +/* This function must be called with iommu->lock held */ static > > > > > +struct vfio_resvd_region *vfio_find_resvd_region(struct > > > > > +vfio_iommu > > > > *iommu, > > > > > + dma_addr_t start, size_t > > > > size) { > > > > > + struct vfio_resvd_region *region; > > > > > + > > > > > + list_for_each_entry(region, &iommu->reserved_iova_list, next) > > > > > + if (region->iova == start && region->size == size) > > > > > + return region; > > > > > + > > > > > + return NULL; > > > > > +} > > > > > + > > > > > static void vfio_link_dma(struct vfio_iommu *iommu, struct > > > > > vfio_dma > > > > > *new) { > > > > > struct rb_node **link = &iommu->dma_list.rb_node, *parent = > > > > NULL; @@ > > > > > -580,7 +622,8 @@ static int vfio_dma_do_map(struct vfio_iommu > > > > > *iommu, > > > > > > > > > > mutex_lock(&iommu->lock); > > > > > > > > > > - if (vfio_find_dma(iommu, iova, size)) { > > > > > + if (vfio_find_dma(iommu, iova, size) || > > > > > + vfio_overlap_with_resvd_region(iommu, iova, size)) { > > > > > mutex_unlock(&iommu->lock); > > > > > return -EEXIST; > > > > > } > > > > > @@ -626,6 +669,127 @@ static int vfio_dma_do_map(struct > > vfio_iommu > > > > *iommu, > > > > > return ret; > > > > > } > > > > > > > > > > +/* This function must be called with iommu->lock held */ static > > > > > +int vfio_iommu_resvd_region_del(struct vfio_iommu *iommu, > > > > > + dma_addr_t iova, size_t size, int prot) { > > > > > + struct vfio_resvd_region *res_region; > > > > > > > > Have some consistency in naming, just use "region". > > > > > > Ok, > > > > > > > > + > > > > > + res_region = vfio_find_resvd_region(iommu, iova, size); > > > > > + /* Region should not be mapped in iommu */ > > > > > + if (res_region == NULL || res_region->map_paddr) > > > > > + return -EINVAL; > > > > > > > > Are these two separate errors? !region is -EINVAL, but being mapped > > > > is - EBUSY. > > > > > > Yes, will separate them. > > > > > > > > > > > > + > > > > > + list_del(&res_region->next); > > > > > + kfree(res_region); > > > > > + return 0; > > > > > +} > > > > > + > > > > > +/* This function must be called with iommu->lock held */ static > > > > > +int vfio_iommu_resvd_region_add(struct vfio_iommu *iommu, > > > > > + dma_addr_t iova, size_t size, int prot) { > > > > > + struct vfio_resvd_region *res_region; > > > > > + > > > > > + /* Check overlap with with dma maping and reserved regions */ > > > > > + if (vfio_find_dma(iommu, iova, size) || > > > > > + vfio_find_resvd_region(iommu, iova, size)) > > > > > + return -EEXIST; > > > > > + > > > > > + res_region = kzalloc(sizeof(*res_region), GFP_KERNEL); > > > > > + if (res_region == NULL) > > > > > + return -ENOMEM; > > > > > + > > > > > + res_region->iova = iova; > > > > > + res_region->size = size; > > > > > + res_region->prot = prot; > > > > > + res_region->refcount = 0; > > > > > + res_region->map_paddr = 0; > > > > > > > > They're already 0 by the kzalloc > > > > > > Yes ;) > > > > > > > > > + > > > > > + list_add(&res_region->next, &iommu->reserved_iova_list); > > > > > + > > > > > + return 0; > > > > > +} > > > > > + > > > > > +static > > > > > +int vfio_handle_reserved_region_add(struct vfio_iommu *iommu, > > > > > + struct vfio_iommu_reserved_region_add > > > > *region) { > > > > > + dma_addr_t iova = region->iova; > > > > > + size_t size = region->size; > > > > > + int flags = region->flags; > > > > > + uint64_t mask; > > > > > + int prot = 0; > > > > > + int ret; > > > > > + > > > > > + /* Verify that none of our __u64 fields overflow */ > > > > > + if (region->size != size || region->iova != iova) > > > > > + return -EINVAL; > > > > > + > > > > > + mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1; > > > > > + > > > > > + WARN_ON(mask & PAGE_MASK); > > > > > + > > > > > + if (flags & VFIO_IOMMU_RES_REGION_READ) > > > > > + prot |= IOMMU_READ; > > > > > + if (flags & VFIO_IOMMU_RES_REGION_WRITE) > > > > > + prot |= IOMMU_WRITE; > > > > > + > > > > > + if (!prot || !size || (size | iova) & mask) > > > > > + return -EINVAL; > > > > > + > > > > > + /* Don't allow IOVA wrap */ > > > > > + if (iova + size - 1 < iova) > > > > > + return -EINVAL; > > > > > + > > > > > + mutex_lock(&iommu->lock); > > > > > + > > > > > + if (region->flags & VFIO_IOMMU_RES_REGION_ADD) { > > > > > + ret = vfio_iommu_resvd_region_add(iommu, iova, size, > > > > prot); > > > > > + if (ret) { > > > > > + mutex_unlock(&iommu->lock); > > > > > + return ret; > > > > > + } > > > > > + } > > > > > > > > Silently fail if not VFIO_IOMMU_RES_REGION_ADD? > > > > > > As per below comment we do not need this flag. So the above flag > > checking will be removed. > > > > > > > > > > > > + > > > > > + mutex_unlock(&iommu->lock); > > > > > + return 0; > > > > > +} > > > > > + > > > > > +static > > > > > +int vfio_handle_reserved_region_del(struct vfio_iommu *iommu, > > > > > + struct vfio_iommu_reserved_region_del > > > > *region) { > > > > > + dma_addr_t iova = region->iova; > > > > > + size_t size = region->size; > > > > > + int flags = region->flags; > > > > > + int ret; > > > > > + > > > > > + if (!(flags & VFIO_IOMMU_RES_REGION_DEL)) > > > > > + return -EINVAL; > > > > > + > > > > > + mutex_lock(&iommu->lock); > > > > > + > > > > > + /* Check for the region */ > > > > > + if (vfio_find_resvd_region(iommu, iova, size) == NULL) { > > > > > + mutex_unlock(&iommu->lock); > > > > > + return -EINVAL; > > > > > + } > > > > > + > > > > > + /* remove the reserved region */ > > > > > + if (region->flags & VFIO_IOMMU_RES_REGION_DEL) { > > > > > + ret = vfio_iommu_resvd_region_del(iommu, iova, size, > > > > flags); > > > > > + if (ret) { > > > > > + mutex_unlock(&iommu->lock); > > > > > + return ret; > > > > > + } > > > > > + } > > > > > + > > > > > + mutex_unlock(&iommu->lock); > > > > > + return 0; > > > > > +} > > > > > + > > > > > static int vfio_bus_type(struct device *dev, void *data) { > > > > > struct bus_type **bus = data; > > > > > @@ -905,6 +1069,7 @@ static void > > *vfio_iommu_type1_open(unsigned > > > > long arg) > > > > > } > > > > > > > > > > INIT_LIST_HEAD(&iommu->domain_list); > > > > > + INIT_LIST_HEAD(&iommu->reserved_iova_list); > > > > > iommu->dma_list = RB_ROOT; > > > > > mutex_init(&iommu->lock); > > > > > > > > > > @@ -1020,6 +1185,40 @@ static long vfio_iommu_type1_ioctl(void > > > > *iommu_data, > > > > > return ret; > > > > > > > > > > return copy_to_user((void __user *)arg, &unmap, minsz); > > > > > + } else if (cmd == VFIO_IOMMU_RESERVED_REGION_ADD) { > > > > > + struct vfio_iommu_reserved_region_add region; > > > > > + long ret; > > > > > + > > > > > + minsz = offsetofend(struct > > > > vfio_iommu_reserved_region_add, > > > > > + size); > > > > > + if (copy_from_user(®ion, (void __user *)arg, minsz)) > > > > > + return -EFAULT; > > > > > + > > > > > + if (region.argsz < minsz) > > > > > + return -EINVAL; > > > > > + > > > > > + ret = vfio_handle_reserved_region_add(iommu, ®ion); > > > > > + if (ret) > > > > > + return ret; > > > > > + > > > > > + return copy_to_user((void __user *)arg, ®ion, minsz); > > > > > + } else if (cmd == VFIO_IOMMU_RESERVED_REGION_DEL) { > > > > > + struct vfio_iommu_reserved_region_del region; > > > > > + long ret; > > > > > + > > > > > + minsz = offsetofend(struct > > > > vfio_iommu_reserved_region_del, > > > > > + size); > > > > > + if (copy_from_user(®ion, (void __user *)arg, minsz)) > > > > > + return -EFAULT; > > > > > + > > > > > + if (region.argsz < minsz) > > > > > + return -EINVAL; > > > > > + > > > > > + ret = vfio_handle_reserved_region_del(iommu, ®ion); > > > > > + if (ret) > > > > > + return ret; > > > > > + > > > > > + return copy_to_user((void __user *)arg, ®ion, minsz); > > > > > > > > So we've just created an interface that is available for all > > > > vfio-type1 users, whether it makes any sense for the platform or > > > > not, > > > > > > How we should decide that a given platform needs this or not? > > > > You later add new iommu interfaces, presumably if the iommu doesn't > > implement those interfaces then there's no point in us exposing these > > interfaces to vfio. > > You mean if an iommu says "does not requires explicit-iommu-mapping MSIs" then if user-space calls these reserved-iova ioctls then we should return error without adding these regions, right? Yes, what would it mean for the user to add reserved regions if we have no means to use them? > > > > and it allows the user to > > > > consume arbitrary amounts of kernel memory, by making an infinitely > > > > long list of reserved iova entries, brilliant! > > > > > > I was not sure of how to limit the user. What I was thinking of having a > > default number of pages a user can reserve (512 pages). Also we can give a > > sysfs interface so that user can change the default number of pages. Does > > this sound good? If not please suggest? > > > > Isn't 512 entries a lot for a linked list? > > Can we use our existing rb tree to manage these entries rather than a secondary list? > > I do not think so, it will complicate the code. That's not a good answer. Is a flag on the rb entry insufficient? Why is that more complicated than a completely separate list? We can never have reserved mappings and dma mappings occupy the same IOVA space and the rb tree is tracking iova space, so it seems like a natural place to track both dma and reserved IOVA. > > How many entries do we realistically need? > > I think this should be small number of entries as discussed on other patch. Small number means 1 or 2 max on PowerPC and smmu (I think so). > > > Can the iommu callbacks help give us a limit? > > I am not sure right now how deterministically we can get number of regions/msi-pages needed for a given platform and for a list of devices in a group? > > > Can we somehow use information about the devices in the group to produce a limit, > > ie. MSI vectors possible from the group? > > It seems like we need to know number of vectors needed per device in the group and we needed to know how many vectors can supported using the reserved msi-page. We are sharing a msi-page then it becomes a little more complicated. But overall it seems like we need a 2-3 or very small number of regions and I will suggest that we can go with this small number can enhance if required later on. > > > > > > > > > > > > } > > > > > > > > > > return -ENOTTY; > > > > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > > > > > index b57b750..1abd1a9 100644 > > > > > --- a/include/uapi/linux/vfio.h > > > > > +++ b/include/uapi/linux/vfio.h > > > > > @@ -440,6 +440,49 @@ struct vfio_iommu_type1_dma_unmap { > > > > > #define VFIO_IOMMU_ENABLE _IO(VFIO_TYPE, VFIO_BASE + 15) > > > > > #define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16) > > > > > > > > > > +/**************** Reserved IOVA region specific APIs > > > > > +**********************/ > > > > > + > > > > > +/* > > > > > + * VFIO_IOMMU_RESERVED_REGION_ADD - _IO(VFIO_TYPE, > > VFIO_BASE > > > > + 17, > > > > > + * struct > > > > vfio_iommu_reserved_region_add) > > > > > + * This is used to add a reserved iova region. > > > > > + * @flags - Input: VFIO_IOMMU_RES_REGION_ADD flag is for adding > > > > > + * a reserved region. > > > > > > > > Why else would we call VFIO_IOMMU_RESERVED_REGION_ADD except > > to add > > > > a region, this flag is redundant. > > > > > > Ok, will remove this. > > > > > > > > > > > > + * Also pass READ/WRITE/IOMMU flags to be used in iommu mapping. > > > > > + * @iova - Input: IOVA base address of reserved region > > > > > + * @size - Input: Size of the reserved region > > > > > + * Return: 0 on success, -errno on failure */ struct > > > > > +vfio_iommu_reserved_region_add { > > > > > + __u32 argsz; > > > > > + __u32 flags; > > > > > +#define VFIO_IOMMU_RES_REGION_ADD (1 << 0) /* Add a > > reserved > > > > region */ > > > > > +#define VFIO_IOMMU_RES_REGION_READ (1 << 1) /* readable > > region */ > > > > > +#define VFIO_IOMMU_RES_REGION_WRITE (1 << 2) /* writable > > > > region */ > > > > > + __u64 iova; > > > > > + __u64 size; > > > > > +}; > > > > > +#define VFIO_IOMMU_RESERVED_REGION_ADD _IO(VFIO_TYPE, > > > > VFIO_BASE + 17) > > > > > + > > > > > +/* > > > > > + * VFIO_IOMMU_RESERVED_REGION_DEL - _IO(VFIO_TYPE, > > VFIO_BASE + > > > > 18, > > > > > + * struct > > > > vfio_iommu_reserved_region_del) > > > > > + * This is used to delete an existing reserved iova region. > > > > > + * @flags - VFIO_IOMMU_RES_REGION_DEL flag is for deleting a > > > > > +region use, > > > > > + * only a unmapped region can be deleted. > > > > > + * @iova - Input: IOVA base address of reserved region > > > > > + * @size - Input: Size of the reserved region > > > > > + * Return: 0 on success, -errno on failure */ struct > > > > > +vfio_iommu_reserved_region_del { > > > > > + __u32 argsz; > > > > > + __u32 flags; > > > > > +#define VFIO_IOMMU_RES_REGION_DEL (1 << 0) /* unset the > > > > reserved region */ > > > > > + __u64 iova; > > > > > + __u64 size; > > > > > +}; > > > > > +#define VFIO_IOMMU_RESERVED_REGION_DEL _IO(VFIO_TYPE, > > > > VFIO_BASE + 18) > > > > > + > > > > > > > > These are effectively both > > > > > > > > struct vfio_iommu_type1_dma_unmap > > > > > > Yes, do you want to suggest that we should use " struct > > vfio_iommu_type1_dma_unmap". I found that confusing. > > > What is we use "struct vfio_iommu_reserved_region" and use flag > > VFIO_IOMMU_RES_REGION_DEL/ VFIO_IOMMU_RES_REGION_ADD. > > > > What if we just use the existing map and unmap interface with a flag to > > indicate an MSI reserved mapping? I don't really see why we need new ioctls > > for this. > > Existing map/unmap APIs uses virtual address, user-space mmap() and provide virtual address as well. I think in previous discussion you commented that let's keep then separate, over-riding complicates the map/unmap and can be cause of confusion, and keeping these separate also gives space to extend these reserved iova APIs for something else in future (do not know for what right now). I personally think that it make sense to keep them separate, let me know if you think otherwise It seems pretty simple for a userspace app to know that the virtual address isn't used when doing a mapping with MSI_RESERVED set. The proposed API doesn't allow any room for extension because it's completely unspecified what the reserved region is intended for. Here we use them for MSI, but what else are we allowed to use them for? How would the user specify on reserved type vs another in a future extension? -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html