Hi Alex, > -----Original Message----- > From: Alex Williamson [mailto:alex.williamson@xxxxxxxxxx] > Sent: Thursday, December 07, 2017 4:08 PM > To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@xxxxxxxxxx> > Cc: eric.auger@xxxxxxxxxx; pmorel@xxxxxxxxxxxxxxxxxx; > kvm@xxxxxxxxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx; Linuxarm > <linuxarm@xxxxxxxxxx> > Subject: Re: [RFC] vfio/type1: Add IOVA_RANGE capability support > > On Wed, 6 Dec 2017 16:07:36 +0000 > Shameer Kolothum <shameerali.kolothum.thodi@xxxxxxxxxx> wrote: > > > This patch allows the user-space to retrieve the supported > > IOVA range(s), excluding any reserved regions. The implementation > > is based on capability chains, added to the VFIO_IOMMU_GET_INFO ioctl. > > > > This is following the discussions here[1] and is based on the RFC patch[2]. > > > > ToDo: > > - This currently derives the default supported iova range from the first > > iommu domain. This needs to be changed to go through the domain_list > > instead. > > - Sync with Pierre's patch[3]. > > > > 1.https://lists.gnu.org/archive/html/qemu-devel/2017-11/msg03651.html > > 2.https://lists.linuxfoundation.org/pipermail/iommu/2016- > November/019002.html > > 3.https://patchwork.kernel.org/patch/10084655/ > > > > Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@xxxxxxxxxx> > > --- > > drivers/vfio/vfio_iommu_type1.c | 172 > +++++++++++++++++++++++++++++++++++++++- > > include/uapi/linux/vfio.h | 13 +++ > > 2 files changed, 184 insertions(+), 1 deletion(-) > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c > b/drivers/vfio/vfio_iommu_type1.c > > index e30e29a..72ca78a 100644 > > --- a/drivers/vfio/vfio_iommu_type1.c > > +++ b/drivers/vfio/vfio_iommu_type1.c > > @@ -28,6 +28,7 @@ > > #include <linux/device.h> > > #include <linux/fs.h> > > #include <linux/iommu.h> > > +#include <linux/list_sort.h> > > #include <linux/module.h> > > #include <linux/mm.h> > > #include <linux/rbtree.h> > > @@ -92,6 +93,12 @@ struct vfio_group { > > struct list_head next; > > }; > > > > +struct vfio_iommu_iova { > > + struct list_head list; > > + phys_addr_t start; > > + phys_addr_t end; > > +}; > > + > > /* > > * Guest RAM pinning working set or DMA target > > */ > > @@ -1537,6 +1544,144 @@ static int > vfio_domains_have_iommu_cache(struct vfio_iommu *iommu) > > return ret; > > } > > > > +static int vfio_add_iova_cap(struct vfio_info_cap *caps, u64 start, u64 end) > > +{ > > + struct vfio_iommu_type1_info_cap_iova_range *cap; > > + struct vfio_info_cap_header *header; > > + > > + header = vfio_info_cap_add(caps, sizeof(*cap), > > + VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1); > > + if (IS_ERR(header)) > > + return PTR_ERR(header); > > + > > + cap = container_of(header, > > + struct vfio_iommu_type1_info_cap_iova_range, > > + header); > > + > > + cap->start = start; > > + cap->end = end; > > + > > + return 0; > > +} > > + > > +static int vfio_insert_iova(phys_addr_t start, phys_addr_t end, > > + struct list_head *head) > > +{ > > + struct vfio_iommu_iova *region; > > + > > + region = kzalloc(sizeof(*region), GFP_KERNEL); > > You're initializing every field, so a zero'd allocation is not > necessary here. Ok. > > + if (!region) > > + return -ENOMEM; > > + > > + INIT_LIST_HEAD(®ion->list); > > + region->start = start; > > + region->end = end; > > + > > + list_add_tail(®ion->list, head); > > + return 0; > > +} > > + > > +/* > > + * Check and update iova region list in case a reserved region > > + * overlaps the iommu iova range. > > + */ > > +static int vfio_update_iommu_iova_range(phys_addr_t start, phys_addr_t > end, > > + struct list_head *iova) > > +{ > > + struct vfio_iommu_iova *node; > > + phys_addr_t a, b; > > + int ret = 0; > > + > > + if (list_empty(iova)) > > + return -ENODEV; > > + > > + node = list_last_entry(iova, struct vfio_iommu_iova, list); > > + a = node->start; > > + b = node->end; > > + > > + /* No overlap */ > > + if ((start > b) || (end < a)) > > + return 0; > > + > > + if (start > a) > > + ret = vfio_insert_iova(a, start - 1, &node->list); > > + if (ret) > > + goto done; > > + if (end < b) > > + ret = vfio_insert_iova(end + 1, b, &node->list); > > + > > +done: > > + list_del(&node->list); > > + kfree(node); > > + > > + return ret; > > +} > > + > > +static int vfio_resv_cmp(void *priv, struct list_head *a, struct list_head *b) > > +{ > > + struct iommu_resv_region *ra, *rb; > > + > > + ra = container_of(a, struct iommu_resv_region, list); > > + rb = container_of(b, struct iommu_resv_region, list); > > + > > + if (ra->start < rb->start) > > + return -1; > > + if (ra->start > rb->start) > > + return 1; > > + return 0; > > +} > > + > > +static int vfio_build_iommu_iova_caps(struct vfio_iommu *iommu, > > + struct vfio_info_cap *caps) > > +{ > > + struct iommu_resv_region *resv, *resv_next; > > + struct vfio_iommu_iova *iova, *iova_next; > > + struct list_head group_resv_regions, vfio_iova_regions; > > + struct vfio_domain *domain; > > + struct vfio_group *g; > > + phys_addr_t start, end; > > + int ret = 0; > > + > > + domain = list_first_entry(&iommu->domain_list, > > + struct vfio_domain, next); > > How do you resolve that we can have multiple domains in a container and > each my provide different apertures? Eric noted that the attach group > function attempts to do compatibility checks, so we need to figure out > how we determine IOVA apertures are compatible. The most obvious > answer seems to be that we should look through the dma_list on the > vfio_iommu and determine if there are existing mappings that are > incompatible with the new domain. That also suggests that we should > maintain and update a list of valid iova ranges as we go such that we > can reject mappings outside of those valid ranges. Right. I will go through this in detail and see how to accommodate this. > > + /* Get the default iova range supported */ > > + start = domain->domain->geometry.aperture_start; > > + end = domain->domain->geometry.aperture_end; > > There's an IOMMU API for this, iommu_domain_get_attr() with > DOMAIN_ATTR_GEOMETRY. > > > + INIT_LIST_HEAD(&vfio_iova_regions); > > + vfio_insert_iova(start, end, &vfio_iova_regions); > > + > > + /* Get reserved regions if any */ > > + INIT_LIST_HEAD(&group_resv_regions); > > + list_for_each_entry(g, &domain->group_list, next) > > + iommu_get_group_resv_regions(g->iommu_group, > > + &group_resv_regions); > > Reserved ranges also need to be accounted for as groups are added to a > domain. Again, if dma list includes any mappings overlapping reserved > ranges (ie. holes in the iova space), the group attach should fail. Ok. > > + list_sort(NULL, &group_resv_regions, vfio_resv_cmp); > > + > > + /* Update iova range excluding reserved regions */ > > + list_for_each_entry(resv, &group_resv_regions, list) { > > + ret = vfio_update_iommu_iova_range(resv->start, > > + resv->start + resv->length - 1, > > + &vfio_iova_regions); > > + if (ret) > > + goto done; > > + } > > + > > + list_for_each_entry(iova, &vfio_iova_regions, list) { > > + ret = vfio_add_iova_cap(caps, iova->start, iova->end); > > It seems like a fair bit of overhead and nuisance for the user to have > each iova range added as a separate capability. I think I'd prefer to > see the capability size be based on a number of entries field. Ok. I think the suggestion is to have something similar to the sparse mmap capability implementation. > > + if (ret) > > + goto done; > > + } > > + > > +done: > > + list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list) > > + kfree(resv); > > + > > + list_for_each_entry_safe(iova, iova_next, &vfio_iova_regions, list) > > + kfree(iova); > > + > > + return ret; > > +} > > + > > static long vfio_iommu_type1_ioctl(void *iommu_data, > > unsigned int cmd, unsigned long arg) > > { > > @@ -1558,8 +1703,10 @@ static long vfio_iommu_type1_ioctl(void > *iommu_data, > > } > > } else if (cmd == VFIO_IOMMU_GET_INFO) { > > struct vfio_iommu_type1_info info; > > + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; > > + int ret; > > > > - minsz = offsetofend(struct vfio_iommu_type1_info, > iova_pgsizes); > > + minsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); > > This is incompatible, it will break existing userspace. See > include/uapi/linux/vfio.h: Right. > /* > * Callers of INFO ioctls passing insufficiently sized buffers will see > * the capability chain flag bit set, a zero value for the first capability > * offset (if available within the provided argsz), and argsz will be > * updated to report the necessary buffer size. For compatibility, the > * INFO ioctl will not report error in this case, but the capability chain > * will not be available. > */ > > > > > if (copy_from_user(&info, (void __user *)arg, minsz)) > > return -EFAULT; > > @@ -1571,6 +1718,29 @@ static long vfio_iommu_type1_ioctl(void > *iommu_data, > > > > info.iova_pgsizes = vfio_pgsize_bitmap(iommu); > > > > + ret = vfio_build_iommu_iova_caps(iommu, &caps); > > + if (ret) > > + return ret; > > + > > + if (caps.size) { > > + info.flags |= VFIO_IOMMU_INFO_CAPS; > > + if (info.argsz < sizeof(info) + caps.size) { > > + info.argsz = sizeof(info) + caps.size; > > + info.cap_offset = 0; > > + } else { > > + vfio_info_cap_shift(&caps, sizeof(info)); > > + if (copy_to_user((void __user *)arg + > > + sizeof(info), caps.buf, > > + caps.size)) { > > + kfree(caps.buf); > > + return -EFAULT; > > + } > > + info.cap_offset = sizeof(info); > > + } > > + > > + kfree(caps.buf); > > + } > > + > > return copy_to_user((void __user *)arg, &info, minsz) ? > > -EFAULT : 0; > > > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > > index e3301db..c4e338b 100644 > > --- a/include/uapi/linux/vfio.h > > +++ b/include/uapi/linux/vfio.h > > @@ -517,7 +517,20 @@ struct vfio_iommu_type1_info { > > __u32 argsz; > > __u32 flags; > > #define VFIO_IOMMU_INFO_PGSIZES (1 << 0) /* supported page sizes info */ > > +#define VFIO_IOMMU_INFO_CAPS (1 << 1) /* Info supports caps */ > > __u64 iova_pgsizes; /* Bitmap of supported page sizes */ > > + __u32 cap_offset; /* Offset within info struct of first cap */ > > + __u32 __resv; > > Hmm, I'm not sure why the additional reserved field here is necessary. > I guess we need 8-byte alignment of this iova range capability, but > that should probably be accounted for explicitly as the capability is > constructed in the buffer rather than implicitly by the ending offset > of the static structure. Ok. > > +}; > > + > > +/* > > + * The IOVA_RANGE capability allows to report the IOVA range(s), > > + */ > > +#define VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE 1 > > +struct vfio_iommu_type1_info_cap_iova_range { > > + struct vfio_info_cap_header header; > > + __u64 start; > > + __u64 end; > > }; > > > > It should be noted that this is version 1 of this structure. Ok. Will update that. Many thanks for going through this and the feedback. Cheers, Shameer