Hi Alex, > From: Alex Williamson < alex.williamson@xxxxxxxxxx > > Sent: Friday, July 3, 2020 2:39 AM > > On Wed, 24 Jun 2020 01:55:16 -0700 > Liu Yi L <yi.l.liu@xxxxxxxxx> wrote: > > > This patch exports iommu nesting capability info to user space through > > VFIO. User space is expected to check this info for supported uAPIs (e.g. > > PASID alloc/free, bind page table, and cache invalidation) and the > > vendor specific format information for first level/stage page table > > that will be bound to. > > > > The nesting info is available only after the nesting iommu type is set > > for a container. Current implementation imposes one limitation - one > > nesting container should include at most one group. The philosophy of > > vfio container is having all groups/devices within the container share > > the same IOMMU context. When vSVA is enabled, one IOMMU context could > > include one 2nd-level address space and multiple 1st-level address spaces. > > While the 2nd-leve address space is reasonably sharable by multiple > > groups , blindly sharing 1st-level address spaces across all groups > > within the container might instead break the guest expectation. In the > > future sub/ super container concept might be introduced to allow > > partial address space sharing within an IOMMU context. But for now > > let's go with this restriction by requiring singleton container for > > using nesting iommu features. Below link has the related discussion > > about this > > decision. 
> > > > https://lkml.org/lkml/2020/5/15/1028 > > > > Cc: Kevin Tian <kevin.tian@xxxxxxxxx> > > CC: Jacob Pan <jacob.jun.pan@xxxxxxxxxxxxxxx> > > Cc: Alex Williamson <alex.williamson@xxxxxxxxxx> > > Cc: Eric Auger <eric.auger@xxxxxxxxxx> > > Cc: Jean-Philippe Brucker <jean-philippe@xxxxxxxxxx> > > Cc: Joerg Roedel <joro@xxxxxxxxxx> > > Cc: Lu Baolu <baolu.lu@xxxxxxxxxxxxxxx> > > Signed-off-by: Liu Yi L <yi.l.liu@xxxxxxxxx> > > --- > > drivers/vfio/vfio_iommu_type1.c | 73 > +++++++++++++++++++++++++++++++++++++++++ > > include/uapi/linux/vfio.h | 9 +++++ > > 2 files changed, 82 insertions(+) > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c > > b/drivers/vfio/vfio_iommu_type1.c index 7accb59..8c143d5 100644 > > --- a/drivers/vfio/vfio_iommu_type1.c > > +++ b/drivers/vfio/vfio_iommu_type1.c > > @@ -72,6 +72,7 @@ struct vfio_iommu { > > uint64_t pgsize_bitmap; > > bool v2; > > bool nesting; > > + struct iommu_nesting_info *nesting_info; > > bool dirty_page_tracking; > > bool pinned_page_dirty_scope; > > }; > > Mind the structure packing and alignment, placing a pointer in the middle > of a > section of bools is going to create wasteful holes in the data structure. How about the layout below? It adds @nesting_info and @vmm at the end of this struct. I have two questions: the first is how to place the comment for @external_domain; the second is whether you want me to move the @nesting field so that it sits next to @nesting_info and @vmm. :) Please let me know your preference. 
struct vfio_iommu { struct list_head domain_list; struct list_head iova_list; struct vfio_domain *external_domain; /* domain for external user */ struct mutex lock; struct rb_root dma_list; struct blocking_notifier_head notifier; unsigned int dma_avail; uint64_t pgsize_bitmap; bool v2; bool nesting; bool dirty_page_tracking; bool pinned_page_dirty_scope; struct iommu_nesting_info *nesting_info; struct vfio_mm *vmm; }; > > @@ -130,6 +131,9 @@ struct vfio_regions { > > #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \ > > (!list_empty(&iommu->domain_list)) > > > > +#define IS_DOMAIN_IN_CONTAINER(iommu) ((iommu- > >external_domain) || \ > > + (!list_empty(&iommu->domain_list))) > > + > > #define DIRTY_BITMAP_BYTES(n) (ALIGN(n, BITS_PER_TYPE(u64)) / > BITS_PER_BYTE) > > > > /* > > @@ -1959,6 +1963,12 @@ static int vfio_iommu_type1_attach_group(void > *iommu_data, > > } > > } > > > > + /* Nesting type container can include only one group */ > > + if (iommu->nesting && IS_DOMAIN_IN_CONTAINER(iommu)) { > > + mutex_unlock(&iommu->lock); > > + return -EINVAL; > > + } > > + > > group = kzalloc(sizeof(*group), GFP_KERNEL); > > domain = kzalloc(sizeof(*domain), GFP_KERNEL); > > if (!group || !domain) { > > @@ -2029,6 +2039,36 @@ static int vfio_iommu_type1_attach_group(void > *iommu_data, > > if (ret) > > goto out_domain; > > > > + /* Nesting cap info is available only after attaching */ > > + if (iommu->nesting) { > > + struct iommu_nesting_info tmp; > > + struct iommu_nesting_info *info; > > + > > + /* First get the size of vendor specific nesting info */ > > + ret = iommu_domain_get_attr(domain->domain, > > + DOMAIN_ATTR_NESTING, > > + &tmp); > > + if (ret) > > + goto out_detach; > > + > > + info = kzalloc(tmp.size, GFP_KERNEL); > > + if (!info) { > > + ret = -ENOMEM; > > + goto out_detach; > > + } > > + > > + /* Now get the nesting info */ > > + info->size = tmp.size; > > + ret = iommu_domain_get_attr(domain->domain, > > + DOMAIN_ATTR_NESTING, > > + info); > > + if (ret) { 
> > + kfree(info); > > + goto out_detach; > > + } > > + iommu->nesting_info = info; > > + } > > + > > /* Get aperture info */ > > iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, > &geo); > > > > @@ -2138,6 +2178,7 @@ static int vfio_iommu_type1_attach_group(void > *iommu_data, > > return 0; > > > > out_detach: > > + kfree(iommu->nesting_info); > > This looks prone to a use-after-free. How about setting iommu->nesting_info to NULL after the kfree(), as you suggest in your next comment? > > vfio_iommu_detach_group(domain, group); > > out_domain: > > iommu_domain_free(domain->domain); > > @@ -2338,6 +2379,8 @@ static void vfio_iommu_type1_detach_group(void > *iommu_data, > > vfio_iommu_unmap_unpin_all(iommu); > > else > > > vfio_iommu_unmap_unpin_reaccount(iommu); > > + > > + kfree(iommu->nesting_info); > > As does this. Set to NULL since get_info tests the pointer before trying to > use it. Got it. > > } > > iommu_domain_free(domain->domain); > > list_del(&domain->next); > > @@ -2546,6 +2589,30 @@ static int vfio_iommu_migration_build_caps(struct > vfio_iommu *iommu, > > return vfio_info_add_capability(caps, &cap_mig.header, > > sizeof(cap_mig)); } > > > > +static int vfio_iommu_info_add_nesting_cap(struct vfio_iommu *iommu, > > + struct vfio_info_cap *caps) > > +{ > > + struct vfio_info_cap_header *header; > > + struct vfio_iommu_type1_info_cap_nesting *nesting_cap; > > + size_t size; > > + > > + size = sizeof(*nesting_cap) + iommu->nesting_info->size; > > + > > + header = vfio_info_cap_add(caps, size, > > + VFIO_IOMMU_TYPE1_INFO_CAP_NESTING, 1); > > + if (IS_ERR(header)) > > + return PTR_ERR(header); > > + > > + nesting_cap = container_of(header, > > + struct vfio_iommu_type1_info_cap_nesting, > > + header); > > + > > + memcpy(&nesting_cap->info, iommu->nesting_info, > > + iommu->nesting_info->size); > > + > > + return 0; > > +} > > + > > static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu, > > unsigned long arg) > > { > > @@ -2586,6 +2653,12 @@ static int 
vfio_iommu_type1_get_info(struct > vfio_iommu *iommu, > > if (ret) > > return ret; > > > > + if (iommu->nesting_info) { > > + ret = vfio_iommu_info_add_nesting_cap(iommu, &caps); > > + if (ret) > > + return ret; > > + } > > + > > if (caps.size) { > > info.flags |= VFIO_IOMMU_INFO_CAPS; > > > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > > index eca66926..f1f39e1 100644 > > --- a/include/uapi/linux/vfio.h > > +++ b/include/uapi/linux/vfio.h > > @@ -14,6 +14,7 @@ > > > > #include <linux/types.h> > > #include <linux/ioctl.h> > > +#include <linux/iommu.h> > > Why? We're not directly referencing any IOMMU UAPI structures here. Oh, yes. I will remove it. > > > > #define VFIO_API_VERSION 0 > > > > @@ -1039,6 +1040,14 @@ struct vfio_iommu_type1_info_cap_migration { > > __u64 max_dirty_bitmap_size; /* in bytes */ > > }; > > > > +#define VFIO_IOMMU_TYPE1_INFO_CAP_NESTING 3 > > + > > +struct vfio_iommu_type1_info_cap_nesting { > > + struct vfio_info_cap_header header; > > + __u32 flags; > > I think there's an alignment issue here for a uapi. The header field is > 8-bytes total > and info[] should start at an 8-byte alignment to allow data[] within info > to have > 8-byte alignment. This could lead to the structure having a compiler > dependent > size and offsets. We should add a 4-byte reserved field here to resolve. Got it. Or how about defining flags as __u64 instead of adding a reserved field? > > > + __u8 info[]; > > +}; > > This should have a lot more description around it, a user could not infer > that info[] > is including a struct iommu_nesting_info from the information provided here. > Thanks, Sure. BTW, do you think it is necessary to add a flag to indicate that info[] is a struct iommu_nesting_info? Or, as a start, is that not necessary? Regards, Yi Liu > Alex > > > + > > #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) > > > > /**