Hi Eric, > From: Auger Eric <eric.auger@xxxxxxxxxx> > Sent: Thursday, August 13, 2020 9:20 PM > > Hi Yi, > On 7/28/20 8:27 AM, Liu Yi L wrote: > > This patch exports iommu nesting capability info to user space through > > VFIO. Userspace is expected to check this info for supported uAPIs (e.g. > > PASID alloc/free, bind page table, and cache invalidation) and the > > vendor specific format information for first level/stage page table > > that will be bound to. > > > > The nesting info is available only after container set to be NESTED type. > > Current implementation imposes one limitation - one nesting container > > should include at most one iommu group. The philosophy of vfio > > container is having all groups/devices within the container share the > > same IOMMU context. When vSVA is enabled, one IOMMU context could > > include one 2nd- level address space and multiple 1st-level address > > spaces. While the 2nd-level address space is reasonably sharable by > > multiple groups, blindly sharing 1st-level address spaces across all > > groups within the container might instead break the guest expectation. > > In the future sub/super container concept might be introduced to allow > > partial address space sharing within an IOMMU context. But for now > > let's go with this restriction by requiring singleton container for > > using nesting iommu features. Below link has the related discussion about this > decision. > > > > https://lore.kernel.org/kvm/20200515115924.37e6996d@xxxxxxxxx/ > > > > This patch also changes the NESTING type container behaviour. > > Something that would have succeeded before will now fail: Before this > > series, if user asked for a VFIO_IOMMU_TYPE1_NESTING, it would have > > succeeded even if the SMMU didn't support stage-2, as the driver would > > have silently fallen back on stage-1 mappings (which work exactly the > > same as stage-2 only since there was no nesting supported). After the > > series, we do check for DOMAIN_ATTR_NESTING so if user asks for > > VFIO_IOMMU_TYPE1_NESTING and the SMMU doesn't support stage-2, the > > ioctl fails. But it should be a good fix and completely harmless. Detail can be found > in below link as well. > > > > https://lore.kernel.org/kvm/20200717090900.GC4850@myrica/ > > > > Cc: Kevin Tian <kevin.tian@xxxxxxxxx> > > CC: Jacob Pan <jacob.jun.pan@xxxxxxxxxxxxxxx> > > Cc: Alex Williamson <alex.williamson@xxxxxxxxxx> > > Cc: Eric Auger <eric.auger@xxxxxxxxxx> > > Cc: Jean-Philippe Brucker <jean-philippe@xxxxxxxxxx> > > Cc: Joerg Roedel <joro@xxxxxxxxxx> > > Cc: Lu Baolu <baolu.lu@xxxxxxxxxxxxxxx> > > Signed-off-by: Liu Yi L <yi.l.liu@xxxxxxxxx> > > --- > > v5 -> v6: > > *) address comments against v5 from Eric Auger. > > *) don't report nesting cap to userspace if the nesting_info->format is > > invalid. > > > > v4 -> v5: > > *) address comments from Eric Auger. > > *) return struct iommu_nesting_info for > VFIO_IOMMU_TYPE1_INFO_CAP_NESTING as > > cap is much "cheap", if needs extension in future, just define another cap. > > https://lore.kernel.org/kvm/20200708132947.5b7ee954@xxxxxxx/ > > > > v3 -> v4: > > *) address comments against v3. > > > > v1 -> v2: > > *) added in v2 > > --- > > drivers/vfio/vfio_iommu_type1.c | 106 > +++++++++++++++++++++++++++++++++++----- > > include/uapi/linux/vfio.h | 19 +++++++ > > 2 files changed, 113 insertions(+), 12 deletions(-) > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c > > b/drivers/vfio/vfio_iommu_type1.c index 3bd70ff..18ff0c3 100644 > > --- a/drivers/vfio/vfio_iommu_type1.c > > +++ b/drivers/vfio/vfio_iommu_type1.c > > @@ -62,18 +62,20 @@ MODULE_PARM_DESC(dma_entry_limit, > > "Maximum number of user DMA mappings per container (65535)."); > > > > struct vfio_iommu { > > - struct list_head domain_list; > > - struct list_head iova_list; > > - struct vfio_domain *external_domain; /* domain for external user */ > > - struct mutex lock; > > - struct rb_root dma_list; > > - struct blocking_notifier_head notifier; > > - unsigned int dma_avail; > > - uint64_t pgsize_bitmap; > > - bool v2; > > - bool nesting; > > - bool dirty_page_tracking; > > - bool pinned_page_dirty_scope; > > + struct list_head domain_list; > > + struct list_head iova_list; > > + /* domain for external user */ > > + struct vfio_domain *external_domain; > > + struct mutex lock; > > + struct rb_root dma_list; > > + struct blocking_notifier_head notifier; > > + unsigned int dma_avail; > > + uint64_t pgsize_bitmap; > > + bool v2; > > + bool nesting; > > + bool dirty_page_tracking; > > + bool pinned_page_dirty_scope; > > + struct iommu_nesting_info *nesting_info; > > }; > > > > struct vfio_domain { > > @@ -130,6 +132,9 @@ struct vfio_regions { > > #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \ > > (!list_empty(&iommu->domain_list)) > > > > +#define CONTAINER_HAS_DOMAIN(iommu) (((iommu)->external_domain) || \ > > + (!list_empty(&(iommu)->domain_list))) > > + > > #define DIRTY_BITMAP_BYTES(n) (ALIGN(n, BITS_PER_TYPE(u64)) / > BITS_PER_BYTE) > > > > /* > > @@ -1929,6 +1934,13 @@ static void vfio_iommu_iova_insert_copy(struct > > vfio_iommu *iommu, > > > > list_splice_tail(iova_copy, iova); > > } > > + > > +static void vfio_iommu_release_nesting_info(struct vfio_iommu *iommu) > > +{ > > + kfree(iommu->nesting_info); > > + iommu->nesting_info = NULL; > > +} > > + > > static int vfio_iommu_type1_attach_group(void *iommu_data, > > struct iommu_group *iommu_group) > { @@ -1959,6 +1971,12 @@ > > static int vfio_iommu_type1_attach_group(void *iommu_data, > > } > > } > > > > + /* Nesting type container can include only one group */ > > + if (iommu->nesting && CONTAINER_HAS_DOMAIN(iommu)) { > > + mutex_unlock(&iommu->lock); > > + return -EINVAL; > > + } > > + > > group = kzalloc(sizeof(*group), GFP_KERNEL); > > domain = kzalloc(sizeof(*domain), GFP_KERNEL); > > if (!group || !domain) { > > @@ -2029,6 +2047,32 @@ static int vfio_iommu_type1_attach_group(void > *iommu_data, > > if (ret) > > goto out_domain; > > > > + /* Nesting cap info is available only after attaching */ > > + if (iommu->nesting) { > > + struct iommu_nesting_info tmp = { .argsz = 0, }; > > + > > + /* First get the size of vendor specific nesting info */ > > + ret = iommu_domain_get_attr(domain->domain, > > + DOMAIN_ATTR_NESTING, > > + &tmp); > > + if (ret) > > + goto out_detach; > > + > > + iommu->nesting_info = kzalloc(tmp.argsz, GFP_KERNEL); > > + if (!iommu->nesting_info) { > > + ret = -ENOMEM; > > + goto out_detach; > > + } > > + > > + /* Now get the nesting info */ > > + iommu->nesting_info->argsz = tmp.argsz; > > + ret = iommu_domain_get_attr(domain->domain, > > + DOMAIN_ATTR_NESTING, > > + iommu->nesting_info); > > + if (ret) > > + goto out_detach; > > + } > > + > > /* Get aperture info */ > > iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, > &geo); > > > > @@ -2138,6 +2182,7 @@ static int vfio_iommu_type1_attach_group(void > *iommu_data, > > return 0; > > > > out_detach: > > + vfio_iommu_release_nesting_info(iommu); > > vfio_iommu_detach_group(domain, group); > > out_domain: > > iommu_domain_free(domain->domain); > > @@ -2338,6 +2383,8 @@ static void vfio_iommu_type1_detach_group(void > *iommu_data, > > vfio_iommu_unmap_unpin_all(iommu); > > else > > > vfio_iommu_unmap_unpin_reaccount(iommu); > > + > > + vfio_iommu_release_nesting_info(iommu); > > } > > iommu_domain_free(domain->domain); > > list_del(&domain->next); > > @@ -2546,6 +2593,39 @@ static int vfio_iommu_migration_build_caps(struct > vfio_iommu *iommu, > > return vfio_info_add_capability(caps, &cap_mig.header, > > sizeof(cap_mig)); } > > > > +static int vfio_iommu_add_nesting_cap(struct vfio_iommu *iommu, > > + struct vfio_info_cap *caps) { > > + struct vfio_info_cap_header *header; > > + struct vfio_iommu_type1_info_cap_nesting *nesting_cap; > > + size_t size; > > + > > + /* when nesting_info is null, no need go further */ > no need to go nice catch. > > + if (!iommu->nesting_info) > > + return 0; > > + > > + /* when @format of nesting_info is 0, fail the call */ > > + if (iommu->nesting_info->format == 0) > > + return -ENOENT; > > + > > + size = offsetof(struct vfio_iommu_type1_info_cap_nesting, info) + > > + iommu->nesting_info->argsz; > > + > > + header = vfio_info_cap_add(caps, size, > > + VFIO_IOMMU_TYPE1_INFO_CAP_NESTING, 1); > > + if (IS_ERR(header)) > > + return PTR_ERR(header); > > + > > + nesting_cap = container_of(header, > > + struct vfio_iommu_type1_info_cap_nesting, > > + header); > > + > > + memcpy(&nesting_cap->info, iommu->nesting_info, > > + iommu->nesting_info->argsz); > can't you use vfio_info_add_capability() directly? yes, the below lines will be covered by vfio_info_add_capability(). + header = vfio_info_cap_add(caps, size, + VFIO_IOMMU_TYPE1_INFO_CAP_NESTING, 1); + if (IS_ERR(header)) + return PTR_ERR(header); + + nesting_cap = container_of(header, + struct vfio_iommu_type1_info_cap_nesting, + header); > > + > > + return 0; > > +} > > + > > static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu, > > unsigned long arg) > > { > > @@ -2581,6 +2661,8 @@ static int vfio_iommu_type1_get_info(struct > vfio_iommu *iommu, > > if (!ret) > > ret = vfio_iommu_iova_build_caps(iommu, &caps); > > > > + ret = vfio_iommu_add_nesting_cap(iommu, &caps); > > + > > mutex_unlock(&iommu->lock); > > > > if (ret) > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > > index 9204705..0cf3d6d 100644 > > --- a/include/uapi/linux/vfio.h > > +++ b/include/uapi/linux/vfio.h > > @@ -14,6 +14,7 @@ > > > > #include <linux/types.h> > > #include <linux/ioctl.h> > > +#include <linux/iommu.h> > > > > #define VFIO_API_VERSION 0 > > > > @@ -1039,6 +1040,24 @@ struct vfio_iommu_type1_info_cap_migration { > > __u64 max_dirty_bitmap_size; /* in bytes */ > > }; > > > > +/* > > + * The nesting capability allows to report the related capability > > + * and info for nesting iommu type. > > + * > > + * The structures below define version 1 of this capability. > > + * > > + * Userspace selected VFIO_TYPE1_NESTING_IOMMU type should check > > + * this capability to get supported features. > nested capabilities should be checked by the userspace after setting > VFIO_TYPE1_NESTING_IOMMU? yes, will modify it. Regards, Yi Liu > > + * > > + * @info: the nesting info provided by IOMMU driver. > > + */ > > +#define VFIO_IOMMU_TYPE1_INFO_CAP_NESTING 3 > > + > > +struct vfio_iommu_type1_info_cap_nesting { > > + struct vfio_info_cap_header header; > > + struct iommu_nesting_info info; > > +}; > > + > > #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) > > > > /** > > > Thanks > > Eric