RE: [RFC v2 1/5] vfio/type1: Introduce iova list and add iommu aperture validity check

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Eric,

> -----Original Message-----
> From: Auger Eric [mailto:eric.auger@xxxxxxxxxx]
> Sent: Tuesday, January 23, 2018 8:25 AM
> To: Alex Williamson <alex.williamson@xxxxxxxxxx>; Shameerali Kolothum
> Thodi <shameerali.kolothum.thodi@xxxxxxxxxx>
> Cc: pmorel@xxxxxxxxxxxxxxxxxx; kvm@xxxxxxxxxxxxxxx; linux-
> kernel@xxxxxxxxxxxxxxx; Linuxarm <linuxarm@xxxxxxxxxx>; John Garry
> <john.garry@xxxxxxxxxx>; xuwei (O) <xuwei5@xxxxxxxxxx>
> Subject: Re: [RFC v2 1/5] vfio/type1: Introduce iova list and add iommu
> aperture validity check
> 
> Hi Shameer,
> 
> On 18/01/18 01:04, Alex Williamson wrote:
> > On Fri, 12 Jan 2018 16:45:27 +0000
> > Shameer Kolothum <shameerali.kolothum.thodi@xxxxxxxxxx> wrote:
> >
> >> This introduces an iova list that is valid for dma mappings. Make
> >> sure the new iommu aperture window is valid and doesn't conflict
> >> with any existing dma mappings during attach. Also update the iova
> >> list with new aperture window during attach/detach.
> >>
> >> Signed-off-by: Shameer Kolothum
> <shameerali.kolothum.thodi@xxxxxxxxxx>
> >> ---
> >>  drivers/vfio/vfio_iommu_type1.c | 177
> ++++++++++++++++++++++++++++++++++++++++
> >>  1 file changed, 177 insertions(+)
> >>
> >> diff --git a/drivers/vfio/vfio_iommu_type1.c
> b/drivers/vfio/vfio_iommu_type1.c
> >> index e30e29a..11cbd49 100644
> >> --- a/drivers/vfio/vfio_iommu_type1.c
> >> +++ b/drivers/vfio/vfio_iommu_type1.c
> >> @@ -60,6 +60,7 @@
> >>
> >>  struct vfio_iommu {
> >>  	struct list_head	domain_list;
> >> +	struct list_head	iova_list;
> >>  	struct vfio_domain	*external_domain; /* domain for external user
> */
> >>  	struct mutex		lock;
> >>  	struct rb_root		dma_list;
> >> @@ -92,6 +93,12 @@ struct vfio_group {
> >>  	struct list_head	next;
> >>  };
> >>
> >> +struct vfio_iova {
> >> +	struct list_head	list;
> >> +	phys_addr_t		start;
> >> +	phys_addr_t		end;
> >> +};
> >
> > dma_list uses dma_addr_t for the iova.  IOVAs are naturally DMA
> > addresses, why are we using phys_addr_t?
> >
> >> +
> >>  /*
> >>   * Guest RAM pinning working set or DMA target
> >>   */
> >> @@ -1192,6 +1199,123 @@ static bool vfio_iommu_has_sw_msi(struct
> iommu_group *group, phys_addr_t *base)
> >>  	return ret;
> >>  }
> >>
> >> +static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
> >> +				struct list_head *head)
> >> +{
> >> +	struct vfio_iova *region;
> >> +
> >> +	region = kmalloc(sizeof(*region), GFP_KERNEL);
> >> +	if (!region)
> >> +		return -ENOMEM;
> >> +
> >> +	INIT_LIST_HEAD(&region->list);
> >> +	region->start = start;
> >> +	region->end = end;
> >> +
> >> +	list_add_tail(&region->list, head);
> >> +	return 0;
> >> +}
> >
> > As I'm reading through this series, I'm learning that there are a lot
> > of assumptions and subtle details that should be documented.  For
> > instance, the IOMMU API only provides a single geometry and we build
> > upon that here as this patch creates a list, but there's only a single
> > entry for now.  The following patches carve that single iova range into
> > pieces and somewhat subtly use the list_head passed to keep the list
> > sorted, allowing the first/last_entry tricks used throughout.  Subtle
> > interfaces are prone to bugs.
> >
> >> +
> >> +/*
> >> + * Find whether a mem region overlaps with existing dma mappings
> >> + */
> >> +static bool vfio_find_dma_overlap(struct vfio_iommu *iommu,
> >> +				  phys_addr_t start, phys_addr_t end)
> >> +{
> >> +	struct rb_node *n = rb_first(&iommu->dma_list);
> >> +
> >> +	for (; n; n = rb_next(n)) {
> >> +		struct vfio_dma *dma;
> >> +
> >> +		dma = rb_entry(n, struct vfio_dma, node);
> >> +
> >> +		if (end < dma->iova)
> >> +			break;
> >> +		if (start >= dma->iova + dma->size)
> >> +			continue;
> >> +		return true;
> >> +	}
> >> +
> >> +	return false;
> >> +}
> >
> > Why do we need this in addition to the existing vfio_find_dma()?  Why
> > doesn't this use the tree structure of the dma_list?
> >
> >> +
> >> +/*
> >> + * Check the new iommu aperture is a valid one
> >> + */
> >> +static int vfio_iommu_valid_aperture(struct vfio_iommu *iommu,
> >> +				     phys_addr_t start,
> >> +				     phys_addr_t end)
> >> +{
> >> +	struct vfio_iova *first, *last;
> >> +	struct list_head *iova = &iommu->iova_list;
> >> +
> >> +	if (list_empty(iova))
> >> +		return 0;
> >> +
> >> +	/* Check if new one is outside the current aperture */
> >
> > "Disjoint sets"
> >
> >> +	first = list_first_entry(iova, struct vfio_iova, list);
> >> +	last = list_last_entry(iova, struct vfio_iova, list);
> >> +	if ((start > last->end) || (end < first->start))
> >> +		return -EINVAL;
> >> +
> >> +	/* Check for any existing dma mappings outside the new start */
> >> +	if (start > first->start) {
> >> +		if (vfio_find_dma_overlap(iommu, first->start, start - 1))
> >> +			return -EINVAL;
> >> +	}
> >> +
> >> +	/* Check for any existing dma mappings outside the new end */
> >> +	if (end < last->end) {
> >> +		if (vfio_find_dma_overlap(iommu, end + 1, last->end))
> >> +			return -EINVAL;
> >> +	}
> >> +
> >> +	return 0;
> >> +}
> >
> > I think this returns an int because you want to use it for the return
> > value below, but it really seems like a bool question, ie. does this
> > aperture conflict with existing mappings.  Additionally, the aperture
> > is valid, it was provided to us by the IOMMU API, the question is
> > whether it conflicts.  Please also name consistently to the other
> > functions in this patch, vfio_iommu_aper_xxxx().
> >
> >> +
> >> +/*
> >> + * Adjust the iommu aperture window if new aperture is a valid one
> >> + */
> >> +static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
> >> +				      phys_addr_t start,
> >> +				      phys_addr_t end)
> >
> > Perhaps "resize", "prune", or "shrink" to make it more clear what is
> > being adjusted?
> >
> >> +{
> >> +	struct vfio_iova *node, *next;
> >> +	struct list_head *iova = &iommu->iova_list;
> >> +
> >> +	if (list_empty(iova))
> >> +		return vfio_insert_iova(start, end, iova);
> >> +
> >> +	/* Adjust iova list start */
> >> +	list_for_each_entry_safe(node, next, iova, list) {
> >> +		if (start < node->start)
> >> +			break;
> >> +		if ((start >= node->start) && (start <= node->end)) {
> >
> > start == node->end results in a zero sized node.  s/<=/</
> >
> >> +			node->start = start;
> >> +			break;
> >> +		}
> >> +		/* Delete nodes before new start */
> >> +		list_del(&node->list);
> >> +		kfree(node);
> >> +	}
> >> +
> >> +	/* Adjust iova list end */
> >> +	list_for_each_entry_safe(node, next, iova, list) {
> >> +		if (end > node->end)
> >> +			continue;
> >> +
> >> +		if ((end >= node->start) && (end <= node->end)) {
> >
> > end == node->start results in a zero sized node.  s/>=/>/
> >
> >> +			node->end = end;
> >> +			continue;
> >> +		}
> >> +		/* Delete nodes after new end */
> >> +		list_del(&node->list);
> >> +		kfree(node);
> >> +	}
> >> +
> >> +	return 0;
> >> +}
> >> +
> >>  static int vfio_iommu_type1_attach_group(void *iommu_data,
> >>  					 struct iommu_group *iommu_group)
> >>  {
> >> @@ -1202,6 +1326,7 @@ static int vfio_iommu_type1_attach_group(void
> *iommu_data,
> >>  	int ret;
> >>  	bool resv_msi, msi_remap;
> >>  	phys_addr_t resv_msi_base;
> >> +	struct iommu_domain_geometry geo;
> >>
> >>  	mutex_lock(&iommu->lock);
> >>
> >> @@ -1271,6 +1396,14 @@ static int vfio_iommu_type1_attach_group(void
> *iommu_data,
> >>  	if (ret)
> >>  		goto out_domain;
> >>
> >> +	/* Get aperture info */
> >> +	iommu_domain_get_attr(domain->domain,
> DOMAIN_ATTR_GEOMETRY, &geo);
> >> +
> >> +	ret = vfio_iommu_valid_aperture(iommu, geo.aperture_start,
> >> +					geo.aperture_end);
> >> +	if (ret)
> >> +		goto out_detach;
> >> +
> >>  	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
> >>
> >>  	INIT_LIST_HEAD(&domain->group_list);
> >> @@ -1327,6 +1460,11 @@ static int vfio_iommu_type1_attach_group(void
> *iommu_data,
> >>  			goto out_detach;
> >>  	}
> >>
> >> +	ret = vfio_iommu_iova_aper_adjust(iommu, geo.aperture_start,
> >> +					  geo.aperture_end);
> >> +	if (ret)
> >> +		goto out_detach;
> >> +
> >>  	list_add(&domain->next, &iommu->domain_list);
> >>
> >>  	mutex_unlock(&iommu->lock);
> >> @@ -1392,6 +1530,35 @@ static void vfio_sanity_check_pfn_list(struct
> vfio_iommu *iommu)
> >>  	WARN_ON(iommu->notifier.head);
> >>  }
> >>
> >> +/*
> >> + * Called when a domain is removed in detach. It is possible that
> >> + * the removed domain decided the iova aperture window. Modify the
> >> + * iova aperture with the smallest window among existing domains.
> >> + */
> >> +static void vfio_iommu_iova_aper_refresh(struct vfio_iommu *iommu)
> >> +{
> >> +	struct vfio_domain *domain;
> >> +	struct iommu_domain_geometry geo;
> >> +	struct vfio_iova *node;
> >> +	phys_addr_t start = 0;
> >> +	phys_addr_t end = (phys_addr_t)~0;
> >> +
> >> +	list_for_each_entry(domain, &iommu->domain_list, next) {
> >> +		iommu_domain_get_attr(domain->domain,
> DOMAIN_ATTR_GEOMETRY,
> >> +				      &geo);
> >> +			if (geo.aperture_start > start)
> >> +				start = geo.aperture_start;
> >> +			if (geo.aperture_end < end)
> >> +				end = geo.aperture_end;
> >> +	}
> >> +
> >> +	/* modify iova aperture limits */
> >> +	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);
> >> +	node->start = start;
> >> +	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);
> >> +	node->end = end;
> >
> > We can do this because the new aperture is the same or bigger than the
> > current aperture, never smaller.  That's not fully obvious and should
> > be noted in the comment.  Perhaps this function should be "expand"
> > rather than "refresh".
> This one is not obvious to me either:
> assuming you have 2 domains, resp with aperture 1 and 2, resulting into
> aperture 3. Holes are created by resv regions for instance. If you
> remove domain 1, don't you get 4) instead of 2)?
> 
> 1)   |------------|
>  +
> 2) |---|    |--|       |-----|
> =
> 3)   |-|    |--|
> 
> 
> 4) |---|    |----------------|

That is partially true. But please remember that this patch is not aware of
any reserved regions yet; those are introduced in patch #2. So with patch #1 and #2
together, the iova aperture might look like 4) after this function call, and once
vfio_iommu_iova_resv_refresh() in patch #2 is done, the aperture will be
back to 2).

Hope I am clear. Please let me know.

In any case, based on comments by Alex, I will be removing these aperture/resv
refresh functions and leaving the iova list as-is when a group is detached.

Thanks,
Shameer

> Thanks
> 
> Eric
> >
> >> +}
> >> +
> >>  static void vfio_iommu_type1_detach_group(void *iommu_data,
> >>  					  struct iommu_group *iommu_group)
> >>  {
> >> @@ -1445,6 +1612,7 @@ static void vfio_iommu_type1_detach_group(void
> *iommu_data,
> >>  			iommu_domain_free(domain->domain);
> >>  			list_del(&domain->next);
> >>  			kfree(domain);
> >> +			vfio_iommu_iova_aper_refresh(iommu);
> >>  		}
> >>  		break;
> >>  	}
> >> @@ -1475,6 +1643,7 @@ static void *vfio_iommu_type1_open(unsigned
> long arg)
> >>  	}
> >>
> >>  	INIT_LIST_HEAD(&iommu->domain_list);
> >> +	INIT_LIST_HEAD(&iommu->iova_list);
> >>  	iommu->dma_list = RB_ROOT;
> >>  	mutex_init(&iommu->lock);
> >>  	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
> >> @@ -1502,6 +1671,7 @@ static void vfio_iommu_type1_release(void
> *iommu_data)
> >>  {
> >>  	struct vfio_iommu *iommu = iommu_data;
> >>  	struct vfio_domain *domain, *domain_tmp;
> >> +	struct vfio_iova *iova, *iova_tmp;
> >>
> >>  	if (iommu->external_domain) {
> >>  		vfio_release_domain(iommu->external_domain, true);
> >> @@ -1517,6 +1687,13 @@ static void vfio_iommu_type1_release(void
> *iommu_data)
> >>  		list_del(&domain->next);
> >>  		kfree(domain);
> >>  	}
> >> +
> >> +	list_for_each_entry_safe(iova, iova_tmp,
> >> +				 &iommu->iova_list, list) {
> >> +		list_del(&iova->list);
> >> +		kfree(iova);
> >> +	}
> >> +
> >>  	kfree(iommu);
> >>  }
> >>
> >



[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux