Re: [RFC v2 2/5] vfio/type1: Check reserve region conflict and update iova list

Alex Williamson <alex.williamson@xxxxxxxxxx> · Wed, 17 Jan 2018 17:04:39 -0700

On Fri, 12 Jan 2018 16:45:28 +0000
Shameer Kolothum <shameerali.kolothum.thodi@xxxxxxxxxx> wrote:

> This retrieves the reserved regions associated with dev group and
> checks for conflicts with any existing dma mappings. Also update
> the iova list excluding the reserved regions.
> 
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@xxxxxxxxxx>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 161 +++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 159 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 11cbd49..7609070 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -28,6 +28,7 @@
>  #include <linux/device.h>
>  #include <linux/fs.h>
>  #include <linux/iommu.h>
> +#include <linux/list_sort.h>
>  #include <linux/module.h>
>  #include <linux/mm.h>
>  #include <linux/rbtree.h>
> @@ -1199,6 +1200,20 @@ static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base)
>  	return ret;
>  }
>  

/* list_sort helper */

> +static int vfio_resv_cmp(void *priv, struct list_head *a, struct list_head *b)
> +{
> +	struct iommu_resv_region *ra, *rb;
> +
> +	ra = container_of(a, struct iommu_resv_region, list);
> +	rb = container_of(b, struct iommu_resv_region, list);
> +
> +	if (ra->start < rb->start)
> +		return -1;
> +	if (ra->start > rb->start)
> +		return 1;
> +	return 0;
> +}
> +
>  static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
>  				struct list_head *head)
>  {
> @@ -1274,6 +1289,24 @@ static int vfio_iommu_valid_aperture(struct vfio_iommu *iommu,
>  }
>  
>  /*
> + * Check reserved region conflicts with existing dma mappings
> + */
> +static int vfio_iommu_resv_region_conflict(struct vfio_iommu *iommu,
> +				struct list_head *resv_regions)
> +{
> +	struct iommu_resv_region *region;
> +
> +	/* Check for conflict with existing dma mappings */
> +	list_for_each_entry(region, resv_regions, list) {
> +		if (vfio_find_dma_overlap(iommu, region->start,
> +				    region->start + region->length - 1))
> +			return -EINVAL;
> +	}
> +
> +	return 0;
> +}

This basically does the same test as vfio_iommu_valid_aperture but
properly names it a conflict test.  Please be consistent.  Should this
also return bool, "conflict" is a yes/no answer.

> +
> +/*
>   * Adjust the iommu aperture window if new aperture is a valid one
>   */
>  static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
> @@ -1316,6 +1349,51 @@ static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
>  	return 0;
>  }
>  
> +/*
> + * Check and update iova region list in case a reserved region
> + * overlaps the iommu iova range
> + */
> +static int vfio_iommu_iova_resv_adjust(struct vfio_iommu *iommu,
> +					struct list_head *resv_regions)

"resv_region" in previous function, just "resv" here, use consistent
names.  Also, what are we adjusting.  Maybe "exclude" is a better term.

> +{
> +	struct iommu_resv_region *resv;
> +	struct list_head *iova = &iommu->iova_list;
> +	struct vfio_iova *n, *next;
> +
> +	list_for_each_entry(resv, resv_regions, list) {
> +		phys_addr_t start, end;
> +
> +		start = resv->start;
> +		end = resv->start + resv->length - 1;
> +
> +		list_for_each_entry_safe(n, next, iova, list) {
> +			phys_addr_t a, b;
> +			int ret = 0;
> +
> +			a = n->start;
> +			b = n->end;

'a' and 'b' variables actually make this incredibly confusing.  Use
better variable names or just drop them entirely, it's much easier to
follow as n->start & n->end.

> +			/* No overlap */
> +			if ((start > b) || (end < a))
> +				continue;
> +			/* Split the current node and create holes */
> +			if (start > a)
> +				ret = vfio_insert_iova(a, start - 1, &n->list);
> +			if (!ret && end < b)
> +				ret = vfio_insert_iova(end + 1, b, &n->list);
> +			if (ret)
> +				return ret;
> +
> +			list_del(&n->list);

This is trickier than it appears and deserves some explanation.  AIUI,
we're actually inserting duplicate entries for the remainder at the
start of the range and then at the end of the range (and the order is
important here because we're inserting each before the current node),
and then we delete the current node.  So the iova_list is kept sorted
through this process, though temporarily includes some bogus, unordered
sub-sets.

> +			kfree(n);
> +		}
> +	}
> +
> +	if (list_empty(iova))
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
>  static int vfio_iommu_type1_attach_group(void *iommu_data,
>  					 struct iommu_group *iommu_group)
>  {
> @@ -1327,6 +1405,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	bool resv_msi, msi_remap;
>  	phys_addr_t resv_msi_base;
>  	struct iommu_domain_geometry geo;
> +	struct list_head group_resv_regions;
> +	struct iommu_resv_region *resv, *resv_next;
>  
>  	mutex_lock(&iommu->lock);
>  
> @@ -1404,6 +1484,14 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	if (ret)
>  		goto out_detach;
>  
> +	INIT_LIST_HEAD(&group_resv_regions);
> +	iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
> +	list_sort(NULL, &group_resv_regions, vfio_resv_cmp);
> +
> +	ret = vfio_iommu_resv_region_conflict(iommu, &group_resv_regions);
> +	if (ret)
> +		goto out_detach;
> +
>  	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
>  
>  	INIT_LIST_HEAD(&domain->group_list);
> @@ -1434,11 +1522,15 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  		    d->prot == domain->prot) {
>  			iommu_detach_group(domain->domain, iommu_group);
>  			if (!iommu_attach_group(d->domain, iommu_group)) {
> +				ret = vfio_iommu_iova_resv_adjust(iommu,
> +							&group_resv_regions);
> +				if (!ret)
> +					goto out_domain;

The above function is not without side effects if it fails, it's
altered the iova_list.  It needs to be valid for the remaining domains
if we're going to continue.

> +
>  				list_add(&group->next, &d->group_list);
>  				iommu_domain_free(domain->domain);
>  				kfree(domain);
> -				mutex_unlock(&iommu->lock);
> -				return 0;
> +				goto done;
>  			}
>  
>  			ret = iommu_attach_group(domain->domain, iommu_group);
> @@ -1465,8 +1557,15 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	if (ret)
>  		goto out_detach;
>  
> +	ret = vfio_iommu_iova_resv_adjust(iommu, &group_resv_regions);
> +	if (ret)
> +		goto out_detach;

Can't we process the reserved regions once before we get here rather
than have two separate call points that do the same thing?  In order to
roll back from errors above, it seems like we need to copy iova_list
and work on the copy, installing it and deleting the original only on
success.

> +
>  	list_add(&domain->next, &iommu->domain_list);
>  
> +done:
> +	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
> +		kfree(resv);
>  	mutex_unlock(&iommu->lock);
>  
>  	return 0;
> @@ -1475,6 +1574,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	iommu_detach_group(domain->domain, iommu_group);
>  out_domain:
>  	iommu_domain_free(domain->domain);
> +	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
> +		kfree(resv);
>  out_free:
>  	kfree(domain);
>  	kfree(group);
> @@ -1559,6 +1660,60 @@ static void vfio_iommu_iova_aper_refresh(struct vfio_iommu *iommu)
>  	node->end = end;
>  }
>  
> +/*
> + * Called when a group is detached. The reserved regions for that
> + * group can be part of valid iova now. But since reserved regions
> + * may be duplicated among groups, populate the iova valid regions
> +   list again.
> + */
> +static void vfio_iommu_iova_resv_refresh(struct vfio_iommu *iommu)
> +{
> +	struct vfio_domain *d;
> +	struct vfio_group *g;
> +	struct vfio_iova *node, *tmp;
> +	struct iommu_resv_region *resv, *resv_next;
> +	struct list_head resv_regions;
> +	phys_addr_t start, end;
> +
> +	INIT_LIST_HEAD(&resv_regions);
> +
> +	list_for_each_entry(d, &iommu->domain_list, next) {
> +		list_for_each_entry(g, &d->group_list, next)
> +			iommu_get_group_resv_regions(g->iommu_group,
> +							 &resv_regions);
> +	}
> +
> +	if (list_empty(&resv_regions))
> +		return;
> +
> +	list_sort(NULL, &resv_regions, vfio_resv_cmp);
> +
> +	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);
> +	start = node->start;
> +	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);
> +	end = node->end;

list_sort() only sorts based on ->start, we added reserved regions for
all our groups to one list, we potentially have multiple entries with
the same ->start.  How can we be sure that the last one in the list
actually has the largest ->end value?

> +
> +	/* purge the iova list and create new one */
> +	list_for_each_entry_safe(node, tmp, &iommu->iova_list, list) {
> +		list_del(&node->list);
> +		kfree(node);
> +	}
> +
> +	if (vfio_iommu_iova_aper_adjust(iommu, start, end)) {
> +		pr_warn("%s: Failed to update iova aperture. VFIO DMA map request may fail\n",
> +			__func__);

Map requests "will" fail.  Is this the right error strategy?  Detaching
a group cannot fail.  Aren't we better off leaving the iova_list we had
in place?  If we cannot expand the iova aperture when a group is
removed, a user can continue unscathed.

> +		goto done;
> +	}
> +
> +	/* adjust the iova with current reserved regions */
> +	if (vfio_iommu_iova_resv_adjust(iommu, &resv_regions))
> +		pr_warn("%s: Failed to update iova list with reserve regions. VFIO DMA map request may fail\n",
> +			__func__);

Same.

> +done:
> +	list_for_each_entry_safe(resv, resv_next, &resv_regions, list)
> +		kfree(resv);
> +}
> +
>  static void vfio_iommu_type1_detach_group(void *iommu_data,
>  					  struct iommu_group *iommu_group)
>  {
> @@ -1617,6 +1772,8 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>  		break;
>  	}
>  
> +	vfio_iommu_iova_resv_refresh(iommu);
> +
>  detach_group_done:
>  	mutex_unlock(&iommu->lock);
>  }