On Tue, 5 Jan 2021 07:36:53 -0800
Steve Sistare <steven.sistare@xxxxxxxxxx> wrote:

> Block translation of host virtual address while an iova range is suspended.
> 
> Signed-off-by: Steve Sistare <steven.sistare@xxxxxxxxxx>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 48 ++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 45 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 2c164a6..8035b9a 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -31,6 +31,8 @@
>  #include <linux/rbtree.h>
>  #include <linux/sched/signal.h>
>  #include <linux/sched/mm.h>
> +#include <linux/delay.h>
> +#include <linux/kthread.h>
>  #include <linux/slab.h>
>  #include <linux/uaccess.h>
>  #include <linux/vfio.h>
> @@ -484,6 +486,34 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
>  	return ret;
>  }
>  
> +static bool vfio_iommu_contained(struct vfio_iommu *iommu)
> +{
> +	struct vfio_domain *domain = iommu->external_domain;
> +	struct vfio_group *group;
> +
> +	if (!domain)
> +		domain = list_first_entry(&iommu->domain_list,
> +					  struct vfio_domain, next);
> +
> +	group = list_first_entry(&domain->group_list, struct vfio_group, next);
> +	return vfio_iommu_group_contained(group->iommu_group);
> +}

It seems really backwards for a vfio iommu backend to ask vfio-core
whether its internal object is active.

> +
> +
> +bool vfio_vaddr_valid(struct vfio_iommu *iommu, struct vfio_dma *dma)
> +{
> +	while (dma->suspended) {
> +		mutex_unlock(&iommu->lock);
> +		msleep_interruptible(10);
> +		mutex_lock(&iommu->lock);
> +		if (kthread_should_stop() || !vfio_iommu_contained(iommu) ||
> +		    fatal_signal_pending(current)) {
> +			return false;
> +		}
> +	}
> +	return true;
> +}
> +
>  /*
>   * Attempt to pin pages.  We really don't want to track all the pfns and
>   * the iommu can only map chunks of consecutive pfns anyway, so get the
> @@ -690,6 +720,11 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>  			continue;
>  		}
>  
> +		if (!vfio_vaddr_valid(iommu, dma)) {
> +			ret = -EFAULT;
> +			goto pin_unwind;
> +		}

This could have dropped iommu->lock, so we have no basis to believe our
vfio_dma object is still valid.  Also, this is called with an elevated
reference to the container, so we theoretically should not need to test
whether the iommu is still contained.

> +
>  		remote_vaddr = dma->vaddr + (iova - dma->iova);
>  		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
>  					     do_accounting);
> @@ -1514,12 +1549,16 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
>  				i += PAGE_SIZE;
>  			}
>  		} else {
> -			unsigned long pfn;
> -			unsigned long vaddr = dma->vaddr +
> -					      (iova - dma->iova);
> +			unsigned long pfn, vaddr;
>  			size_t n = dma->iova + dma->size - iova;
>  			long npage;
>  
> +			if (!vfio_vaddr_valid(iommu, dma)) {
> +				ret = -EFAULT;
> +				goto unwind;
> +			}

This one is even scarier: do we have any basis to assume either object
is still valid after iommu->lock is released?  We can only get here if
we're attaching a group to the iommu with suspended vfio_dmas.  Do we
need to allow that?  It seems we could return -EAGAIN and let the user
figure out that they can't attach a group while mappings have invalid
vaddrs.
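Something along these lines in the replay loop, as an untested sketch
that reuses this patch's "suspended" flag rather than a definitive fix:

			/*
			 * Hypothetical alternative: don't wait for the vaddr
			 * to become valid again, refuse the group attach and
			 * let userspace retry once it has restored the vaddrs.
			 */
			if (dma->suspended) {
				ret = -EAGAIN;
				goto unwind;
			}

That keeps iommu->lock held across the whole replay, so the stale
vfio_dma/iommu pointer question never comes up on this path.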
> +			vaddr = dma->vaddr + (iova - dma->iova);
> +
>  			npage = vfio_pin_pages_remote(dma, vaddr,
>  						      n >> PAGE_SHIFT,
>  						      &pfn, limit);
> @@ -2965,6 +3004,9 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
>  	if (count > dma->size - offset)
>  		count = dma->size - offset;
>  
> +	if (!vfio_vaddr_valid(iommu, dma))

Same as the pinning path, this should hold a container user reference,
but we cannot assume our vfio_dma object is valid if we've released the
lock (see the sketch after the quoted context below).  Thanks,

Alex

> +		goto out;
> +
>  	vaddr = dma->vaddr + offset;
>  
>  	if (write) {
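As a rough illustration of the revalidation that would be needed if a
path really must drop iommu->lock to wait: this is a hypothetical
sketch open-coded in a caller that knows the iova, not part of this
patch; vfio_find_dma() is the existing range lookup in this file.

	while (dma->suspended) {
		mutex_unlock(&iommu->lock);
		msleep_interruptible(10);
		mutex_lock(&iommu->lock);
		/*
		 * The vfio_dma may have been unmapped and freed while the
		 * lock was dropped; look it up again before dereferencing.
		 */
		dma = vfio_find_dma(iommu, iova, 1);
		if (!dma || fatal_signal_pending(current))
			return -EFAULT;
	}

Even then the caller has to cope with the mapping changing underneath
it, which is part of why failing the operation and letting userspace
retry looks simpler than waiting with the lock dropped.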