Re: [PATCH V2 9/9] vfio/type1: block on invalid vaddr

On Wed, 27 Jan 2021 17:03:25 -0700
Alex Williamson <alex.williamson@xxxxxxxxxx> wrote:

> On Wed, 27 Jan 2021 18:25:13 -0500
> Steven Sistare <steven.sistare@xxxxxxxxxx> wrote:
> 
> > On 1/22/2021 5:59 PM, Alex Williamson wrote:  
> > > On Tue, 19 Jan 2021 09:48:29 -0800
> > > Steve Sistare <steven.sistare@xxxxxxxxxx> wrote:
> > >     
> > >> Block translation of host virtual address while an iova range has an
> > >> invalid vaddr.
> > >>
> > >> Signed-off-by: Steve Sistare <steven.sistare@xxxxxxxxxx>
> > >> ---
> > >>  drivers/vfio/vfio_iommu_type1.c | 83 +++++++++++++++++++++++++++++++++++++----
> > >>  1 file changed, 76 insertions(+), 7 deletions(-)
> > >>
> > >> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > >> index 0167996..c97573a 100644
> > >> --- a/drivers/vfio/vfio_iommu_type1.c
> > >> +++ b/drivers/vfio/vfio_iommu_type1.c
> > >> @@ -31,6 +31,7 @@
> > >>  #include <linux/rbtree.h>
> > >>  #include <linux/sched/signal.h>
> > >>  #include <linux/sched/mm.h>
> > >> +#include <linux/kthread.h>
> > >>  #include <linux/slab.h>
> > >>  #include <linux/uaccess.h>
> > >>  #include <linux/vfio.h>
> > >> @@ -75,6 +76,7 @@ struct vfio_iommu {
> > >>  	bool			dirty_page_tracking;
> > >>  	bool			pinned_page_dirty_scope;
> > >>  	bool			controlled;
> > >> +	wait_queue_head_t	vaddr_wait;
> > >>  };
> > >>  
> > >>  struct vfio_domain {
> > >> @@ -145,6 +147,8 @@ struct vfio_regions {
> > >>  #define DIRTY_BITMAP_PAGES_MAX	 ((u64)INT_MAX)
> > >>  #define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
> > >>  
> > >> +#define WAITED 1
> > >> +
> > >>  static int put_pfn(unsigned long pfn, int prot);
> > >>  
> > >>  static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
> > >> @@ -505,6 +509,52 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> > >>  }
> > >>  
> > >>  /*
> > >> + * Wait for vaddr of *dma_p to become valid.  iommu lock is dropped if the task
> > >> + * waits, but is re-locked on return.  If the task waits, then return an updated
> > >> + * dma struct in *dma_p.  Return 0 on success with no waiting, 1 on success if    
> > > 
> > > s/1/WAITED/    
> > 
> > OK, but the WAITED state will not need to be returned in the new scheme below.
> >   
> > >> + * waited, and -errno on error.
> > >> + */
> > >> +static int vfio_vaddr_wait(struct vfio_iommu *iommu, struct vfio_dma **dma_p)
> > >> +{
> > >> +	struct vfio_dma *dma = *dma_p;
> > >> +	unsigned long iova = dma->iova;
> > >> +	size_t size = dma->size;
> > >> +	int ret = 0;
> > >> +	DEFINE_WAIT(wait);
> > >> +
> > >> +	while (!dma->vaddr_valid) {
> > >> +		ret = WAITED;
> > >> +		prepare_to_wait(&iommu->vaddr_wait, &wait, TASK_KILLABLE);
> > >> +		mutex_unlock(&iommu->lock);
> > >> +		schedule();
> > >> +		mutex_lock(&iommu->lock);
> > >> +		finish_wait(&iommu->vaddr_wait, &wait);
> > >> +		if (kthread_should_stop() || !iommu->controlled ||
> > >> +		    fatal_signal_pending(current)) {
> > >> +			return -EFAULT;
> > >> +		}
> > >> +		*dma_p = dma = vfio_find_dma(iommu, iova, size);
> > >> +		if (!dma)
> > >> +			return -EINVAL;
> > >> +	}
> > >> +	return ret;
> > >> +}
> > >> +
> > >> +/*
> > >> + * Find dma struct and wait for its vaddr to be valid.  iommu lock is dropped
> > >> + * if the task waits, but is re-locked on return.  Return result in *dma_p.
> > >> + * Return 0 on success, 1 on success if waited,  and -errno on error.
> > >> + */
> > >> +static int vfio_find_vaddr(struct vfio_iommu *iommu, dma_addr_t start,
> > >> +			   size_t size, struct vfio_dma **dma_p)    
> > > 
> > > more of a vfio_find_dma_valid()    
> > 
> > I will slightly modify and rename this with the new scheme I describe below.
> >   
> > >> +{
> > >> +	*dma_p = vfio_find_dma(iommu, start, size);
> > >> +	if (!*dma_p)
> > >> +		return -EINVAL;
> > >> +	return vfio_vaddr_wait(iommu, dma_p);
> > >> +}
> > >> +
> > >> +/*
> > >>   * Attempt to pin pages.  We really don't want to track all the pfns and
> > >>   * the iommu can only map chunks of consecutive pfns anyway, so get the
> > >>   * first page and all consecutive pages with the same locking.
> > >> @@ -693,11 +743,11 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> > >>  		struct vfio_pfn *vpfn;
> > >>  
> > >>  		iova = user_pfn[i] << PAGE_SHIFT;
> > >> -		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
> > >> -		if (!dma) {
> > >> -			ret = -EINVAL;
> > >> +		ret = vfio_find_vaddr(iommu, iova, PAGE_SIZE, &dma);
> > >> +		if (ret < 0)
> > >>  			goto pin_unwind;
> > >> -		}
> > >> +		else if (ret == WAITED)
> > >> +			do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);    
> > > 
> > > We're iterating through an array of pfns to provide translations, once
> > > we've released the lock it's not just the current one that could be
> > > invalid.  I'm afraid we need to unwind and try again, but I think it's
> > > actually worse than that because if we've marked pages within a
> > > vfio_dma's pfn_list as pinned, then during the locking gap it gets
> > > unmapped, the unmap path will call the unmap notifier chain to release
> > > the page that the vendor driver doesn't have yet.  Yikes!    
> > 
> > Yikes indeed.  The fix is easy, though.  I will maintain a count in vfio_iommu of
> > vfio_dma objects with invalid vaddrs, modified and tested while holding the lock,
> > provide a function to wait for the count to become 0, and call that function at the
> > start of vfio_iommu_type1_pin_pages and vfio_iommu_replay.  I will use iommu->vaddr_wait
> > for wait and wake.
> 
> I prefer the overhead of this, but the resulting behavior seems pretty
> non-intuitive.  Any invalidated vaddr blocks all vaddr use cases, which
> almost suggests the unmap _VADDR flag should only be allowed with the
> _ALL flag, but then the map _VADDR flag can only be per mapping, which
> would make accounting and recovering from _VADDR + _ALL pretty
> complicated.  Thanks,

I wonder if there's a hybrid approach: a counter on the vfio_iommu
which, when non-zero, causes pin pages to pre-test the vaddr on all
required vfio_dma objects, waiting and being woken on counter decrement
to check again.  Thanks,

Alex
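
A minimal sketch of how that counter-based wait might look, assuming a
hypothetical vaddr_invalid_count field beside vaddr_wait and a helper
named vfio_wait_all_valid() (neither name is from the posted series);
the loop mirrors the vfio_vaddr_wait() quoted below:

/*
 * Sketch only.  vaddr_invalid_count counts vfio_dma entries whose vaddr
 * is currently invalid; it is only modified while holding iommu->lock:
 *
 *	unsigned int		vaddr_invalid_count;
 *	wait_queue_head_t	vaddr_wait;
 */

/*
 * Block until no vfio_dma has an invalid vaddr.  Called with
 * iommu->lock held; the lock is dropped while sleeping and re-taken
 * before returning.
 */
static int vfio_wait_all_valid(struct vfio_iommu *iommu)
{
	DEFINE_WAIT(wait);

	while (iommu->vaddr_invalid_count) {
		prepare_to_wait(&iommu->vaddr_wait, &wait, TASK_KILLABLE);
		mutex_unlock(&iommu->lock);
		schedule();
		mutex_lock(&iommu->lock);
		finish_wait(&iommu->vaddr_wait, &wait);
		if (kthread_should_stop() || !iommu->controlled ||
		    fatal_signal_pending(current))
			return -EFAULT;
	}
	return 0;
}

/*
 * Unmapping with the _VADDR flag would increment vaddr_invalid_count;
 * re-mapping the vaddr would decrement it and wake the waiters:
 *
 *	if (--iommu->vaddr_invalid_count == 0)
 *		wake_up_all(&iommu->vaddr_wait);
 *
 * vfio_iommu_type1_pin_pages() and vfio_iommu_replay() would call
 * vfio_wait_all_valid() once, before iterating, so pinned pages are
 * never exposed across a lock gap with stale translations.
 */

In the common case, with nothing invalidated, this is a single counter
test under the lock rather than a walk of dma_list.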

> > >>  		if ((dma->prot & prot) != prot) {
> > >>  			ret = -EPERM;
> > >> @@ -1496,6 +1546,22 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
> > >>  	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > >>  	int ret;
> > >>  
> > >> +	/*
> > >> +	 * Wait for all vaddr to be valid so they can be used in the main loop.
> > >> +	 * If we do wait, the lock was dropped and re-taken, so start over to
> > >> +	 * ensure the dma list is consistent.
> > >> +	 */
> > >> +again:
> > >> +	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
> > >> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> > >> +
> > >> +		ret = vfio_vaddr_wait(iommu, &dma);
> > >> +		if (ret < 0)
> > >> +			return ret;
> > >> +		else if (ret == WAITED)
> > >> +			goto again;
> > >> +	}    
> > > 
> > > This "do nothing until all the vaddrs are valid" approach could work
> > > above too, but gosh is that a lot of cache unfriendly work for a rare
> > > invalidation.  Thanks,    
> > 
> > The new wait function described above is fast in the common case, just a check that
> > the invalid vaddr count is 0.
> > 
> > - Steve
> >   
> > >> +
> > >>  	/* Arbitrarily pick the first domain in the list for lookups */
> > >>  	if (!list_empty(&iommu->domain_list))
> > >>  		d = list_first_entry(&iommu->domain_list,
> > >> @@ -2522,6 +2588,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
> > >>  	iommu->controlled = true;
> > >>  	mutex_init(&iommu->lock);
> > >>  	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
> > >> +	init_waitqueue_head(&iommu->vaddr_wait);
> > >>  
> > >>  	return iommu;
> > >>  }
> > >> @@ -2972,12 +3039,13 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
> > >>  	struct vfio_dma *dma;
> > >>  	bool kthread = current->mm == NULL;
> > >>  	size_t offset;
> > >> +	int ret;
> > >>  
> > >>  	*copied = 0;
> > >>  
> > >> -	dma = vfio_find_dma(iommu, user_iova, 1);
> > >> -	if (!dma)
> > >> -		return -EINVAL;
> > >> +	ret = vfio_find_vaddr(iommu, user_iova, 1, &dma);
> > >> +	if (ret < 0)
> > >> +		return ret;
> > >>  
> > >>  	if ((write && !(dma->prot & IOMMU_WRITE)) ||
> > >>  			!(dma->prot & IOMMU_READ))
> > >> @@ -3055,6 +3123,7 @@ static void vfio_iommu_type1_notify(void *iommu_data, unsigned int event,
> > >>  	mutex_lock(&iommu->lock);
> > >>  	iommu->controlled = false;
> > >>  	mutex_unlock(&iommu->lock);
> > >> +	wake_up_all(&iommu->vaddr_wait);
> > >>  }
> > >>  
> > >>  static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {    
> > >     
> >   
> 



