Re: [PATCH v4.9] vfio/type1: Remove locked page accounting workqueue

Auger Eric <eric.auger@xxxxxxxxxx> · Mon, 22 May 2017 10:15:43 +0200



Hi,

On 16/05/2017 21:52, Alex Williamson wrote:
> commit 0cfef2b7410b64d7a430947e0b533314c4f97153 upstream.
> 
> If the mmap_sem is contented then the vfio type1 IOMMU backend will
> defer locked page accounting updates to a workqueue task.  This has a
> few problems and depending on which side the user tries to play, they
> might be over-penalized for unmaps that haven't yet been accounted or
> race the workqueue to enter more mappings than they're allowed.  The
> original intent of this workqueue mechanism seems to be focused on
> reducing latency through the ioctl, but we cannot do so at the cost
> of correctness.  Remove this workqueue mechanism and update the
> callers to allow for failure.  We can also now recheck the limit under
> write lock to make sure we don't exceed it.
> 
> vfio_pin_pages_remote() also now necessarily includes an unwind path
> which we can jump to directly if the consecutive page pinning finds
> that we're exceeding the user's memory limits.  This avoids the
> current lazy approach which does accounting and mapping up to the
> fault, only to return an error on the next iteration to unwind the
> entire vfio_dma.
> 
> Cc: stable@xxxxxxxxxxxxxxx
> Reviewed-by: Peter Xu <peterx@xxxxxxxxxx>
> Reviewed-by: Kirti Wankhede <kwankhede@xxxxxxxxxx>
> Signed-off-by: Alex Williamson <alex.williamson@xxxxxxxxxx>
Reviewed-by: Eric Auger <eric.auger@xxxxxxxxxx>

Eric
> ---
> 
> Proposed backport for kernels v4.7..v4.9 (pre-mdev, with down_write_killable)
> 
>  drivers/vfio/vfio_iommu_type1.c |  102 +++++++++++++++++----------------------
>  1 file changed, 44 insertions(+), 58 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 2ba19424e4a1..1d48e62f4f52 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -130,57 +130,36 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
>  	rb_erase(&old->node, &iommu->dma_list);
>  }
>  
> -struct vwork {
> -	struct mm_struct	*mm;
> -	long			npage;
> -	struct work_struct	work;
> -};
> -
> -/* delayed decrement/increment for locked_vm */
> -static void vfio_lock_acct_bg(struct work_struct *work)
> +static int vfio_lock_acct(long npage, bool *lock_cap)
>  {
> -	struct vwork *vwork = container_of(work, struct vwork, work);
> -	struct mm_struct *mm;
> -
> -	mm = vwork->mm;
> -	down_write(&mm->mmap_sem);
> -	mm->locked_vm += vwork->npage;
> -	up_write(&mm->mmap_sem);
> -	mmput(mm);
> -	kfree(vwork);
> -}
> +	int ret;
>  
> -static void vfio_lock_acct(long npage)
> -{
> -	struct vwork *vwork;
> -	struct mm_struct *mm;
> +	if (!npage)
> +		return 0;
>  
> -	if (!current->mm || !npage)
> -		return; /* process exited or nothing to do */
> +	if (!current->mm)
> +		return -ESRCH; /* process exited */
> +
> +	ret = down_write_killable(&current->mm->mmap_sem);
> +	if (!ret) {
> +		if (npage > 0) {
> +			if (lock_cap ? !*lock_cap : !capable(CAP_IPC_LOCK)) {
> +				unsigned long limit;
> +
> +				limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +
> +				if (current->mm->locked_vm + npage > limit)
> +					ret = -ENOMEM;
> +			}
> +		}
> +
> +		if (!ret)
> +			current->mm->locked_vm += npage;
>  
> -	if (down_write_trylock(&current->mm->mmap_sem)) {
> -		current->mm->locked_vm += npage;
>  		up_write(&current->mm->mmap_sem);
> -		return;
>  	}
>  
> -	/*
> -	 * Couldn't get mmap_sem lock, so must setup to update
> -	 * mm->locked_vm later. If locked_vm were atomic, we
> -	 * wouldn't need this silliness
> -	 */
> -	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
> -	if (!vwork)
> -		return;
> -	mm = get_task_mm(current);
> -	if (!mm) {
> -		kfree(vwork);
> -		return;
> -	}
> -	INIT_WORK(&vwork->work, vfio_lock_acct_bg);
> -	vwork->mm = mm;
> -	vwork->npage = npage;
> -	schedule_work(&vwork->work);
> +	return ret;
>  }
>  
>  /*
> @@ -262,9 +241,9 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
>  static long vfio_pin_pages(unsigned long vaddr, long npage,
>  			   int prot, unsigned long *pfn_base)
>  {
> -	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +	unsigned long pfn = 0, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>  	bool lock_cap = capable(CAP_IPC_LOCK);
> -	long ret, i;
> +	long ret, i = 1;
>  	bool rsvd;
>  
>  	if (!current->mm)
> @@ -283,16 +262,11 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  		return -ENOMEM;
>  	}
>  
> -	if (unlikely(disable_hugepages)) {
> -		if (!rsvd)
> -			vfio_lock_acct(1);
> -		return 1;
> -	}
> +	if (unlikely(disable_hugepages))
> +		goto out;
>  
>  	/* Lock all the consecutive pages from pfn_base */
> -	for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
> -		unsigned long pfn = 0;
> -
> +	for (vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
>  		ret = vaddr_get_pfn(vaddr, prot, &pfn);
>  		if (ret)
>  			break;
> @@ -308,12 +282,24 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  			put_pfn(pfn, prot);
>  			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
>  				__func__, limit << PAGE_SHIFT);
> -			break;
> +			ret = -ENOMEM;
> +			goto unpin_out;
>  		}
>  	}
>  
> +out:
>  	if (!rsvd)
> -		vfio_lock_acct(i);
> +		ret = vfio_lock_acct(i, &lock_cap);
> +
> +unpin_out:
> +	if (ret) {
> +		if (!rsvd) {
> +			for (pfn = *pfn_base ; i ; pfn++, i--)
> +				put_pfn(pfn, prot);
> +		}
> +
> +		return ret;
> +	}
>  
>  	return i;
>  }
> @@ -328,7 +314,7 @@ static long vfio_unpin_pages(unsigned long pfn, long npage,
>  		unlocked += put_pfn(pfn++, prot);
>  
>  	if (do_accounting)
> -		vfio_lock_acct(-unlocked);
> +		vfio_lock_acct(-unlocked, NULL);
>  
>  	return unlocked;
>  }
> @@ -390,7 +376,7 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  		cond_resched();
>  	}
>  
> -	vfio_lock_acct(-unlocked);
> +	vfio_lock_acct(-unlocked, NULL);
>  }
>  
>  static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
>