Hi,

On 16/05/2017 21:52, Alex Williamson wrote:
> commit 0cfef2b7410b64d7a430947e0b533314c4f97153 upstream.
> 
> If the mmap_sem is contented then the vfio type1 IOMMU backend will
> defer locked page accounting updates to a workqueue task. This has a
> few problems and depending on which side the user tries to play, they
> might be over-penalized for unmaps that haven't yet been accounted or
> race the workqueue to enter more mappings than they're allowed. The
> original intent of this workqueue mechanism seems to be focused on
> reducing latency through the ioctl, but we cannot do so at the cost
> of correctness. Remove this workqueue mechanism and update the
> callers to allow for failure. We can also now recheck the limit under
> write lock to make sure we don't exceed it.
> 
> vfio_pin_pages_remote() also now necessarily includes an unwind path
> which we can jump to directly if the consecutive page pinning finds
> that we're exceeding the user's memory limits. This avoids the
> current lazy approach which does accounting and mapping up to the
> fault, only to return an error on the next iteration to unwind the
> entire vfio_dma.
> 
> Cc: stable@xxxxxxxxxxxxxxx
> Reviewed-by: Peter Xu <peterx@xxxxxxxxxx>
> Reviewed-by: Kirti Wankhede <kwankhede@xxxxxxxxxx>
> Signed-off-by: Alex Williamson <alex.williamson@xxxxxxxxxx>

Reviewed-by: Eric Auger <eric.auger@xxxxxxxxxx>

Eric

> ---
> 
> Proposed backport for kernels v4.7..v4.9 (pre-mdev, with down_write_killable)
> 
>  drivers/vfio/vfio_iommu_type1.c |  102 +++++++++++++++++----------------------
>  1 file changed, 44 insertions(+), 58 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 2ba19424e4a1..1d48e62f4f52 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -130,57 +130,36 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
>  	rb_erase(&old->node, &iommu->dma_list);
>  }
>  
> -struct vwork {
> -	struct mm_struct	*mm;
> -	long			npage;
> -	struct work_struct	work;
> -};
> -
> -/* delayed decrement/increment for locked_vm */
> -static void vfio_lock_acct_bg(struct work_struct *work)
> +static int vfio_lock_acct(long npage, bool *lock_cap)
>  {
> -	struct vwork *vwork = container_of(work, struct vwork, work);
> -	struct mm_struct *mm;
> -
> -	mm = vwork->mm;
> -	down_write(&mm->mmap_sem);
> -	mm->locked_vm += vwork->npage;
> -	up_write(&mm->mmap_sem);
> -	mmput(mm);
> -	kfree(vwork);
> -}
> +	int ret;
>  
> -static void vfio_lock_acct(long npage)
> -{
> -	struct vwork *vwork;
> -	struct mm_struct *mm;
> +	if (!npage)
> +		return 0;
>  
> -	if (!current->mm || !npage)
> -		return; /* process exited or nothing to do */
> +	if (!current->mm)
> +		return -ESRCH; /* process exited */
> +
> +	ret = down_write_killable(&current->mm->mmap_sem);
> +	if (!ret) {
> +		if (npage > 0) {
> +			if (lock_cap ? !*lock_cap : !capable(CAP_IPC_LOCK)) {
> +				unsigned long limit;
> +
> +				limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +
> +				if (current->mm->locked_vm + npage > limit)
> +					ret = -ENOMEM;
> +			}
> +		}
> +
> +		if (!ret)
> +			current->mm->locked_vm += npage;
>  
> -	if (down_write_trylock(&current->mm->mmap_sem)) {
> -		current->mm->locked_vm += npage;
>  		up_write(&current->mm->mmap_sem);
> -		return;
>  	}
>  
> -	/*
> -	 * Couldn't get mmap_sem lock, so must setup to update
> -	 * mm->locked_vm later. If locked_vm were atomic, we
> -	 * wouldn't need this silliness
> -	 */
> -	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
> -	if (!vwork)
> -		return;
> -	mm = get_task_mm(current);
> -	if (!mm) {
> -		kfree(vwork);
> -		return;
> -	}
> -	INIT_WORK(&vwork->work, vfio_lock_acct_bg);
> -	vwork->mm = mm;
> -	vwork->npage = npage;
> -	schedule_work(&vwork->work);
> +	return ret;
>  }
>  
>  /*
> @@ -262,9 +241,9 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
>  static long vfio_pin_pages(unsigned long vaddr, long npage,
>  			   int prot, unsigned long *pfn_base)
>  {
> -	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +	unsigned long pfn = 0, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>  	bool lock_cap = capable(CAP_IPC_LOCK);
> -	long ret, i;
> +	long ret, i = 1;
>  	bool rsvd;
>  
>  	if (!current->mm)
> @@ -283,16 +262,11 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  		return -ENOMEM;
>  	}
>  
> -	if (unlikely(disable_hugepages)) {
> -		if (!rsvd)
> -			vfio_lock_acct(1);
> -		return 1;
> -	}
> +	if (unlikely(disable_hugepages))
> +		goto out;
>  
>  	/* Lock all the consecutive pages from pfn_base */
> -	for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
> -		unsigned long pfn = 0;
> -
> +	for (vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
>  		ret = vaddr_get_pfn(vaddr, prot, &pfn);
>  		if (ret)
>  			break;
> @@ -308,12 +282,24 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  			put_pfn(pfn, prot);
>  			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
>  				__func__, limit << PAGE_SHIFT);
> -			break;
> +			ret = -ENOMEM;
> +			goto unpin_out;
>  		}
>  	}
>  
> +out:
>  	if (!rsvd)
> -		vfio_lock_acct(i);
> +		ret = vfio_lock_acct(i, &lock_cap);
> +
> +unpin_out:
> +	if (ret) {
> +		if (!rsvd) {
> +			for (pfn = *pfn_base ; i ; pfn++, i--)
> +				put_pfn(pfn, prot);
> +		}
> +
> +		return ret;
> +	}
>  
>  	return i;
>  }
> @@ -328,7 +314,7 @@ static long vfio_unpin_pages(unsigned long pfn, long npage,
>  		unlocked += put_pfn(pfn++, prot);
>  
>  	if (do_accounting)
> -		vfio_lock_acct(-unlocked);
> +		vfio_lock_acct(-unlocked, NULL);
>  
>  	return unlocked;
>  }
> @@ -390,7 +376,7 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
> 		cond_resched();
>  	}
>  
> -	vfio_lock_acct(-unlocked);
> +	vfio_lock_acct(-unlocked, NULL);
>  }
>  
>  static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
> 
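P.S. For readers less familiar with the type1 backend, below is a minimal userspace-side sketch of what the stricter accounting means in practice. It is not part of the patch: the map_dma() helper and the `container` fd are assumed (a VFIO container already opened and bound to VFIO_TYPE1_IOMMU). With the workqueue gone, a VFIO_IOMMU_MAP_DMA whose pinned pages would push locked_vm past RLIMIT_MEMLOCK now fails synchronously with ENOMEM, so the caller simply needs a memlock limit sized for its mappings rather than logic to cope with deferred accounting.

/*
 * Userspace sketch (illustrative only; map_dma() and `container` are
 * assumptions, not part of the patch): with deferred accounting
 * removed, VFIO_IOMMU_MAP_DMA fails with ENOMEM as soon as the pinned
 * pages would exceed RLIMIT_MEMLOCK, instead of racing a workqueue
 * update of locked_vm.
 */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/resource.h>
#include <linux/vfio.h>

static int map_dma(int container, void *vaddr, uint64_t iova, uint64_t size)
{
	struct vfio_iommu_type1_dma_map map = {
		.argsz = sizeof(map),
		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
		.vaddr = (uint64_t)(uintptr_t)vaddr,
		.iova  = iova,
		.size  = size,
	};
	struct rlimit rl;

	/*
	 * The kernel now rechecks locked_vm against RLIMIT_MEMLOCK under
	 * the mmap_sem write lock, so warn early if the limit obviously
	 * cannot cover this mapping (other locked memory is not counted
	 * here, so this is only a coarse check).
	 */
	if (!getrlimit(RLIMIT_MEMLOCK, &rl) && rl.rlim_cur < size)
		fprintf(stderr, "RLIMIT_MEMLOCK (%llu) below mapping size %llu\n",
			(unsigned long long)rl.rlim_cur,
			(unsigned long long)size);

	if (ioctl(container, VFIO_IOMMU_MAP_DMA, &map))
		return -errno;	/* ENOMEM includes the memlock-limit case */

	return 0;
}

The only behavioral difference visible to a well-behaved user is that over-limit maps now fail immediately; the partial pin is unwound inside the kernel via the new unpin_out path, so no cleanup beyond handling the error is needed.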