On Wed, 14 Dec 2022 11:22:48 -0800
Steve Sistare <steven.sistare@xxxxxxxxxx> wrote:

> When a vfio container is preserved across exec using the VFIO_UPDATE_VADDR
> interfaces, locked_vm of the new mm becomes 0.  If the user later unmaps a
> dma mapping, locked_vm underflows to a large unsigned value, and a
> subsequent dma map request fails with ENOMEM in __account_locked_vm.
>
> To avoid underflow, do not decrement locked_vm during unmap if the
> dma's mm has changed.  To restore the correct locked_vm count, when
> VFIO_DMA_MAP_FLAG_VADDR is used and the dma's mm has changed, add
> the mapping's pinned page count to the new mm->locked_vm, subject
> to the rlimit.  Now that mediated devices are excluded when using
> VFIO_UPDATE_VADDR, the amount of pinned memory equals the size of
> the mapping.
>
> Fixes: c3cbab24db38 ("vfio/type1: implement interfaces to update vaddr")
>
> Signed-off-by: Steve Sistare <steven.sistare@xxxxxxxxxx>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 49 +++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 49 insertions(+)
>
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index b04f485..e719c13 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -100,6 +100,7 @@ struct vfio_dma {
>  	struct task_struct	*task;
>  	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
>  	unsigned long		*bitmap;
> +	struct mm_struct	*mm;
>  };
>
>  struct vfio_batch {
> @@ -424,6 +425,12 @@ static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
>  	if (!mm)
>  		return -ESRCH; /* process exited */
>
> +	/* Avoid locked_vm underflow */
> +	if (dma->mm != mm && npage < 0) {
> +		mmput(mm);
> +		return 0;
> +	}

How about initializing ret = 0 and jumping to the existing mmput() with
a goto, so there are no assumptions about whether we need the mmput() or
not?  (Rough sketch below.)

> +
>  	ret = mmap_write_lock_killable(mm);
>  	if (!ret) {
>  		ret = __account_locked_vm(mm, abs(npage), npage > 0, dma->task,
> @@ -1180,6 +1187,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  	vfio_unmap_unpin(iommu, dma, true);
>  	vfio_unlink_dma(iommu, dma);
>  	put_task_struct(dma->task);
> +	mmdrop(dma->mm);
>  	vfio_dma_bitmap_free(dma);
>  	if (dma->vaddr_invalid) {
>  		iommu->vaddr_invalid_count--;
> @@ -1578,6 +1586,42 @@ static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
>  	return list_empty(iova);
>  }
>
> +static int vfio_change_dma_owner(struct vfio_dma *dma)
> +{
> +	int ret = 0;
> +	struct mm_struct *mm = get_task_mm(dma->task);
> +
> +	if (dma->mm != mm) {
> +		long npage = dma->size >> PAGE_SHIFT;
> +		bool new_lock_cap = capable(CAP_IPC_LOCK);
> +		struct task_struct *new_task = current->group_leader;
> +
> +		ret = mmap_write_lock_killable(new_task->mm);
> +		if (ret)
> +			goto out;
> +
> +		ret = __account_locked_vm(new_task->mm, npage, true,
> +					  new_task, new_lock_cap);
> +		mmap_write_unlock(new_task->mm);
> +		if (ret)
> +			goto out;
> +
> +		if (dma->task != new_task) {
> +			vfio_lock_acct(dma, -npage, 0);
> +			put_task_struct(dma->task);
> +			dma->task = get_task_struct(new_task);
> +		}

IIUC, we're essentially open coding vfio_lock_acct() in the previous
section so that we can be sure we've accounted the new task before we
credit the previous task.  However, I was under the impression that the
task remains the same but the mm changes, which is how we end up with
the underflow described in the commit log.  What circumstances cause a
task change?
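Re the vfio_lock_acct() comment above, untested, but something like the
following is what I have in mind, assuming the rest of the function is
otherwise unchanged (ie. mm comes from get_task_mm() only in the async
case, with a matching mmput() at the end):

static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
{
	struct mm_struct *mm;
	int ret = 0;	/* the early exit below reports success */

	if (!npage)
		return 0;

	mm = async ? get_task_mm(dma->task) : dma->task->mm;
	if (!mm)
		return -ESRCH; /* process exited */

	/* Avoid locked_vm underflow; exit through the common cleanup */
	if (dma->mm != mm && npage < 0)
		goto out;

	ret = mmap_write_lock_killable(mm);
	if (!ret) {
		ret = __account_locked_vm(mm, abs(npage), npage > 0,
					  dma->task, dma->lock_cap);
		mmap_write_unlock(mm);
	}
out:
	/* mmput() only pairs with the get_task_mm() in the async case */
	if (async)
		mmput(mm);

	return ret;
}

That way the early exit and the normal path share the same cleanup, and
any future change to how we acquire the mm only needs to happen in one
place.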
Thanks,
Alex

> +		mmdrop(dma->mm);
> +		dma->mm = new_task->mm;
> +		mmgrab(dma->mm);
> +		dma->lock_cap = new_lock_cap;
> +	}
> +out:
> +	if (mm)
> +		mmput(mm);
> +	return ret;
> +}
> +
>  static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  			   struct vfio_iommu_type1_dma_map *map)
>  {
> @@ -1627,6 +1671,9 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  		    dma->size != size) {
>  			ret = -EINVAL;
>  		} else {
> +			ret = vfio_change_dma_owner(dma);
> +			if (ret)
> +				goto out_unlock;
>  			dma->vaddr = vaddr;
>  			dma->vaddr_invalid = false;
>  			iommu->vaddr_invalid_count--;
> @@ -1687,6 +1734,8 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  	get_task_struct(current->group_leader);
>  	dma->task = current->group_leader;
>  	dma->lock_cap = capable(CAP_IPC_LOCK);
> +	dma->mm = dma->task->mm;
> +	mmgrab(dma->mm);
>
>  	dma->pfn_list = RB_ROOT;
>