vfio_pin_pages() accepts an array of unrelated iova pfns and processes each to return the physical pfn. When dealing with large arrays of contiguous iovas, vfio_iommu_type1_pin_pages is very inefficient because it processes them page by page. In this case, we can divide the iova pfn array into multiple contiguous ranges and pin each range in one batch. For example, when the iova pfn array is {1,5,6,7,9}, it will be divided into three groups {1}, {5,6,7}, {9} for processing. When processing {5,6,7}, the number of calls to pin_user_pages_remote is reduced from three to one. For a single page or a large array of discontiguous iovas, we still use vfio_pin_page_external, to limit the performance loss caused by the refactoring. Signed-off-by: Xiaoyang Xu <xuxiaoyang2@xxxxxxxxxx> --- v1 -> v2: * make vfio_iommu_type1_pin_contiguous_pages use vfio_pin_page_external to pin a single page when npage=1 * make vfio_pin_contiguous_pages_external call bitmap_set() with npage to mark consecutive pages as dirty, and simplify the unwind logic * remove unnecessary checks in vfio_get_contiguous_pages_length, put the least costly checks first, and replace vfio_iova_get_vfio_pfn with vfio_find_vpfn drivers/vfio/vfio_iommu_type1.c | 231 ++++++++++++++++++++++++++++---- 1 file changed, 204 insertions(+), 27 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 67e827638995..080727b531c6 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -628,6 +628,196 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova, return unlocked; } +static int contiguous_vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, + int prot, long npage, unsigned long *phys_pfn) +{ + struct page **pages = NULL; + unsigned int flags = 0; + int i, ret; + + pages = kvmalloc_array(npage, sizeof(struct page *), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + if (prot & IOMMU_WRITE) + flags |= FOLL_WRITE; + + 
mmap_read_lock(mm); + ret = pin_user_pages_remote(mm, vaddr, npage, flags | FOLL_LONGTERM, + pages, NULL, NULL); + mmap_read_unlock(mm); + + for (i = 0; i < ret; i++) + *(phys_pfn + i) = page_to_pfn(pages[i]); + + kvfree(pages); + + return ret; +} + +static int vfio_pin_contiguous_pages_external(struct vfio_iommu *iommu, + struct vfio_dma *dma, + unsigned long *user_pfn, + int npage, unsigned long *phys_pfn, + bool do_accounting) +{ + int ret, i, j, lock_acct = 0; + unsigned long remote_vaddr; + dma_addr_t iova; + struct mm_struct *mm; + struct vfio_pfn *vpfn; + + mm = get_task_mm(dma->task); + if (!mm) + return -ENODEV; + + iova = user_pfn[0] << PAGE_SHIFT; + remote_vaddr = dma->vaddr + iova - dma->iova; + ret = contiguous_vaddr_get_pfn(mm, remote_vaddr, dma->prot, + npage, phys_pfn); + mmput(mm); + if (ret <= 0) + return ret; + + npage = ret; + for (i = 0; i < npage; i++) { + iova = user_pfn[i] << PAGE_SHIFT; + ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]); + if (ret) + goto unwind; + + if (!is_invalid_reserved_pfn(phys_pfn[i])) + lock_acct++; + } + + if (do_accounting) { + ret = vfio_lock_acct(dma, lock_acct, true); + if (ret) { + if (ret == -ENOMEM) + pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n", + __func__, dma->task->comm, task_pid_nr(dma->task), + task_rlimit(dma->task, RLIMIT_MEMLOCK)); + goto unwind; + } + } + + if (iommu->dirty_page_tracking) { + unsigned long pgshift = __ffs(iommu->pgsize_bitmap); + + /* + * Bitmap populated with the smallest supported page + * size + */ + bitmap_set(dma->bitmap, + ((user_pfn[0] << PAGE_SHIFT) - dma->iova) >> pgshift, npage); + } + + return i; +unwind: + for (j = 0; j < npage; j++) { + if (j < i) { + iova = user_pfn[j] << PAGE_SHIFT; + vpfn = vfio_find_vpfn(dma, iova); + vfio_iova_put_vfio_pfn(dma, vpfn); + } else { + put_pfn(phys_pfn[j], dma->prot); + } + + phys_pfn[j] = 0; + } + + return ret; +} + +static int vfio_iommu_type1_pin_contiguous_pages(struct vfio_iommu *iommu, + struct vfio_dma *dma, + 
unsigned long *user_pfn, + int npage, unsigned long *phys_pfn, + bool do_accounting) +{ + int ret = 0, i, j; + unsigned long remote_vaddr; + dma_addr_t iova; + + if (npage == 1) + goto pin_single_page; + + ret = vfio_pin_contiguous_pages_external(iommu, dma, user_pfn, npage, + phys_pfn, do_accounting); + if (ret == npage) + return ret; + + if (ret < 0) + ret = 0; + +pin_single_page: + for (i = ret; i < npage; i++) { + iova = user_pfn[i] << PAGE_SHIFT; + remote_vaddr = dma->vaddr + iova - dma->iova; + + ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i], + do_accounting); + if (ret) + goto pin_unwind; + + ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]); + if (ret) { + if (put_pfn(phys_pfn[i], dma->prot) && do_accounting) + vfio_lock_acct(dma, -1, true); + goto pin_unwind; + } + + if (iommu->dirty_page_tracking) { + unsigned long pgshift = __ffs(iommu->pgsize_bitmap); + + /* + * Bitmap populated with the smallest supported page + * size + */ + bitmap_set(dma->bitmap, + (iova - dma->iova) >> pgshift, 1); + } + } + + return i; + +pin_unwind: + phys_pfn[i] = 0; + for (j = 0; j < i; j++) { + iova = user_pfn[j] << PAGE_SHIFT; + vfio_unpin_page_external(dma, iova, do_accounting); + phys_pfn[j] = 0; + } + + return ret; +} + +static int vfio_get_contiguous_pages_length(struct vfio_dma *dma, + unsigned long *user_pfn, int npage) +{ + int i; + dma_addr_t iova = user_pfn[0] << PAGE_SHIFT; + struct vfio_pfn *vpfn; + + if (npage <= 1) + return npage; + + for (i = 1; i < npage; i++) { + if (user_pfn[i] != user_pfn[0] + i) + break; + + iova = user_pfn[i] << PAGE_SHIFT; + if (iova >= dma->iova + dma->size || + iova + PAGE_SIZE <= dma->iova) + break; + + vpfn = vfio_find_vpfn(dma, iova); + if (vpfn) + break; + } + return i; +} + static int vfio_iommu_type1_pin_pages(void *iommu_data, struct iommu_group *iommu_group, unsigned long *user_pfn, @@ -637,9 +827,9 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, struct vfio_iommu *iommu = iommu_data; struct vfio_group 
*group; int i, j, ret; - unsigned long remote_vaddr; struct vfio_dma *dma; bool do_accounting; + int contiguous_npage; if (!iommu || !user_pfn || !phys_pfn) return -EINVAL; @@ -663,7 +853,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, */ do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu); - for (i = 0; i < npage; i++) { + for (i = 0; i < npage; i += contiguous_npage) { dma_addr_t iova; struct vfio_pfn *vpfn; @@ -682,31 +872,18 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, vpfn = vfio_iova_get_vfio_pfn(dma, iova); if (vpfn) { phys_pfn[i] = vpfn->pfn; - continue; - } - - remote_vaddr = dma->vaddr + (iova - dma->iova); - ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i], - do_accounting); - if (ret) - goto pin_unwind; - - ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]); - if (ret) { - if (put_pfn(phys_pfn[i], dma->prot) && do_accounting) - vfio_lock_acct(dma, -1, true); - goto pin_unwind; - } - - if (iommu->dirty_page_tracking) { - unsigned long pgshift = __ffs(iommu->pgsize_bitmap); - - /* - * Bitmap populated with the smallest supported page - * size - */ - bitmap_set(dma->bitmap, - (iova - dma->iova) >> pgshift, 1); + contiguous_npage = 1; + } else { + ret = vfio_get_contiguous_pages_length(dma, + &user_pfn[i], npage - i); + if (ret < 0) + goto pin_unwind; + + ret = vfio_iommu_type1_pin_contiguous_pages(iommu, + dma, &user_pfn[i], ret, &phys_pfn[i], do_accounting); + if (ret < 0) + goto pin_unwind; + contiguous_npage = ret; } } ret = i; -- 2.19.1