Combine contiguous regions of PAGE_SIZE pages into single scatter list entries while adding to the scatter table. This minimizes the number of the entries in the scatter list and reduces the DMA mapping overhead, particularly with the IOMMU. Also, purge npages in struct ib_umem as we now DMA map the umem SGL with sg_nents, and update remaining non ODP drivers that use umem->npages. Move npages tracking to ib_umem_odp as ODP drivers still need it. This patch should be applied post https://patchwork.kernel.org/cover/10857607/ Suggested-by: Jason Gunthorpe <jgg@xxxxxxxx> Reviewed-by: Michael J. Ruhl <michael.j.ruhl@xxxxxxxxx> Reviewed-by: Ira Weiny <ira.weiny@xxxxxxxxx> Signed-off-by: Shiraz Saleem <shiraz.saleem@xxxxxxxxx> --- This patch is decoupled from series https://patchwork.kernel.org/patch/10819993/ as it can be standalone. Changes since v1: * made ib_umem_add_sg_table() static * simplified bool variable 'first' initialization v2->v3: * Added a comment on calculation for checking page_list contiguity in ib_umem_add_sg_table() * Moved iterator to body of for loop in ib_umem_add_sg_table() * Removed explicit update to sg->length prior to calling sg_set_page() * Removed white space clean-up in ib_umem struct * Purge npages from ib_umem struct and updated remaining non ODP drivers to use ib_umem_num_pages() * Add npages tracking in ib_umem_odp as ODP drivers still need it. drivers/infiniband/core/umem.c | 96 ++++++++++++++++++++++------ drivers/infiniband/core/umem_odp.c | 4 +- drivers/infiniband/hw/mlx5/odp.c | 2 +- drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c | 11 ++-- include/rdma/ib_umem.h | 2 +- include/rdma/ib_umem_odp.h | 1 + 6 files changed, 87 insertions(+), 29 deletions(-) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index fe55515..17ebad9 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -39,25 +39,22 @@ #include <linux/export.h> #include <linux/hugetlb.h> #include <linux/slab.h> +#include <linux/pagemap.h> #include <rdma/ib_umem_odp.h> #include "uverbs.h" - static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) { - struct scatterlist *sg; + struct sg_page_iter sg_iter; struct page *page; - int i; if (umem->nmap > 0) - ib_dma_unmap_sg(dev, umem->sg_head.sgl, - umem->npages, + ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents, DMA_BIDIRECTIONAL); - for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) { - - page = sg_page(sg); + for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) { + page = sg_page_iter_page(&sg_iter); if (!PageDirty(page) && umem->writable && dirty) set_page_dirty_lock(page); put_page(page); @@ -66,6 +63,66 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d sg_free_table(&umem->sg_head); } +/* ib_umem_add_sg_table - Add N contiguous pages to scatter table + * + * sg: current scatterlist entry + * page_list: array of npage struct page pointers + * npages: number of pages in page_list + * nents: [out] number of entries in the scatterlist + * + * Return new end of scatterlist + */ +static struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg, + struct page **page_list, + unsigned long npages, + int *nents) +{ + unsigned long first_pfn; + unsigned long i = 0; + bool update_cur_sg = false; + bool first = !sg_page(sg); + + /* Check if new page_list is contiguous with end of previous page_list. + * sg->length here is a multiple of PAGE_SIZE and sg->offset is 0. + */ + if (!first && (page_to_pfn(sg_page(sg)) + (sg->length >> PAGE_SHIFT) == + page_to_pfn(page_list[0]))) + update_cur_sg = true; + + while (i != npages) { + unsigned long len; + struct page *first_page = page_list[i]; + + first_pfn = page_to_pfn(first_page); + + /* Compute the number of contiguous pages we have starting + * at i + */ + for (len = 0; i != npages && + first_pfn + len == page_to_pfn(page_list[i]); + len++) + i++; + + /* Squash N contiguous pages from page_list into current sge */ + if (update_cur_sg) { + sg_set_page(sg, sg_page(sg), + sg->length + (len << PAGE_SHIFT), 0); + update_cur_sg = false; + continue; + } + + /* Squash N contiguous pages into next sge or first sge */ + if (!first) + sg = sg_next(sg); + + (*nents)++; + sg_set_page(sg, first_page, len << PAGE_SHIFT, 0); + first = false; + } + + return sg; +} + /** * ib_umem_get - Pin and DMA map userspace memory. * @@ -93,7 +150,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, int ret; int i; unsigned long dma_attrs = 0; - struct scatterlist *sg, *sg_list_start; + struct scatterlist *sg; unsigned int gup_flags = FOLL_WRITE; if (!udata) @@ -185,7 +242,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, if (!umem->writable) gup_flags |= FOLL_FORCE; - sg_list_start = umem->sg_head.sgl; + sg = umem->sg_head.sgl; while (npages) { down_read(&mm->mmap_sem); @@ -198,28 +255,27 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, goto umem_release; } - umem->npages += ret; cur_base += ret * PAGE_SIZE; npages -= ret; + sg = ib_umem_add_sg_table(sg, page_list, ret, &umem->sg_nents); + /* Continue to hold the mmap_sem as vma_list access * needs to be protected. */ - for_each_sg(sg_list_start, sg, ret, i) { + for (i = 0; i < ret && umem->hugetlb; i++) { if (vma_list && !is_vm_hugetlb_page(vma_list[i])) umem->hugetlb = 0; - - sg_set_page(sg, page_list[i], PAGE_SIZE, 0); } - up_read(&mm->mmap_sem); - /* preparing for next loop */ - sg_list_start = sg; + up_read(&mm->mmap_sem); } + sg_mark_end(sg); + umem->nmap = ib_dma_map_sg_attrs(context->device, umem->sg_head.sgl, - umem->npages, + umem->sg_nents, DMA_BIDIRECTIONAL, dma_attrs); @@ -315,8 +371,8 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, return -EINVAL; } - ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->npages, dst, length, - offset + ib_umem_offset(umem)); + ret = sg_pcopy_to_buffer(umem->sg_head.sgl, ib_umem_num_pages(umem), + dst, length, offset + ib_umem_offset(umem)); if (ret < 0) return ret; diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index e6ec79a..e2d87c4 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -527,7 +527,7 @@ static int ib_umem_odp_map_dma_single_page( } umem_odp->dma_list[page_index] = dma_addr | access_mask; umem_odp->page_list[page_index] = page; - umem->npages++; + umem_odp->npages++; stored_page = 1; } else if (umem_odp->page_list[page_index] == page) { umem_odp->dma_list[page_index] |= access_mask; @@ -759,7 +759,7 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, put_page(page); umem_odp->page_list[idx] = NULL; umem_odp->dma_list[idx] = 0; - umem->npages--; + umem_odp->npages--; } } mutex_unlock(&umem_odp->umem_mutex); diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index c20bfc4..0c0dfa4 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -288,7 +288,7 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, ib_umem_odp_unmap_dma_pages(umem_odp, start, end); - if (unlikely(!umem->npages && mr->parent && + if (unlikely(!umem_odp->npages && mr->parent && !umem_odp->dying)) { WRITE_ONCE(umem_odp->dying, 1); atomic_inc(&mr->parent->num_leaf_free); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c index a85884e..83167fa 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c @@ -119,7 +119,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, union pvrdma_cmd_resp rsp; struct pvrdma_cmd_create_mr *cmd = &req.create_mr; struct pvrdma_cmd_create_mr_resp *resp = &rsp.create_mr_resp; - int ret; + int ret, npages; if (length == 0 || length > dev->dsr->caps.max_mr_size) { dev_warn(&dev->pdev->dev, "invalid mem region length\n"); @@ -133,9 +133,10 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return ERR_CAST(umem); } - if (umem->npages < 0 || umem->npages > PVRDMA_PAGE_DIR_MAX_PAGES) { + npages = ib_umem_num_pages(umem); + if (npages < 0 || npages > PVRDMA_PAGE_DIR_MAX_PAGES) { dev_warn(&dev->pdev->dev, "overflow %d pages in mem region\n", - umem->npages); + npages); ret = -EINVAL; goto err_umem; } @@ -150,7 +151,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mr->mmr.size = length; mr->umem = umem; - ret = pvrdma_page_dir_init(dev, &mr->pdir, umem->npages, false); + ret = pvrdma_page_dir_init(dev, &mr->pdir, npages, false); if (ret) { dev_warn(&dev->pdev->dev, "could not allocate page directory\n"); @@ -167,7 +168,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, cmd->length = length; cmd->pd_handle = to_vpd(pd)->pd_handle; cmd->access_flags = access_flags; - cmd->nchunks = umem->npages; + cmd->nchunks = npages; cmd->pdir_dma = mr->pdir.dir_dma; ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_CREATE_MR_RESP); diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 73af05d..ccc44c0 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -53,7 +53,7 @@ struct ib_umem { struct work_struct work; struct sg_table sg_head; int nmap; - int npages; + int sg_nents; }; /* Returns the offset of the umem start relative to the first page. */ diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index dadc96d..eeec4e5 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -69,6 +69,7 @@ struct ib_umem_odp { int notifiers_seq; int notifiers_count; + int npages; /* Tree tracking */ struct umem_odp_node interval_tree; -- 1.8.3.1