[PATCH v5 rdma-next] RDMA/umem: Combine contiguous PAGE_SIZE regions in SGEs

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Combine contiguous regions of PAGE_SIZE pages
into single scatter list entries while adding
to the scatter table. This minimizes the number
of the entries in the scatter list and reduces
the DMA mapping overhead, particularly with the
IOMMU.

Set default max_seg_size in core for IB devices
to 2G and do not combine if we exceed this limit.

Also, purge npages in struct ib_umem as we now
DMA map the umem SGL with sg_nents, and update
remaining non ODP drivers that use umem->npages.
Move npages tracking to ib_umem_odp as ODP drivers
still need it.

This patch should be applied post
https://patchwork.kernel.org/cover/10857607/

Suggested-by: Jason Gunthorpe <jgg@xxxxxxxx>
Reviewed-by: Michael J. Ruhl <michael.j.ruhl@xxxxxxxxx>
Reviewed-by: Ira Weiny <ira.weiny@xxxxxxxxx>
Acked-by: Adit Ranadive <aditr@xxxxxxxxxx>
Signed-off-by: Shiraz Saleem <shiraz.saleem@xxxxxxxxx>
---
This patch is decoupled from series https://patchwork.kernel.org/patch/10819993/
as it can be standalone.
Changes since v1:
* made ib_umem_add_sg_table() static
* simplified bool variable 'first' initialization

v2->v3:
* Added a comment on calculation for checking page_list
contiguity in ib_umem_add_sg_table()
* Moved iterator to body of for loop in ib_umem_add_sg_table()
* Removed explicit update to sg->length prior to calling sg_set_page()
* Removed white space clean-up in ib_umem struct
* Purge npages from ib_umem struct and updated remaining non ODP
drivers to use ib_umem_num_pages()
* Add npages tracking in ib_umem_odp as ODP drivers still need it.

v3->v4:
*Set default max_seg_size for all IB devices to 2G and
avoid combining if we exceed this limit.

v4->v5:
*updated function header for ib_umem_add_sg_table()
*updated sg_nents to unsigned int
*fixed max seg size limits check in ib_umem_add_sg_table()

 drivers/infiniband/core/device.c             |   3 +
 drivers/infiniband/core/umem.c               | 101 +++++++++++++++++++++------
 drivers/infiniband/core/umem_odp.c           |   4 +-
 drivers/infiniband/hw/mlx5/odp.c             |   2 +-
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c |  11 +--
 include/rdma/ib_umem.h                       |   2 +-
 include/rdma/ib_umem_odp.h                   |   1 +
 7 files changed, 95 insertions(+), 29 deletions(-)

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 7421ec4..5a1bdb9 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -711,6 +711,9 @@ static void setup_dma_device(struct ib_device *device)
 		WARN_ON_ONCE(!parent);
 		device->dma_device = parent;
 	}
+	/* Setup default max segment size for all IB devices */
+	dma_set_max_seg_size(device->dma_device, SZ_2G);
+
 }
 
 /*
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 89a7d57..d31f5e3 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -39,25 +39,22 @@
 #include <linux/export.h>
 #include <linux/hugetlb.h>
 #include <linux/slab.h>
+#include <linux/pagemap.h>
 #include <rdma/ib_umem_odp.h>
 
 #include "uverbs.h"
 
-
 static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
 {
-	struct scatterlist *sg;
+	struct sg_page_iter sg_iter;
 	struct page *page;
-	int i;
 
 	if (umem->nmap > 0)
-		ib_dma_unmap_sg(dev, umem->sg_head.sgl,
-				umem->npages,
+		ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents,
 				DMA_BIDIRECTIONAL);
 
-	for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) {
-
-		page = sg_page(sg);
+	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
+		page = sg_page_iter_page(&sg_iter);
 		if (!PageDirty(page) && umem->writable && dirty)
 			set_page_dirty_lock(page);
 		put_page(page);
@@ -66,6 +63,69 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
 	sg_free_table(&umem->sg_head);
 }
 
+/* ib_umem_add_sg_table - Add N contiguous pages to scatter table
+ *
+ * sg: current scatterlist entry
+ * page_list: array of npage struct page pointers
+ * npages: number of pages in page_list
+ * max_seg_sz: maximum segment size in bytes
+ * nents: [out] number of entries in the scatterlist
+ *
+ * Return new end of scatterlist
+ */
+static struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg,
+						struct page **page_list,
+						unsigned long npages,
+						unsigned int max_seg_sz,
+						int *nents)
+{
+	unsigned long first_pfn;
+	unsigned long i = 0;
+	bool update_cur_sg = false;
+	bool first = !sg_page(sg);
+
+	/* Check if new page_list is contiguous with end of previous page_list.
+	 * sg->length here is a multiple of PAGE_SIZE and sg->offset is 0.
+	 */
+	if (!first && (page_to_pfn(sg_page(sg)) + (sg->length >> PAGE_SHIFT) ==
+		       page_to_pfn(page_list[0])))
+		update_cur_sg = true;
+
+	while (i != npages) {
+		unsigned long len;
+		struct page *first_page = page_list[i];
+
+		first_pfn = page_to_pfn(first_page);
+
+		/* Compute the number of contiguous pages we have starting
+		 * at i
+		 */
+		for (len = 0; i != npages &&
+			      first_pfn + len == page_to_pfn(page_list[i]);
+		     len++)
+			i++;
+
+		/* Squash N contiguous pages from page_list into current sge */
+		if (update_cur_sg &&
+		    ((max_seg_sz - sg->length) >= (len << PAGE_SHIFT))) {
+			sg_set_page(sg, sg_page(sg),
+				    sg->length + (len << PAGE_SHIFT), 0);
+			update_cur_sg = false;
+			continue;
+		}
+
+		/* Squash N contiguous pages into next sge or first sge */
+		if (!first)
+			sg = sg_next(sg);
+
+		(*nents)++;
+		sg_set_page(sg, first_page, len << PAGE_SHIFT, 0);
+		first = false;
+	}
+
+	return sg;
+}
+
 /**
  * ib_umem_get - Pin and DMA map userspace memory.
  *
@@ -93,7 +153,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
 	int ret;
 	int i;
 	unsigned long dma_attrs = 0;
-	struct scatterlist *sg, *sg_list_start;
+	struct scatterlist *sg;
 	unsigned int gup_flags = FOLL_WRITE;
 
 	if (!udata)
@@ -190,7 +250,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
 	if (!umem->writable)
 		gup_flags |= FOLL_FORCE;
 
-	sg_list_start = umem->sg_head.sgl;
+	sg = umem->sg_head.sgl;
 
 	while (npages) {
 		down_read(&mm->mmap_sem);
@@ -203,28 +263,29 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
 			goto umem_release;
 		}
 
-		umem->npages += ret;
 		cur_base += ret * PAGE_SIZE;
 		npages   -= ret;
 
+		sg = ib_umem_add_sg_table(sg, page_list, ret,
+			dma_get_max_seg_size(context->device->dma_device),
+			&umem->sg_nents);
+
 		/* Continue to hold the mmap_sem as vma_list access
 		 * needs to be protected.
 		 */
-		for_each_sg(sg_list_start, sg, ret, i) {
+		for (i = 0; i < ret && umem->hugetlb; i++) {
 			if (vma_list && !is_vm_hugetlb_page(vma_list[i]))
 				umem->hugetlb = 0;
-
-			sg_set_page(sg, page_list[i], PAGE_SIZE, 0);
 		}
-		up_read(&mm->mmap_sem);
 
-		/* preparing for next loop */
-		sg_list_start = sg;
+		up_read(&mm->mmap_sem);
 	}
 
+	sg_mark_end(sg);
+
 	umem->nmap = ib_dma_map_sg_attrs(context->device,
 				  umem->sg_head.sgl,
-				  umem->npages,
+				  umem->sg_nents,
 				  DMA_BIDIRECTIONAL,
 				  dma_attrs);
 
@@ -320,8 +381,8 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
 		return -EINVAL;
 	}
 
-	ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->npages, dst, length,
-				 offset + ib_umem_offset(umem));
+	ret = sg_pcopy_to_buffer(umem->sg_head.sgl, ib_umem_num_pages(umem),
+				 dst, length, offset + ib_umem_offset(umem));
 
 	if (ret < 0)
 		return ret;
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index 6f8c36f..9721914 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -526,7 +526,7 @@ static int ib_umem_odp_map_dma_single_page(
 		}
 		umem_odp->dma_list[page_index] = dma_addr | access_mask;
 		umem_odp->page_list[page_index] = page;
-		umem->npages++;
+		umem_odp->npages++;
 	} else if (umem_odp->page_list[page_index] == page) {
 		umem_odp->dma_list[page_index] |= access_mask;
 	} else {
@@ -752,7 +752,7 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
 			}
 			umem_odp->page_list[idx] = NULL;
 			umem_odp->dma_list[idx] = 0;
-			umem->npages--;
+			umem_odp->npages--;
 		}
 	}
 	mutex_unlock(&umem_odp->umem_mutex);
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 2bc4d67..042ea5c 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -288,7 +288,7 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
 
 	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
 
-	if (unlikely(!umem->npages && mr->parent &&
+	if (unlikely(!umem_odp->npages && mr->parent &&
 		     !umem_odp->dying)) {
 		WRITE_ONCE(umem_odp->dying, 1);
 		atomic_inc(&mr->parent->num_leaf_free);
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
index a85884e..83167fa 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
@@ -119,7 +119,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	union pvrdma_cmd_resp rsp;
 	struct pvrdma_cmd_create_mr *cmd = &req.create_mr;
 	struct pvrdma_cmd_create_mr_resp *resp = &rsp.create_mr_resp;
-	int ret;
+	int ret, npages;
 
 	if (length == 0 || length > dev->dsr->caps.max_mr_size) {
 		dev_warn(&dev->pdev->dev, "invalid mem region length\n");
@@ -133,9 +133,10 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 		return ERR_CAST(umem);
 	}
 
-	if (umem->npages < 0 || umem->npages > PVRDMA_PAGE_DIR_MAX_PAGES) {
+	npages = ib_umem_num_pages(umem);
+	if (npages < 0 || npages > PVRDMA_PAGE_DIR_MAX_PAGES) {
 		dev_warn(&dev->pdev->dev, "overflow %d pages in mem region\n",
-			 umem->npages);
+			 npages);
 		ret = -EINVAL;
 		goto err_umem;
 	}
@@ -150,7 +151,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	mr->mmr.size = length;
 	mr->umem = umem;
 
-	ret = pvrdma_page_dir_init(dev, &mr->pdir, umem->npages, false);
+	ret = pvrdma_page_dir_init(dev, &mr->pdir, npages, false);
 	if (ret) {
 		dev_warn(&dev->pdev->dev,
 			 "could not allocate page directory\n");
@@ -167,7 +168,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	cmd->length = length;
 	cmd->pd_handle = to_vpd(pd)->pd_handle;
 	cmd->access_flags = access_flags;
-	cmd->nchunks = umem->npages;
+	cmd->nchunks = npages;
 	cmd->pdir_dma = mr->pdir.dir_dma;
 
 	ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_CREATE_MR_RESP);
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 73af05d..b13a2e9 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -53,7 +53,7 @@ struct ib_umem {
 	struct work_struct	work;
 	struct sg_table sg_head;
 	int             nmap;
-	int             npages;
+	unsigned int    sg_nents;
 };
 
 /* Returns the offset of the umem start relative to the first page. */
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
index dadc96d..eeec4e5 100644
--- a/include/rdma/ib_umem_odp.h
+++ b/include/rdma/ib_umem_odp.h
@@ -69,6 +69,7 @@ struct ib_umem_odp {
 
 	int notifiers_seq;
 	int notifiers_count;
+	int npages;
 
 	/* Tree tracking */
 	struct umem_odp_node	interval_tree;
-- 
1.8.3.1




[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux