[PATCH WIP 28/43] IB/core: Introduce new fast registration API

The new fast registration verb receives a struct
scatterlist and converts it to a page list under
the verbs API. The user is provided with a new
verb, ib_map_mr_sg, and a helper to set the send
work request structure.
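
To give an idea of the intended usage, a ULP would do
something like the following (a hypothetical sketch, not part
of this patch: the function name and access flags are
illustrative, and error unwinding is omitted):

	static int ulp_fastreg(struct ib_qp *qp, struct ib_mr *mr,
			       struct scatterlist *sg,
			       unsigned short sg_nents)
	{
		struct ib_send_wr wr, *bad_wr;
		int ret;

		/* populate the MR from the dma mapped SG list */
		ret = ib_map_mr_sg(mr, sg, sg_nents,
				   IB_ACCESS_LOCAL_WRITE |
				   IB_ACCESS_REMOTE_READ);
		if (ret)
			return ret;

		/* build and post the fastreg work request */
		memset(&wr, 0, sizeof(wr));
		ib_set_fastreg_wr(mr, mr->lkey, (uintptr_t)mr,
				  true, &wr);

		return ib_post_send(qp, &wr, &bad_wr);
	}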

Drivers are provided with a generic helper that
converts a scatterlist into a vector of pages.
Given that some drivers keep a shadow mapped page
list, I expect those drivers to use their own
routines to avoid the extra copies.
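
A driver relying on the generic helper could wire up its
map_mr_sg method along these lines (hypothetical sketch:
struct mydrv_mr and its pages/max_pages/npages fields are
illustrative):

	static int mydrv_map_mr_sg(struct ib_mr *ibmr,
				   struct scatterlist *sg,
				   unsigned short sg_nents)
	{
		struct mydrv_mr *mr = container_of(ibmr,
					struct mydrv_mr, ibmr);
		u32 npages, length;
		u64 offset;
		int ret;

		/* convert the SG list into the MR's page vector */
		ret = ib_sg_to_pages(sg, sg_nents, mr->max_pages,
				     mr->pages, &npages, &length,
				     &offset);
		if (ret)
			return ret;

		/* stash the results for the fastreg post */
		mr->npages = npages;
		ibmr->iova = offset;
		ibmr->length = length;

		return 0;
	}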

The new registration API is added alongside fast_reg
for now; once all drivers and ULPs are ported, we can
drop the old registration API.

Signed-off-by: Sagi Grimberg <sagig@xxxxxxxxxxxx>
---
 drivers/infiniband/core/verbs.c | 123 ++++++++++++++++++++++++++++++++++++++++
 include/rdma/ib_verbs.h         |  37 ++++++++++++
 2 files changed, 160 insertions(+)

diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index beed431..9875163 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1481,3 +1481,126 @@ int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
 		mr->device->check_mr_status(mr, check_mask, mr_status) : -ENOSYS;
 }
 EXPORT_SYMBOL(ib_check_mr_status);
+
+/**
+ * ib_map_mr_sg() - Populates MR with a dma mapped SG list
+ * @mr:            memory region
+ * @sg:            dma mapped scatterlist
+ * @sg_nents:      number of entries in sg
+ * @access:        access permissions
+ *
+ * After this completes successfully, the memory region is ready
+ * for fast registration.
+ */
+int ib_map_mr_sg(struct ib_mr *mr,
+		 struct scatterlist *sg,
+		 unsigned short sg_nents,
+		 unsigned int access)
+{
+	int rc;
+
+	if (!mr->device->map_mr_sg)
+		return -ENOSYS;
+
+	rc = mr->device->map_mr_sg(mr, sg, sg_nents);
+	if (!rc)
+		mr->access = access;
+
+	return rc;
+}
+EXPORT_SYMBOL(ib_map_mr_sg);
+
+/**
+ * ib_sg_to_pages() - Convert a sg list to a page vector
+ * @sgl:           dma mapped scatterlist
+ * @sg_nents:      number of entries in sgl
+ * @max_pages:     maximum pages allowed
+ * @pages:         output page vector
+ * @npages:        output number of mapped pages
+ * @length:        output total byte length
+ * @offset:        output dma address of the first byte
+ *
+ * Core service helper for drivers to convert a scatter
+ * list to a page vector. The sg list must meet the
+ * following conditions:
+ * - Only the first sg element may have a nonzero offset
+ * - All elements are of the same size, PAGE_SIZE
+ * - The last element is allowed to have a length of less
+ *   than PAGE_SIZE
+ *
+ * If any of these conditions is not met, the routine
+ * fails with -EINVAL.
+ */
+int ib_sg_to_pages(struct scatterlist *sgl,
+		   unsigned short sg_nents,
+		   unsigned short max_pages,
+		   u64 *pages, u32 *npages,
+		   u32 *length, u64 *offset)
+{
+	struct scatterlist *sg;
+	u64 last_end_dma_addr = 0, last_page_addr = 0;
+	unsigned int last_page_off = 0;
+	int i, j = 0;
+
+	/* TODO: We can do better with huge pages */
+
+	*offset = sg_dma_address(&sgl[0]);
+	*length = 0;
+
+	for_each_sg(sgl, sg, sg_nents, i) {
+		u64 dma_addr = sg_dma_address(sg);
+		unsigned int dma_len = sg_dma_len(sg);
+		u64 end_dma_addr = dma_addr + dma_len;
+		u64 page_addr = dma_addr & PAGE_MASK;
+
+		*length += dma_len;
+
+		if (i && sg->offset) {
+			if (unlikely(last_end_dma_addr != dma_addr)) {
+				/* gap - fail */
+				goto err;
+			}
+			if (last_page_off + dma_len <= PAGE_SIZE) {
+				/* chunk this fragment with the last */
+				last_end_dma_addr += dma_len;
+				last_page_off += dma_len;
+				continue;
+			} else {
+				/* map starting from the next page */
+				page_addr = last_page_addr + PAGE_SIZE;
+				dma_len -= PAGE_SIZE - last_page_off;
+			}
+		}
+
+		do {
+			/* fail if we ran out of pages */
+			if (unlikely(j >= max_pages))
+				return -EINVAL;
+			pages[j++] = page_addr;
+			page_addr += PAGE_SIZE;
+		} while (page_addr < end_dma_addr);
+
+		last_end_dma_addr = end_dma_addr;
+		last_page_addr = end_dma_addr & PAGE_MASK;
+		last_page_off = end_dma_addr & ~PAGE_MASK;
+	}
+
+	*npages = j;
+
+	return 0;
+err:
+	pr_err("RDMA alignment violation\n");
+	for_each_sg(sgl, sg, sg_nents, i) {
+		u64 dma_addr = sg_dma_address(sg);
+		unsigned int dma_len = sg_dma_len(sg);
+
+		pr_err("sg[%d]: offset=0x%x, dma_addr=0x%llx, dma_len=0x%x\n",
+			i, sg->offset, dma_addr, dma_len);
+	}
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL(ib_sg_to_pages);
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 7a93e2d..d543fee 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1013,6 +1013,7 @@ enum ib_wr_opcode {
 	IB_WR_RDMA_READ_WITH_INV,
 	IB_WR_LOCAL_INV,
 	IB_WR_FAST_REG_MR,
+	IB_WR_FASTREG_MR,
 	IB_WR_MASKED_ATOMIC_CMP_AND_SWP,
 	IB_WR_MASKED_ATOMIC_FETCH_AND_ADD,
 	IB_WR_BIND_MW,
@@ -1117,6 +1118,10 @@ struct ib_send_wr {
 			u32				rkey;
 		} fast_reg;
 		struct {
+			struct ib_mr *mr;
+			u32          key;
+		} fastreg;
+		struct {
 			struct ib_mw            *mw;
 			/* The new rkey for the memory window. */
 			u32                      rkey;
@@ -1316,6 +1321,9 @@ struct ib_mr {
 	struct ib_uobject *uobject;
 	u32		   lkey;
 	u32		   rkey;
+	int		   access;
+	u64		   iova;
+	u32		   length;
 	atomic_t	   usecnt; /* count number of MWs */
 };
 
@@ -1661,6 +1669,9 @@ struct ib_device {
 					       enum ib_mr_type mr_type,
 					       u32 max_entries,
 					       u32 flags);
+	int                        (*map_mr_sg)(struct ib_mr *mr,
+						struct scatterlist *sg,
+						unsigned short sg_nents);
 	struct ib_fast_reg_page_list * (*alloc_fast_reg_page_list)(struct ib_device *device,
 								   int page_list_len);
 	void			   (*free_fast_reg_page_list)(struct ib_fast_reg_page_list *page_list);
@@ -2991,4 +3002,30 @@ static inline int ib_check_mr_access(int flags)
 int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
 		       struct ib_mr_status *mr_status);
 
+int ib_map_mr_sg(struct ib_mr *mr,
+		 struct scatterlist *sg,
+		 unsigned short sg_nents,
+		 unsigned int access);
+
+int ib_sg_to_pages(struct scatterlist *sgl,
+		   unsigned short sg_nents,
+		   unsigned short max_pages,
+		   u64 *pages, u32 *npages,
+		   u32 *length, u64 *offset);
+
+static inline void
+ib_set_fastreg_wr(struct ib_mr *mr,
+		  u32 key,
+		  uintptr_t wr_id,
+		  bool signaled,
+		  struct ib_send_wr *wr)
+{
+	wr->opcode = IB_WR_FASTREG_MR;
+	wr->wr_id = wr_id;
+	wr->send_flags = signaled ? IB_SEND_SIGNALED : 0;
+	wr->num_sge = 0;
+	wr->wr.fastreg.mr = mr;
+	wr->wr.fastreg.key = key;
+}
+
 #endif /* IB_VERBS_H */
-- 
1.8.4.3
