Currently our MTT supports only inline MTT entries (level-0 MTT) and
indirect MTT entries (level-1 MTT), which limits the maximum length of
MRs. In order to implement a multi-level MTT, we first refactor the
structure used to store MTT entries.

Signed-off-by: Cheng Xu <chengyou@xxxxxxxxxxxxxxxxx>
---
 drivers/infiniband/hw/erdma/erdma_hw.h    |   4 +-
 drivers/infiniband/hw/erdma/erdma_qp.c    |   2 +-
 drivers/infiniband/hw/erdma/erdma_verbs.c | 214 +++++++++++++---------
 drivers/infiniband/hw/erdma/erdma_verbs.h |  26 ++-
 4 files changed, 152 insertions(+), 94 deletions(-)

diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h
index a882b57aa118..80a78569bc2a 100644
--- a/drivers/infiniband/hw/erdma/erdma_hw.h
+++ b/drivers/infiniband/hw/erdma/erdma_hw.h
@@ -228,7 +228,7 @@ struct erdma_cmdq_ext_db_req {
 
 /* create_cq cfg1 */
 #define ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK GENMASK(31, 16)
-#define ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK BIT(15)
+#define ERDMA_CMD_CREATE_CQ_MTT_LEVEL_MASK BIT(15)
 #define ERDMA_CMD_CREATE_CQ_MTT_DB_CFG_MASK BIT(11)
 #define ERDMA_CMD_CREATE_CQ_EQN_MASK GENMASK(9, 0)
 
@@ -258,7 +258,7 @@ struct erdma_cmdq_create_cq_req {
 
 /* regmr cfg2 */
 #define ERDMA_CMD_REGMR_PAGESIZE_MASK GENMASK(31, 27)
-#define ERDMA_CMD_REGMR_MTT_TYPE_MASK GENMASK(21, 20)
+#define ERDMA_CMD_REGMR_MTT_LEVEL_MASK GENMASK(21, 20)
 #define ERDMA_CMD_REGMR_MTT_CNT_MASK GENMASK(19, 0)
 
 struct erdma_cmdq_reg_mr_req {
diff --git a/drivers/infiniband/hw/erdma/erdma_qp.c b/drivers/infiniband/hw/erdma/erdma_qp.c
index 44923c51a01b..6d0330badd68 100644
--- a/drivers/infiniband/hw/erdma/erdma_qp.c
+++ b/drivers/infiniband/hw/erdma/erdma_qp.c
@@ -410,7 +410,7 @@ static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi,
 			/* Copy SGLs to SQE content to accelerate */
 			memcpy(get_queue_entry(qp->kern_qp.sq_buf, idx + 1,
 					       qp->attrs.sq_size, SQEBB_SHIFT),
-			       mr->mem.mtt_buf, MTT_SIZE(mr->mem.mtt_nents));
+			       mr->mem.mtt->buf, MTT_SIZE(mr->mem.mtt_nents));
 			wqe_size = sizeof(struct erdma_reg_mr_sqe) +
 				   MTT_SIZE(mr->mem.mtt_nents);
 		} else {
diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c
index fbbd046b350c..0d272f18256a 100644
--- a/drivers/infiniband/hw/erdma/erdma_verbs.c
+++ b/drivers/infiniband/hw/erdma/erdma_verbs.c
@@ -19,6 +19,23 @@
 #include "erdma_cm.h"
 #include "erdma_verbs.h"
 
+static void assemble_qbuf_mtt_for_cmd(struct erdma_mem *mem, u32 *cfg,
+				      u64 *addr0, u64 *addr1)
+{
+	struct erdma_mtt *mtt = mem->mtt;
+
+	if (mem->mtt_nents > ERDMA_MAX_INLINE_MTT_ENTRIES) {
+		*addr0 = mtt->buf_dma;
+		*cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK,
+				   ERDMA_MR_INDIRECT_MTT);
+	} else {
+		*addr0 = mtt->buf[0];
+		memcpy(addr1, mtt->buf + 1, MTT_SIZE(mem->mtt_nents - 1));
+		*cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK,
+				   ERDMA_MR_INLINE_MTT);
+	}
+}
+
 static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp)
 {
 	struct erdma_dev *dev = to_edev(qp->ibqp.device);
@@ -79,18 +96,16 @@ static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp)
 
 	req.sq_mtt_cfg = user_qp->sq_mem.page_offset;
 	req.sq_mtt_cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_CNT_MASK,
-				     user_qp->sq_mem.mtt_nents) |
-			  FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK,
-				     user_qp->sq_mem.mtt_type);
+				     user_qp->sq_mem.mtt_nents);
 
 	req.rq_mtt_cfg = user_qp->rq_mem.page_offset;
 	req.rq_mtt_cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_CNT_MASK,
-				     user_qp->rq_mem.mtt_nents) |
-			  FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK,
-				     user_qp->rq_mem.mtt_type);
+				     user_qp->rq_mem.mtt_nents);
 
-	req.sq_buf_addr = user_qp->sq_mem.mtt_entry[0];
-	req.rq_buf_addr = user_qp->rq_mem.mtt_entry[0];
+	assemble_qbuf_mtt_for_cmd(&user_qp->sq_mem, &req.sq_mtt_cfg,
+				  &req.sq_buf_addr, req.sq_mtt_entry);
+	assemble_qbuf_mtt_for_cmd(&user_qp->rq_mem, &req.rq_mtt_cfg,
+				  &req.rq_buf_addr, req.rq_mtt_entry);
 
 	req.sq_db_info_dma_addr = user_qp->sq_db_info_dma_addr;
 	req.rq_db_info_dma_addr = user_qp->rq_db_info_dma_addr;
@@ -117,13 +132,22 @@ static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp)
 
 static int regmr_cmd(struct erdma_dev *dev, struct erdma_mr *mr)
 {
-	struct erdma_cmdq_reg_mr_req req;
 	struct erdma_pd *pd = to_epd(mr->ibmr.pd);
-	u64 *phy_addr;
-	int i;
+	struct erdma_cmdq_reg_mr_req req;
+	u32 mtt_level;
 
 	erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, CMDQ_OPCODE_REG_MR);
 
+	if (mr->type == ERDMA_MR_TYPE_FRMR ||
+	    mr->mem.page_cnt > ERDMA_MAX_INLINE_MTT_ENTRIES) {
+		req.phy_addr[0] = mr->mem.mtt->buf_dma;
+		mtt_level = ERDMA_MR_INDIRECT_MTT;
+	} else {
+		memcpy(req.phy_addr, mr->mem.mtt->buf,
+		       MTT_SIZE(mr->mem.page_cnt));
+		mtt_level = ERDMA_MR_INLINE_MTT;
+	}
+
 	req.cfg0 = FIELD_PREP(ERDMA_CMD_MR_VALID_MASK, mr->valid) |
 		   FIELD_PREP(ERDMA_CMD_MR_KEY_MASK, mr->ibmr.lkey & 0xFF) |
 		   FIELD_PREP(ERDMA_CMD_MR_MPT_IDX_MASK, mr->ibmr.lkey >> 8);
@@ -132,7 +156,7 @@ static int regmr_cmd(struct erdma_dev *dev, struct erdma_mr *mr)
 		   FIELD_PREP(ERDMA_CMD_REGMR_RIGHT_MASK, mr->access);
 	req.cfg2 = FIELD_PREP(ERDMA_CMD_REGMR_PAGESIZE_MASK,
 			      ilog2(mr->mem.page_size)) |
-		   FIELD_PREP(ERDMA_CMD_REGMR_MTT_TYPE_MASK, mr->mem.mtt_type) |
+		   FIELD_PREP(ERDMA_CMD_REGMR_MTT_LEVEL_MASK, mtt_level) |
 		   FIELD_PREP(ERDMA_CMD_REGMR_MTT_CNT_MASK, mr->mem.page_cnt);
 
 	if (mr->type == ERDMA_MR_TYPE_DMA)
@@ -143,16 +167,6 @@ static int regmr_cmd(struct erdma_dev *dev, struct erdma_mr *mr)
 		req.size = mr->mem.len;
 	}
 
-	if (mr->type == ERDMA_MR_TYPE_FRMR ||
-	    mr->mem.mtt_type == ERDMA_MR_INDIRECT_MTT) {
-		phy_addr = req.phy_addr;
-		*phy_addr = mr->mem.mtt_entry[0];
-	} else {
-		phy_addr = req.phy_addr;
-		for (i = 0; i < mr->mem.mtt_nents; i++)
-			*phy_addr++ = mr->mem.mtt_entry[i];
-	}
-
 post_cmd:
 	return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL);
 }
@@ -179,7 +193,7 @@ static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq)
 		req.qbuf_addr_h = upper_32_bits(cq->kern_cq.qbuf_dma_addr);
 
 		req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK, 1) |
-			    FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK,
+			    FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_LEVEL_MASK,
 				       ERDMA_MR_INLINE_MTT);
 
 		req.first_page_offset = 0;
@@ -191,16 +205,20 @@ static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq)
 			FIELD_PREP(ERDMA_CMD_CREATE_CQ_PAGESIZE_MASK,
 				   ilog2(mem->page_size) - ERDMA_HW_PAGE_SHIFT);
 		if (mem->mtt_nents == 1) {
-			req.qbuf_addr_l = lower_32_bits(*(u64 *)mem->mtt_buf);
-			req.qbuf_addr_h = upper_32_bits(*(u64 *)mem->mtt_buf);
+			req.qbuf_addr_l = lower_32_bits(mem->mtt->buf[0]);
+			req.qbuf_addr_h = upper_32_bits(mem->mtt->buf[0]);
+			req.cfg1 |=
+				FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_LEVEL_MASK,
+					   ERDMA_MR_INLINE_MTT);
 		} else {
-			req.qbuf_addr_l = lower_32_bits(mem->mtt_entry[0]);
-			req.qbuf_addr_h = upper_32_bits(mem->mtt_entry[0]);
+			req.qbuf_addr_l = lower_32_bits(mem->mtt->buf_dma);
+			req.qbuf_addr_h = upper_32_bits(mem->mtt->buf_dma);
+			req.cfg1 |=
+				FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_LEVEL_MASK,
+					   ERDMA_MR_INDIRECT_MTT);
 		}
 		req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK,
 				       mem->mtt_nents);
-		req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK,
-				       mem->mtt_type);
 
 		req.first_page_offset = mem->page_offset;
 		req.cq_db_info_addr = cq->user_cq.db_info_dma_addr;
@@ -508,12 +526,77 @@ static int init_kernel_qp(struct erdma_dev *dev, struct erdma_qp *qp,
 	return -ENOMEM;
 }
 
+static void erdma_fill_bottom_mtt(struct erdma_dev *dev, struct erdma_mem *mem)
+{
+	struct erdma_mtt *mtt = mem->mtt;
+	struct ib_block_iter biter;
+	u32 idx = 0;
+
+	while (mtt->low_level)
+		mtt = mtt->low_level;
+
+	rdma_umem_for_each_dma_block(mem->umem, &biter, mem->page_size)
+		mtt->buf[idx++] = rdma_block_iter_dma_address(&biter);
+}
+
+static struct erdma_mtt *erdma_create_cont_mtt(struct erdma_dev *dev,
+					       size_t size)
+{
+	struct erdma_mtt *mtt;
+	int ret = -ENOMEM;
+
+	mtt = kzalloc(sizeof(*mtt), GFP_KERNEL);
+	if (!mtt)
+		return ERR_PTR(-ENOMEM);
+
+	mtt->size = size;
+	mtt->buf = kzalloc(mtt->size, GFP_KERNEL);
+	if (!mtt->buf)
+		goto err_free_mtt;
+
+	mtt->continuous = true;
+	mtt->buf_dma = dma_map_single(&dev->pdev->dev, mtt->buf, mtt->size,
+				      DMA_TO_DEVICE);
+	if (dma_mapping_error(&dev->pdev->dev, mtt->buf_dma))
+		goto err_free_mtt_buf;
+
+	return mtt;
+
+err_free_mtt_buf:
+	kfree(mtt->buf);
+
+err_free_mtt:
+	kfree(mtt);
+
+	return ERR_PTR(ret);
+}
+
+static struct erdma_mtt *erdma_create_mtt(struct erdma_dev *dev, size_t size,
+					  bool force_continuous)
+{
+	ibdev_dbg(&dev->ibdev, "create_mtt, size:%lu, force cont:%d\n", size,
+		  force_continuous);
+
+	if (force_continuous)
+		return erdma_create_cont_mtt(dev, size);
+
+	return ERR_PTR(-ENOTSUPP);
+}
+
+static void erdma_destroy_mtt(struct erdma_dev *dev, struct erdma_mtt *mtt)
+{
+	if (mtt->continuous) {
+		dma_unmap_single(&dev->pdev->dev, mtt->buf_dma, mtt->size,
+				 DMA_TO_DEVICE);
+		kfree(mtt->buf);
+		kfree(mtt);
+	}
+}
+
 static int get_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem,
 			   u64 start, u64 len, int access, u64 virt,
 			   unsigned long req_page_size, u8 force_indirect_mtt)
 {
-	struct ib_block_iter biter;
-	uint64_t *phy_addr = NULL;
 	int ret = 0;
 
 	mem->umem = ib_umem_get(&dev->ibdev, start, len, access);
@@ -529,38 +612,13 @@ static int get_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem,
 	mem->page_offset = start & (mem->page_size - 1);
 	mem->mtt_nents = ib_umem_num_dma_blocks(mem->umem, mem->page_size);
 	mem->page_cnt = mem->mtt_nents;
-
-	if (mem->page_cnt > ERDMA_MAX_INLINE_MTT_ENTRIES ||
-	    force_indirect_mtt) {
-		mem->mtt_type = ERDMA_MR_INDIRECT_MTT;
-		mem->mtt_buf =
-			alloc_pages_exact(MTT_SIZE(mem->page_cnt), GFP_KERNEL);
-		if (!mem->mtt_buf) {
-			ret = -ENOMEM;
-			goto error_ret;
-		}
-		phy_addr = mem->mtt_buf;
-	} else {
-		mem->mtt_type = ERDMA_MR_INLINE_MTT;
-		phy_addr = mem->mtt_entry;
-	}
-
-	rdma_umem_for_each_dma_block(mem->umem, &biter, mem->page_size) {
-		*phy_addr = rdma_block_iter_dma_address(&biter);
-		phy_addr++;
+	mem->mtt = erdma_create_mtt(dev, MTT_SIZE(mem->page_cnt), true);
+	if (IS_ERR(mem->mtt)) {
+		ret = PTR_ERR(mem->mtt);
+		goto error_ret;
 	}
 
-	if (mem->mtt_type == ERDMA_MR_INDIRECT_MTT) {
-		mem->mtt_entry[0] =
-			dma_map_single(&dev->pdev->dev, mem->mtt_buf,
-				       MTT_SIZE(mem->page_cnt), DMA_TO_DEVICE);
-		if (dma_mapping_error(&dev->pdev->dev, mem->mtt_entry[0])) {
-			free_pages_exact(mem->mtt_buf, MTT_SIZE(mem->page_cnt));
-			mem->mtt_buf = NULL;
-			ret = -ENOMEM;
-			goto error_ret;
-		}
-	}
+	erdma_fill_bottom_mtt(dev, mem);
 
 	return 0;
 
@@ -575,11 +633,8 @@ static int get_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem,
 
 static void put_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem)
 {
-	if (mem->mtt_buf) {
-		dma_unmap_single(&dev->pdev->dev, mem->mtt_entry[0],
-				 MTT_SIZE(mem->page_cnt), DMA_TO_DEVICE);
-		free_pages_exact(mem->mtt_buf, MTT_SIZE(mem->page_cnt));
-	}
+	if (mem->mtt)
+		erdma_destroy_mtt(dev, mem->mtt);
 
 	if (mem->umem) {
 		ib_umem_release(mem->umem);
@@ -875,33 +930,20 @@ struct ib_mr *erdma_ib_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
 
 	mr->mem.page_size = PAGE_SIZE; /* update it later. */
 	mr->mem.page_cnt = max_num_sg;
-	mr->mem.mtt_type = ERDMA_MR_INDIRECT_MTT;
-	mr->mem.mtt_buf =
-		alloc_pages_exact(MTT_SIZE(mr->mem.page_cnt), GFP_KERNEL);
-	if (!mr->mem.mtt_buf) {
-		ret = -ENOMEM;
+	mr->mem.mtt = erdma_create_mtt(dev, MTT_SIZE(max_num_sg), true);
+	if (IS_ERR(mr->mem.mtt)) {
+		ret = PTR_ERR(mr->mem.mtt);
 		goto out_remove_stag;
 	}
 
-	mr->mem.mtt_entry[0] =
-		dma_map_single(&dev->pdev->dev, mr->mem.mtt_buf,
-			       MTT_SIZE(mr->mem.page_cnt), DMA_TO_DEVICE);
-	if (dma_mapping_error(&dev->pdev->dev, mr->mem.mtt_entry[0])) {
-		ret = -ENOMEM;
-		goto out_free_mtt;
-	}
-
 	ret = regmr_cmd(dev, mr);
 	if (ret)
-		goto out_dma_unmap;
+		goto out_destroy_mtt;
 
 	return &mr->ibmr;
 
-out_dma_unmap:
-	dma_unmap_single(&dev->pdev->dev, mr->mem.mtt_entry[0],
-			 MTT_SIZE(mr->mem.page_cnt), DMA_TO_DEVICE);
-out_free_mtt:
-	free_pages_exact(mr->mem.mtt_buf, MTT_SIZE(mr->mem.page_cnt));
+out_destroy_mtt:
+	erdma_destroy_mtt(dev, mr->mem.mtt);
 
 out_remove_stag:
 	erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX],
@@ -920,7 +962,7 @@ static int erdma_set_page(struct ib_mr *ibmr, u64 addr)
 	if (mr->mem.mtt_nents >= mr->mem.page_cnt)
 		return -1;
 
-	*((u64 *)mr->mem.mtt_buf + mr->mem.mtt_nents) = addr;
+	mr->mem.mtt->buf[mr->mem.mtt_nents] = addr;
 	mr->mem.mtt_nents++;
 
 	return 0;
diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h
index abaf031fe0d2..5f639f27a8a9 100644
--- a/drivers/infiniband/hw/erdma/erdma_verbs.h
+++ b/drivers/infiniband/hw/erdma/erdma_verbs.h
@@ -65,7 +65,7 @@ struct erdma_pd {
  * MemoryRegion definition.
  */
 #define ERDMA_MAX_INLINE_MTT_ENTRIES 4
-#define MTT_SIZE(mtt_cnt) (mtt_cnt << 3) /* per mtt entry takes 8 Bytes. */
+#define MTT_SIZE(mtt_cnt) ((mtt_cnt) << 3) /* per mtt entry takes 8 Bytes. */
 #define ERDMA_MR_MAX_MTT_CNT 524288
 #define ERDMA_MTT_ENTRY_SIZE 8
 
@@ -90,10 +90,28 @@ static inline u8 to_erdma_access_flags(int access)
 	       (access & IB_ACCESS_REMOTE_ATOMIC ? ERDMA_MR_ACC_RA : 0);
 }
 
+/* Hierarchical storage structure for MTT entries */
+struct erdma_mtt {
+	u64 *buf;
+	size_t size;
+
+	bool continuous;
+	union {
+		dma_addr_t buf_dma;
+		struct {
+			struct scatterlist *sglist;
+			u32 nsg;
+			u32 level;
+		};
+	};
+
+	struct erdma_mtt *low_level;
+};
+
 struct erdma_mem {
 	struct ib_umem *umem;
-	void *mtt_buf;
-	u32 mtt_type;
+	struct erdma_mtt *mtt;
+
 	u32 page_size;
 	u32 page_offset;
 	u32 page_cnt;
@@ -101,8 +119,6 @@ struct erdma_mem {
 
 	u64 va;
 	u64 len;
-
-	u64 mtt_entry[ERDMA_MAX_INLINE_MTT_ENTRIES];
 };
 
 struct erdma_mr {
-- 
2.31.1
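
[ Editor's note, not part of the patch: the refactor replaces the flat
  mtt_buf/mtt_entry[] fields with a chain of struct erdma_mtt levels.
  Each upper level stores the address of the buffer one level below
  (buf_dma for a DMA-mapped, physically continuous buffer), and only
  the bottom level holds the per-page addresses, which is why
  erdma_fill_bottom_mtt() first walks the low_level chain. The
  stand-alone user-space sketch below is hypothetical illustration
  code: it mirrors only the buf/size/low_level fields and substitutes
  the buffer's virtual address for buf_dma, since no device or
  dma_map_single() is involved. ]

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MTT_ENTRY_SIZE 8	/* per mtt entry takes 8 bytes */

/* user-space stand-in for struct erdma_mtt */
struct mtt {
	uint64_t *buf;		/* entry array of this level */
	size_t size;		/* size of buf in bytes */
	struct mtt *low_level;	/* NULL at the bottom level */
};

static struct mtt *mtt_alloc(size_t nents, struct mtt *low)
{
	struct mtt *m = calloc(1, sizeof(*m));

	if (!m)
		return NULL;
	m->size = nents * MTT_ENTRY_SIZE;
	m->buf = calloc(nents, sizeof(uint64_t));
	if (!m->buf) {
		free(m);
		return NULL;
	}
	m->low_level = low;
	return m;
}

int main(void)
{
	/* bottom level: one entry per page of the MR */
	struct mtt *bottom = mtt_alloc(8, NULL);
	/* top level: a single entry referring to the bottom buffer */
	struct mtt *top = mtt_alloc(1, bottom);
	struct mtt *m;
	size_t i;

	if (!bottom || !top)
		return 1;

	/* like erdma_fill_bottom_mtt(): walk down to the lowest level,
	 * then store the per-page addresses (fake values here) */
	for (m = top; m->low_level; m = m->low_level)
		;
	for (i = 0; i < m->size / MTT_ENTRY_SIZE; i++)
		m->buf[i] = 0x1000 * (uint64_t)(i + 1);

	/* the upper level records where the lower buffer lives; in the
	 * driver this would be the DMA address (buf_dma) */
	top->buf[0] = (uint64_t)(uintptr_t)bottom->buf;

	printf("top[0]=0x%llx bottom[0]=0x%llx\n",
	       (unsigned long long)top->buf[0],
	       (unsigned long long)m->buf[0]);

	free(bottom->buf);
	free(bottom);
	free(top->buf);
	free(top);
	return 0;
}

With more than two levels the same low_level walk still lands on the
bottom array, which is what lets get_mtt_entries() stay unchanged once
multi-level (non-continuous) MTTs are layered on top of this refactor.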