Re: [PATCH rdma-next v2 07/11] RDMA/erdma: Add verbs implementation

On 1/18/22 10:06 PM, Bernard Metzler wrote:

<...>

+
+static const struct {
+	enum erdma_opcode erdma;
+	enum ib_wc_opcode base;
+} map_cqe_opcode[ERDMA_NUM_OPCODES] = {
+	{ ERDMA_OP_WRITE, IB_WC_RDMA_WRITE },
+	{ ERDMA_OP_READ, IB_WC_RDMA_READ },
+	{ ERDMA_OP_SEND, IB_WC_SEND },
+	{ ERDMA_OP_SEND_WITH_IMM, IB_WC_SEND },
+	{ ERDMA_OP_RECEIVE, IB_WC_RECV },
+	{ ERDMA_OP_RECV_IMM, IB_WC_RECV_RDMA_WITH_IMM },
+	{ ERDMA_OP_RECV_INV, IB_WC_LOCAL_INV }, /* confirm afterwards */
+	{ ERDMA_OP_REQ_ERR, IB_WC_RECV }, /* remove afterwards */
+	{ ERDMA_OP_READ_RESPONSE, IB_WC_RECV }, /* can not appear */
+	{ ERDMA_OP_WRITE_WITH_IMM, IB_WC_RDMA_WRITE },
+	{ ERDMA_OP_RECV_ERR, IB_WC_RECV_RDMA_WITH_IMM }, /* can not appear */
I do not understand the four /* comments */ above.
What do they mean?

The definition comes from the ERDMA hardware and is used by both the driver
and the device. Some opcodes are defined but not currently used
(ERDMA_OP_RECV_INV, ERDMA_OP_REQ_ERR, ERDMA_OP_RECV_ERR), and some are never
reported in a CQE (ERDMA_OP_READ_RESPONSE). I will find a better way to keep
the mapping table without the useless mapping entries.

+	{ ERDMA_OP_INVALIDATE, IB_WC_LOCAL_INV },
+	{ ERDMA_OP_RSP_SEND_IMM, IB_WC_RECV },
+	{ ERDMA_OP_SEND_WITH_INV, IB_WC_SEND },
+	{ ERDMA_OP_REG_MR, IB_WC_REG_MR },
+	{ ERDMA_OP_LOCAL_INV, IB_WC_LOCAL_INV },
+	{ ERDMA_OP_READ_WITH_INV, IB_WC_RDMA_READ },
+};
+
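One possible shape for that cleanup, just as a sketch rather than the final
patch, is a designated-initializer array indexed by the hardware opcode, so
opcodes that are never reported simply get no entry. Note that unlisted slots
zero-initialize, and IB_WC_SEND happens to be 0 in enum ib_wc_opcode, so a
real patch would still need a separate validity check on the opcode:

	static const enum ib_wc_opcode map_cqe_opcode[ERDMA_NUM_OPCODES] = {
		[ERDMA_OP_WRITE] = IB_WC_RDMA_WRITE,
		[ERDMA_OP_READ] = IB_WC_RDMA_READ,
		[ERDMA_OP_SEND] = IB_WC_SEND,
		[ERDMA_OP_SEND_WITH_IMM] = IB_WC_SEND,
		[ERDMA_OP_RECEIVE] = IB_WC_RECV,
		[ERDMA_OP_RECV_IMM] = IB_WC_RECV_RDMA_WITH_IMM,
		[ERDMA_OP_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
		[ERDMA_OP_INVALIDATE] = IB_WC_LOCAL_INV,
		[ERDMA_OP_RSP_SEND_IMM] = IB_WC_RECV,
		[ERDMA_OP_SEND_WITH_INV] = IB_WC_SEND,
		[ERDMA_OP_REG_MR] = IB_WC_REG_MR,
		[ERDMA_OP_LOCAL_INV] = IB_WC_LOCAL_INV,
		[ERDMA_OP_READ_WITH_INV] = IB_WC_RDMA_READ,
	};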

<...>

+static int create_qp_cmd(struct erdma_dev *dev, struct erdma_qp *qp)
+{
+	struct erdma_cmdq_create_qp_req req;
+	struct erdma_pd *pd = to_epd(qp->ibqp.pd);
+	struct erdma_uqp *user_qp;
+	int err;
+
+	ERDMA_CMDQ_BUILD_REQ_HDR(&req, CMDQ_SUBMOD_RDMA, CMDQ_OPCODE_CREATE_QP);
+
+	req.cfg0 = FIELD_PREP(ERDMA_CMD_CREATE_QP_SQ_DEPTH_MASK, ilog2(qp->attrs.sq_size)) |
+		   FIELD_PREP(ERDMA_CMD_CREATE_QP_QPN_MASK, QP_ID(qp));
+	req.cfg1 = FIELD_PREP(ERDMA_CMD_CREATE_QP_RQ_DEPTH_MASK, ilog2(qp->attrs.rq_size)) |
+		   FIELD_PREP(ERDMA_CMD_CREATE_QP_PD_MASK, pd->pdn);
+
+	if (qp->is_kernel_qp) {

use rdma_is_kernel_res(&qp->ibqp.res)


Will fix.
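For reference, the suggested change amounts to a one-line replacement
(a sketch of the fix, not the tested patch):

	-	if (qp->is_kernel_qp) {
	+	if (rdma_is_kernel_res(&qp->ibqp.res)) {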

+		u32 pgsz_range = ilog2(SZ_1M) - PAGE_SHIFT;
+
+		req.sq_cqn_mtt_cfg = FIELD_PREP(ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK, pgsz_range) |
+				     FIELD_PREP(ERDMA_CMD_CREATE_QP_CQN_MASK, qp->scq->cqn);
+		req.rq_cqn_mtt_cfg = FIELD_PREP(ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK, pgsz_range) |
+				     FIELD_PREP(ERDMA_CMD_CREATE_QP_CQN_MASK, qp->rcq->cqn);
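As an aside, the page-size field encodes a power-of-two granularity relative
to PAGE_SHIFT, so on a kernel with 4 KiB pages the values work out to:

	ilog2(SZ_1M)  - PAGE_SHIFT = 20 - 12 = 8    /* 1 MiB  = 2^8  pages */
	ilog2(SZ_32M) - PAGE_SHIFT = 25 - 12 = 13   /* 32 MiB = 2^13 pages */

(The SZ_32M case appears in the CQ path below; how the device interprets the
encoded value is hardware-specific.)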

<...>

+	if (cq->is_kernel_cq) {

use rdma_is_kernel_res(&cq->ibcq.res)


Will fix.


+		page_size = SZ_32M;
+		req.cfg0 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_PAGESIZE_MASK,
+				       ilog2(page_size) - PAGE_SHIFT);
+		req.qbuf_addr_l = lower_32_bits(cq->kern_cq.qbuf_dma_addr);
+		req.qbuf_addr_h = upper_32_bits(cq->kern_cq.qbuf_dma_addr);
+
+		req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK, 1) |
+			    FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK, ERDMA_MR_INLINE_MTT);
+
+		req.first_page_offset = 0;
+		req.cq_db_info_addr = cq->kern_cq.qbuf_dma_addr + (cq->depth << CQE_SHIFT);
+	} else {
+		req.cfg0 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_PAGESIZE_MASK,
+				       ilog2(cq->user_cq.qbuf_mtt.page_size) - PAGE_SHIFT);

<...>

+	attr->state = dev->state;
+	attr->max_mtu = ib_mtu_int_to_enum(dev->netdev->mtu);
+	attr->active_mtu = ib_mtu_int_to_enum(dev->netdev->mtu);
+	attr->gid_tbl_len = 1;
+	attr->port_cap_flags = IB_PORT_CM_SUP;
+	attr->port_cap_flags |= IB_PORT_DEVICE_MGMT_SUP;

You could combine the above two lines into one.


Will fix.
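That is, the combined line would be:

	attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP;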

+	attr->max_msg_sz = -1;
+	attr->phys_state = dev->state == IB_PORT_ACTIVE ?
+		IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED;
+
+	return ret;
+}
+

<...>

+	if (max_num_sg > ERDMA_MR_MAX_MTT_CNT) {

Does erdma really accept 524288 list entries, or is
that a typo?


Yes. With a 4K page size, 512K entries cover a 2 GByte Memory Region, which
is our capacity. This has also been tested.
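Spelled out, the arithmetic behind that is:

	524288 entries * 4 KiB/page = 2^19 * 2^12 bytes = 2^31 bytes = 2 GiB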

+		ibdev_err(&dev->ibdev, "max_num_sg too large:%u", max_num_sg);
+		return ERR_PTR(-EINVAL);
+	}
+

<...>

+	atomic_inc(&dev->num_mr);

This dev->num_mr thing gets initialized, incremented and decremented here
and there, but is never checked. So it has no meaning. It seems max MR is
enforced via max possible STags.

Yes, you are right; QP, CQ, and PD are handled the same way.

The same is true for the device's num_qp, num_cq, and num_pd counters. The
intent probably was to enforce resource limits. How is that done today?
In any case, they are never checked. So just remove them, and verify that
resource limits are enforced correctly otherwise.

We will remove them.
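(If the counters had ever been meant to enforce limits, a checked pattern
would look roughly like the sketch below; the dev->attrs.max_mr field name is
assumed for illustration only:

	/* Hypothetical enforcement sketch, not the plan for this driver: */
	if (atomic_inc_return(&dev->num_mr) > dev->attrs.max_mr) {
		atomic_dec(&dev->num_mr);
		return ERR_PTR(-ENOMEM);
	}

But since the limit is already enforced via the available STag index space,
dropping the counters entirely is the simpler fix.)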

+
+	return &mr->ibmr;
+
+err_out_mr:
+	erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX], mr->ibmr.lkey >> 8);
+
+err_out_put_mtt:
+	put_mtt_entries(dev, &mr->mem);
+
+err_out_free:
+	kfree(mr);
+

<...>

+	if (depth > dev->attrs.max_cqe) {
+		dev_warn(dev->dmadev, "WARN: exceed cqe(%d) > capability(%d)\n",
+			 depth, dev->attrs.max_cqe);

That is not worth a warning. The application just exceeded some
resource limit. Remove it.


Yes, will remove it. At first, create_cq often failed because the maximum CQ
depth was exceeded, so we added this warning. Now it should be removed.
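The check itself stays, just without the log, i.e.:

	if (depth > dev->attrs.max_cqe)
		return -EINVAL;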


+		return -EINVAL;
+	}
+

<...>

+		cq->is_kernel_cq = 1;


Can be removed.


Sure, will fix.
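For the CQ side that means the same kind of replacement as in create_qp_cmd()
above (sketch):

	-	if (cq->is_kernel_cq) {
	+	if (rdma_is_kernel_res(&cq->ibcq.res)) {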

Also, thanks for your careful review.
Cheng Xu


