From: Artemy Kovalyov <artemyko@xxxxxxxxxxxx> Implement the ibv_post_srq_ops verb to perform tag matching list add/remove operations. To support this, the patch adds tag state tracking infrastructure to manage the list operations and their upcoming completions, as described below. A vector of struct mlx5_tag_entry (mlx5_srq.tm_list) shadows the HW tag matching list. Initially all tags are held in a free tag list FIFO (mlx5_srq.tm_head, mlx5_srq.tm_tail). A FIFO of operations waiting for CQE (mlx5_srq.op, op_head, op_tail) allows received list operation completions to be matched with their tag entries, based on the guaranteed ordering of command QP work request processing. TM data receive completions carry the tag index in the CQE, so the tag entry may be accessed via the vector by this index. List add operation: * removes a tag from head of free tag list FIFO * increments expected cqe (for tag consumed completion) * if (SIGNALED) * store tag wr_id, phase_cnt (unexpected_cnt) if provided * enqueues new op to the tail of waiting for CQE FIFO * store op wr_id, wqe_head * increments expected cqe * posts list add operation work request on command QP * returns index of tag to caller List remove operation: * looks up tag by index in tag list vector * if (SIGNALED) * enqueues new op to the tail of waiting for CQE FIFO * store op wr_id, wqe_head * increments expected cqe * posts list remove operation work request on command QP Signed-off-by: Artemy Kovalyov <artemyko@xxxxxxxxxxxx> Reviewed-by: Yishai Hadas <yishaih@xxxxxxxxxxxx> --- providers/mlx5/mlx5.c | 1 + providers/mlx5/mlx5.h | 48 +++++++- providers/mlx5/mlx5dv.h | 12 ++ providers/mlx5/qp.c | 303 +++++++++++++++++++++++++++++++++++++++++------- providers/mlx5/verbs.c | 25 +++- 5 files changed, 344 insertions(+), 45 deletions(-) diff --git a/providers/mlx5/mlx5.c b/providers/mlx5/mlx5.c index 15f258d..2414881 100644 --- a/providers/mlx5/mlx5.c +++ b/providers/mlx5/mlx5.c @@ -965,6 +965,7 @@ static int mlx5_init_context(struct verbs_device *vdev, verbs_set_ctx_op(v_ctx, 
destroy_wq, mlx5_destroy_wq); verbs_set_ctx_op(v_ctx, create_rwq_ind_table, mlx5_create_rwq_ind_table); verbs_set_ctx_op(v_ctx, destroy_rwq_ind_table, mlx5_destroy_rwq_ind_table); + verbs_set_ctx_op(v_ctx, post_srq_ops, mlx5_post_srq_ops); memset(&device_attr, 0, sizeof(device_attr)); if (!mlx5_query_device_ex(ctx, NULL, &device_attr, diff --git a/providers/mlx5/mlx5.h b/providers/mlx5/mlx5.h index d16b5f5..d7eb477 100644 --- a/providers/mlx5/mlx5.h +++ b/providers/mlx5/mlx5.h @@ -132,6 +132,12 @@ enum { }; enum { + MLX5_TM_OPCODE_NOP = 0x00, + MLX5_TM_OPCODE_APPEND = 0x01, + MLX5_TM_OPCODE_REMOVE = 0x02, +}; + +enum { MLX5_RECV_OPCODE_RDMA_WRITE_IMM = 0x00, MLX5_RECV_OPCODE_SEND = 0x01, MLX5_RECV_OPCODE_SEND_IMM = 0x02, @@ -142,7 +148,9 @@ enum { }; enum { - MLX5_SRQ_FLAG_SIGNATURE = 1 << 0, + MLX5_SRQ_FLAG_SIGNATURE = (1 << 0), + MLX5_SRQ_FLAG_TM_SW_CNT = (1 << 6), + MLX5_SRQ_FLAG_TM_CQE_REQ = (1 << 7), }; enum { @@ -345,6 +353,22 @@ struct mlx5_cq { int umr_opcode; }; +struct mlx5_tag_entry { + struct mlx5_tag_entry *next; + uint64_t wr_id; + int phase_cnt; + void *ptr; + uint32_t size; + int8_t expect_cqe; +}; + +struct mlx5_srq_op { + struct mlx5_tag_entry *tag; + uint64_t wr_id; + /* we need to advance tail pointer */ + uint32_t wqe_head; +}; + struct mlx5_srq { struct mlx5_resource rsc; /* This struct must be first */ struct verbs_srq vsrq; @@ -361,8 +385,27 @@ struct mlx5_srq { uint16_t counter; int wq_sig; struct ibv_qp *cmd_qp; + struct mlx5_tag_entry *tm_list; /* vector of all tags */ + struct mlx5_tag_entry *tm_head; /* queue of free tags */ + struct mlx5_tag_entry *tm_tail; + struct mlx5_srq_op *op; + int op_head; + int op_tail; + int unexp_in; + int unexp_out; }; + +static inline void mlx5_tm_release_tag(struct mlx5_srq *srq, + struct mlx5_tag_entry *tag) +{ + if (!--tag->expect_cqe) { + tag->next = NULL; + srq->tm_tail->next = tag; + srq->tm_tail = tag; + } +} + struct wr_list { uint16_t opcode; uint16_t next; @@ -693,6 +736,9 @@ struct 
ibv_rwq_ind_table *mlx5_create_rwq_ind_table(struct ibv_context *context, int mlx5_destroy_rwq_ind_table(struct ibv_rwq_ind_table *rwq_ind_table); struct ibv_srq *mlx5_create_srq_ex(struct ibv_context *context, struct ibv_srq_init_attr_ex *attr); +int mlx5_post_srq_ops(struct ibv_srq *srq, + struct ibv_ops_wr *wr, + struct ibv_ops_wr **bad_wr); static inline void *mlx5_find_uidx(struct mlx5_context *ctx, uint32_t uidx) { diff --git a/providers/mlx5/mlx5dv.h b/providers/mlx5/mlx5dv.h index ffe2c55..e6cb610 100644 --- a/providers/mlx5/mlx5dv.h +++ b/providers/mlx5/mlx5dv.h @@ -217,6 +217,7 @@ enum { MLX5_OPCODE_LOCAL_INVAL = 0x1b, MLX5_OPCODE_CONFIG_CMD = 0x1f, MLX5_OPCODE_UMR = 0x25, + MLX5_OPCODE_TAG_MATCHING = 0x28 }; /* @@ -451,6 +452,17 @@ struct mlx5_wqe_eth_seg { uint8_t inline_hdr[16]; }; +struct mlx5_wqe_tm_seg { + uint8_t opcode; + uint8_t flags; + __be16 index; + uint8_t rsvd0[2]; + __be16 sw_cnt; + uint8_t rsvd1[8]; + __be64 append_tag; + __be64 append_mask; +}; + /* * Control segment - contains some control information for the current WQE. 
* diff --git a/providers/mlx5/qp.c b/providers/mlx5/qp.c index d5e677f..20e97e4 100644 --- a/providers/mlx5/qp.c +++ b/providers/mlx5/qp.c @@ -194,6 +194,26 @@ static inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg, rseg->reserved = 0; } +static void set_tm_seg(struct mlx5_wqe_tm_seg *tmseg, int op, + struct ibv_ops_wr *wr, int index) +{ + tmseg->flags = 0; + if (wr->flags & IBV_OPS_SIGNALED) + tmseg->flags |= MLX5_SRQ_FLAG_TM_CQE_REQ; + if (wr->flags & IBV_OPS_TM_SYNC) { + tmseg->flags |= MLX5_SRQ_FLAG_TM_SW_CNT; + tmseg->sw_cnt = htobe16(wr->tm.unexpected_cnt); + } + tmseg->opcode = op << 4; + if (op == MLX5_TM_OPCODE_NOP) + return; + tmseg->index = htobe16(index); + if (op == MLX5_TM_OPCODE_REMOVE) + return; + tmseg->append_tag = htobe64(wr->tm.add.tag); + tmseg->append_mask = htobe64(wr->tm.add.mask); +} + static void set_atomic_seg(struct mlx5_wqe_atomic_seg *aseg, enum ibv_wr_opcode opcode, uint64_t swap, @@ -231,6 +251,13 @@ static void set_data_ptr_seg_atomic(struct mlx5_wqe_data_seg *dseg, dseg->addr = htobe64(sg->addr); } +static void set_data_ptr_seg_end(struct mlx5_wqe_data_seg *dseg) +{ + dseg->byte_count = 0; + dseg->lkey = htobe32(MLX5_INVALID_LKEY); + dseg->addr = 0; +} + /* * Avoid using memcpy() to copy to BlueFlame page, since memcpy() * implementations may use move-string-buffer assembler instructions, @@ -661,10 +688,61 @@ static inline int mlx5_post_send_underlay(struct mlx5_qp *qp, struct ibv_send_wr return 0; } +static inline void post_send_db(struct mlx5_qp *qp, struct mlx5_bf *bf, + int nreq, int inl, int size, + uint8_t next_fence, void *ctrl) +{ + struct mlx5_context *ctx; + + if (unlikely(!nreq)) + return; + + qp->sq.head += nreq; + qp->fm_cache = next_fence; + + /* + * Make sure that descriptors are written before + * updating doorbell record and ringing the doorbell + */ + udma_to_device_barrier(); + qp->db[MLX5_SND_DBR] = htobe32(qp->sq.cur_post & 0xffff); + + /* Make sure that the doorbell write happens before the memcpy + 
* to WC memory below + */ + ctx = to_mctx(qp->ibv_qp->context); + if (bf->need_lock) + mmio_wc_spinlock(&bf->lock.lock); + else + mmio_wc_start(); + + if (!ctx->shut_up_bf && nreq == 1 && bf->uuarn && + (inl || ctx->prefer_bf) && size > 1 && + size <= bf->buf_size / 16) + mlx5_bf_copy(bf->reg + bf->offset, ctrl, + align(size * 16, 64), qp); + else + mmio_write64_be(bf->reg + bf->offset, *(__be64 *)ctrl); + + /* + * use mmio_flush_writes() to ensure write combining buffers are + * flushed out of the running CPU. This must be carried inside + * the spinlock. Otherwise, there is a potential race. In the + * race, CPU A writes doorbell 1, which is waiting in the WC + * buffer. CPU B writes doorbell 2, and it's write is flushed + * earlier. Since the mmio_flush_writes is CPU local, this will + * result in the HCA seeing doorbell 2, followed by doorbell 1. + * Flush before toggling bf_offset to be latency oriented. + */ + mmio_flush_writes(); + bf->offset ^= bf->buf_size; + if (bf->need_lock) + mlx5_spin_unlock(&bf->lock); +} + static inline int _mlx5_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { - struct mlx5_context *ctx; struct mlx5_qp *qp = to_mqp(ibqp); void *seg; struct mlx5_wqe_eth_seg *eseg; @@ -977,48 +1055,7 @@ static inline int _mlx5_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, } out: - if (likely(nreq)) { - qp->sq.head += nreq; - qp->fm_cache = next_fence; - - /* - * Make sure that descriptors are written before - * updating doorbell record and ringing the doorbell - */ - udma_to_device_barrier(); - qp->db[MLX5_SND_DBR] = htobe32(qp->sq.cur_post & 0xffff); - - /* Make sure that the doorbell write happens before the memcpy - * to WC memory below */ - ctx = to_mctx(ibqp->context); - if (bf->need_lock) - mmio_wc_spinlock(&bf->lock.lock); - else - mmio_wc_start(); - - if (!ctx->shut_up_bf && nreq == 1 && bf->uuarn && - (inl || ctx->prefer_bf) && size > 1 && - size <= bf->buf_size / 16) - mlx5_bf_copy(bf->reg + 
bf->offset, (uint64_t *)ctrl, - align(size * 16, 64), qp); - else - mmio_write64_be(bf->reg + bf->offset, *(__be64 *)ctrl); - - /* - * use mmio_flush_writes() to ensure write combining buffers are flushed out - * of the running CPU. This must be carried inside the spinlock. - * Otherwise, there is a potential race. In the race, CPU A - * writes doorbell 1, which is waiting in the WC buffer. CPU B - * writes doorbell 2, and it's write is flushed earlier. Since - * the mmio_flush_writes is CPU local, this will result in the HCA seeing - * doorbell 2, followed by doorbell 1. - * Flush before toggling bf_offset to be latency oriented. - */ - mmio_flush_writes(); - bf->offset ^= bf->buf_size; - if (bf->need_lock) - mlx5_spin_unlock(&bf->lock); - } + post_send_db(qp, bf, nreq, inl, size, next_fence, ctrl); mlx5_spin_unlock(&qp->sq.lock); @@ -1275,6 +1312,186 @@ out: return err; } +static void mlx5_tm_add_op(struct mlx5_srq *srq, struct mlx5_tag_entry *tag, + uint64_t wr_id, int nreq) +{ + struct mlx5_qp *qp = to_mqp(srq->cmd_qp); + struct mlx5_srq_op *op; + + op = srq->op + (srq->op_tail++ & (qp->sq.wqe_cnt - 1)); + op->tag = tag; + op->wr_id = wr_id; + /* Will point to next available WQE */ + op->wqe_head = qp->sq.head + nreq; + if (tag) + tag->expect_cqe++; +} + +int mlx5_post_srq_ops(struct ibv_srq *ibsrq, struct ibv_ops_wr *wr, + struct ibv_ops_wr **bad_wr) +{ + struct mlx5_context *ctx = to_mctx(ibsrq->context); + struct mlx5_srq *srq = to_msrq(ibsrq); + struct mlx5_wqe_ctrl_seg *ctrl = NULL; + struct mlx5_tag_entry *tag; + struct mlx5_bf *bf; + struct mlx5_qp *qp; + unsigned int idx; + int size = 0; + int nreq = 0; + int err = 0; + void *qend; + void *seg; + FILE *fp = ctx->dbg_fp; + + if (unlikely(!srq->cmd_qp)) { + *bad_wr = wr; + return EINVAL; + } + + qp = to_mqp(srq->cmd_qp); + bf = qp->bf; + qend = qp->sq.qend; + mlx5_spin_lock(&srq->lock); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, + 
to_mcq(qp->ibv_qp->send_cq)))) { + mlx5_dbg(fp, MLX5_DBG_QP_SEND, "work queue overflow\n"); + err = ENOMEM; + *bad_wr = wr; + goto out; + } + + idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1); + ctrl = seg = mlx5_get_send_wqe(qp, idx); + *(uint32_t *)(seg + 8) = 0; + ctrl->imm = 0; + ctrl->fm_ce_se = 0; + + seg += sizeof(*ctrl); + size = sizeof(*ctrl) / 16; + + switch (wr->opcode) { + case IBV_WR_TAG_ADD: + if (unlikely(!srq->tm_head->next)) { + mlx5_dbg(fp, MLX5_DBG_QP_SEND, "tag matching list is full\n"); + err = ENOMEM; + *bad_wr = wr; + goto out; + } + tag = srq->tm_head; +#ifdef MLX5_DEBUG + if (wr->tm.add.num_sge > 1) { + mlx5_dbg(fp, MLX5_DBG_QP_SEND, "num_sge must be at most 1\n"); + err = EINVAL; + *bad_wr = wr; + goto out; + } + + if (tag->expect_cqe) { + mlx5_dbg(fp, MLX5_DBG_QP_SEND, "tag matching list is corrupted\n"); + err = ENOMEM; + *bad_wr = wr; + goto out; + } +#endif + srq->tm_head = tag->next; + /* place index of next entry into TM segment */ + set_tm_seg(seg, MLX5_TM_OPCODE_APPEND, wr, + tag->next - srq->tm_list); + tag->next = NULL; + tag->wr_id = wr->tm.add.recv_wr_id; + if (wr->flags & IBV_OPS_TM_SYNC) + srq->unexp_out = wr->tm.unexpected_cnt; + tag->phase_cnt = srq->unexp_out; + tag->expect_cqe++; + + if (wr->flags & IBV_OPS_SIGNALED) + mlx5_tm_add_op(srq, tag, wr->wr_id, nreq); + + wr->tm.handle = tag - srq->tm_list; + seg += sizeof(struct mlx5_wqe_tm_seg); + size += sizeof(struct mlx5_wqe_tm_seg) / 16; + + if (unlikely(seg == qend)) + seg = mlx5_get_send_wqe(qp, 0); + + /* message is allowed to be empty */ + if (wr->tm.add.num_sge && wr->tm.add.sg_list->length) { + set_data_ptr_seg(seg, wr->tm.add.sg_list, 0); + tag->ptr = (void *)(uintptr_t)wr->tm.add.sg_list->addr; + tag->size = wr->tm.add.sg_list->length; + } else { + set_data_ptr_seg_end(seg); + } + size += sizeof(struct mlx5_wqe_data_seg) / 16; + break; + + case IBV_WR_TAG_DEL: + tag = &srq->tm_list[wr->tm.handle]; + +#ifdef MLX5_DEBUG + if (!tag->expect_cqe) { + mlx5_dbg(fp, 
MLX5_DBG_QP_SEND, "removing tag which isn't in HW ownership\n"); + err = ENOMEM; + *bad_wr = wr; + goto out; + } +#endif + set_tm_seg(seg, MLX5_TM_OPCODE_REMOVE, wr, + wr->tm.handle); + + if (wr->flags & IBV_OPS_SIGNALED) + mlx5_tm_add_op(srq, tag, wr->wr_id, nreq); + else + mlx5_tm_release_tag(srq, tag); + + seg += sizeof(struct mlx5_wqe_tm_seg); + size += sizeof(struct mlx5_wqe_tm_seg) / 16; + break; + + case IBV_WR_TAG_SYNC: + set_tm_seg(seg, MLX5_TM_OPCODE_NOP, wr, 0); + + if (wr->flags & IBV_OPS_SIGNALED) + mlx5_tm_add_op(srq, NULL, wr->wr_id, nreq); + + seg += sizeof(struct mlx5_wqe_tm_seg); + size += sizeof(struct mlx5_wqe_tm_seg) / 16; + break; + + default: + mlx5_dbg(fp, MLX5_DBG_QP_SEND, "bad opcode %d\n", + wr->opcode); + err = EINVAL; + *bad_wr = wr; + goto out; + } + + ctrl->opmod_idx_opcode = htobe32(MLX5_OPCODE_TAG_MATCHING | + ((qp->sq.cur_post & 0xffff) << 8)); + ctrl->qpn_ds = htobe32(size | (srq->cmd_qp->qp_num << 8)); + + if (unlikely(qp->wq_sig)) + ctrl->signature = wq_sig(ctrl); + + qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB); + +#ifdef MLX5_DEBUG + if (mlx5_debug_mask & MLX5_DBG_QP_SEND) + dump_wqe(fp, idx, size, qp); +#endif + } + +out: + post_send_db(qp, bf, nreq, 0, size, 0, ctrl); + + mlx5_spin_unlock(&srq->lock); + + return err; +} + int mlx5_use_huge(const char *key) { char *e; diff --git a/providers/mlx5/verbs.c b/providers/mlx5/verbs.c index a26b631..f935fc8 100644 --- a/providers/mlx5/verbs.c +++ b/providers/mlx5/verbs.c @@ -757,7 +757,9 @@ int mlx5_destroy_srq(struct ibv_srq *srq) mlx5_free_db(ctx, msrq->db); mlx5_free_buf(&msrq->buf); + free(msrq->tm_list); free(msrq->wrid); + free(msrq->op); free(msrq); return 0; @@ -2025,15 +2027,33 @@ struct ibv_srq *mlx5_create_srq_ex(struct ibv_context *context, goto err_free_uidx; if (attr->srq_type == IBV_SRQT_TM) { + int i; + msrq->cmd_qp = create_cmd_qp(context, attr, ibsrq); if (!msrq->cmd_qp) goto err_destroy; + + msrq->tm_list = calloc(attr->tm_cap.max_num_tags + 1, + 
sizeof(struct mlx5_tag_entry)); + if (!msrq->tm_list) + goto err_free_cmd; + for (i = 0; i < attr->tm_cap.max_num_tags; i++) + msrq->tm_list[i].next = &msrq->tm_list[i + 1]; + msrq->tm_head = &msrq->tm_list[0]; + msrq->tm_tail = &msrq->tm_list[attr->tm_cap.max_num_tags]; + + msrq->op = calloc(to_mqp(msrq->cmd_qp)->sq.wqe_cnt, + sizeof(struct mlx5_srq_op)); + if (!msrq->op) + goto err_free_tm; + msrq->op_head = 0; + msrq->op_tail = 0; } if (!ctx->cqe_version) { err = mlx5_store_srq(to_mctx(context), resp.srqn, msrq); if (err) - goto err_free_cmd; + goto err_free_tm; pthread_mutex_unlock(&ctx->srq_table_mutex); } @@ -2044,6 +2064,9 @@ struct ibv_srq *mlx5_create_srq_ex(struct ibv_context *context, return ibsrq; +err_free_tm: + free(msrq->tm_list); + free(msrq->op); err_free_cmd: if (msrq->cmd_qp) mlx5_destroy_qp(msrq->cmd_qp); -- 1.8.3.1 -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html