From: Ariel Levkovich <lariel@xxxxxxxxxxxx>

This patch adds support for creating an extended CQ.
This means we support:
- The new polling mechanism.
- A CQ which is single threaded and thus doesn't waste CPU cycles
  on locking.
- Getting the completion timestamp from the CQ.

Signed-off-by: Ariel Levkovich <lariel@xxxxxxxxxxxx>
Acked-by: Yishai Hadas <yishaih@xxxxxxxxxxxx>
---
 providers/mlx4/cq.c       |  33 ++++
 providers/mlx4/mlx4-abi.h |  12 ++++
 providers/mlx4/mlx4.c     |   1 +
 providers/mlx4/mlx4.h     |   5 ++
 providers/mlx4/verbs.c    | 157 ++++++++++++++++++++++++++++++++++++++++------
 5 files changed, 190 insertions(+), 18 deletions(-)

diff --git a/providers/mlx4/cq.c b/providers/mlx4/cq.c
index 728efde..22fdbf2 100644
--- a/providers/mlx4/cq.c
+++ b/providers/mlx4/cq.c
@@ -646,6 +646,39 @@ static inline uint64_t mlx4_cq_read_wc_completion_ts(struct ibv_cq_ex *ibcq)
 	       (cq->cqe->ts_7_0);
 }
 
+void mlx4_cq_fill_pfns(struct mlx4_cq *cq, const struct ibv_cq_init_attr_ex *cq_attr)
+{
+
+	if (cq->flags & MLX4_CQ_FLAGS_SINGLE_THREADED) {
+		cq->ibv_cq.start_poll = mlx4_start_poll;
+		cq->ibv_cq.end_poll = mlx4_end_poll;
+	} else {
+		cq->ibv_cq.start_poll = mlx4_start_poll_lock;
+		cq->ibv_cq.end_poll = mlx4_end_poll_lock;
+	}
+	cq->ibv_cq.next_poll = mlx4_next_poll;
+
+	cq->ibv_cq.read_opcode = mlx4_cq_read_wc_opcode;
+	cq->ibv_cq.read_vendor_err = mlx4_cq_read_wc_vendor_err;
+	cq->ibv_cq.read_wc_flags = mlx4_cq_read_wc_flags;
+	if (cq_attr->wc_flags & IBV_WC_EX_WITH_BYTE_LEN)
+		cq->ibv_cq.read_byte_len = mlx4_cq_read_wc_byte_len;
+	if (cq_attr->wc_flags & IBV_WC_EX_WITH_IMM)
+		cq->ibv_cq.read_imm_data = mlx4_cq_read_wc_imm_data;
+	if (cq_attr->wc_flags & IBV_WC_EX_WITH_QP_NUM)
+		cq->ibv_cq.read_qp_num = mlx4_cq_read_wc_qp_num;
+	if (cq_attr->wc_flags & IBV_WC_EX_WITH_SRC_QP)
+		cq->ibv_cq.read_src_qp = mlx4_cq_read_wc_src_qp;
+	if (cq_attr->wc_flags & IBV_WC_EX_WITH_SLID)
+		cq->ibv_cq.read_slid = mlx4_cq_read_wc_slid;
+	if (cq_attr->wc_flags & IBV_WC_EX_WITH_SL)
+		cq->ibv_cq.read_sl = mlx4_cq_read_wc_sl;
+	if (cq_attr->wc_flags & IBV_WC_EX_WITH_DLID_PATH_BITS)
+		cq->ibv_cq.read_dlid_path_bits = mlx4_cq_read_wc_dlid_path_bits;
+	if (cq_attr->wc_flags & IBV_WC_EX_WITH_COMPLETION_TIMESTAMP)
+		cq->ibv_cq.read_completion_ts = mlx4_cq_read_wc_completion_ts;
+}
+
 int mlx4_arm_cq(struct ibv_cq *ibvcq, int solicited)
 {
 	struct mlx4_cq *cq = to_mcq(ibvcq);
diff --git a/providers/mlx4/mlx4-abi.h b/providers/mlx4/mlx4-abi.h
index ac21fa8..3b8bac5 100644
--- a/providers/mlx4/mlx4-abi.h
+++ b/providers/mlx4/mlx4-abi.h
@@ -78,6 +78,18 @@ struct mlx4_create_cq_resp {
 	__u32 reserved;
 };
 
+struct mlx4_create_cq_ex {
+	struct ibv_create_cq_ex		ibv_cmd;
+	__u64				buf_addr;
+	__u64				db_addr;
+};
+
+struct mlx4_create_cq_resp_ex {
+	struct ibv_create_cq_resp_ex	ibv_resp;
+	__u32				cqn;
+	__u32				reserved;
+};
+
 struct mlx4_resize_cq {
 	struct ibv_resize_cq ibv_cmd;
 	__u64 buf_addr;
diff --git a/providers/mlx4/mlx4.c b/providers/mlx4/mlx4.c
index b59c202..3f29d1a 100644
--- a/providers/mlx4/mlx4.c
+++ b/providers/mlx4/mlx4.c
@@ -216,6 +216,7 @@ static int mlx4_init_context(struct verbs_device *v_device,
 	verbs_set_ctx_op(verbs_ctx, open_qp, mlx4_open_qp);
 	verbs_set_ctx_op(verbs_ctx, ibv_create_flow, ibv_cmd_create_flow);
 	verbs_set_ctx_op(verbs_ctx, ibv_destroy_flow, ibv_cmd_destroy_flow);
+	verbs_set_ctx_op(verbs_ctx, create_cq_ex, mlx4_create_cq_ex);
 
 	return 0;
 
diff --git a/providers/mlx4/mlx4.h b/providers/mlx4/mlx4.h
index cb4c8d4..9d43b63 100644
--- a/providers/mlx4/mlx4.h
+++ b/providers/mlx4/mlx4.h
@@ -189,6 +189,8 @@ struct mlx4_pd {
 enum {
 	MLX4_CQ_FLAGS_RX_CSUM_VALID = 1 << 0,
+	MLX4_CQ_FLAGS_EXTENDED = 1 << 1,
+	MLX4_CQ_FLAGS_SINGLE_THREADED = 1 << 2,
 };
 
 struct mlx4_cq {
@@ -396,6 +398,9 @@ int mlx4_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw,
 struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe,
 			      struct ibv_comp_channel *channel,
 			      int comp_vector);
+struct ibv_cq_ex *mlx4_create_cq_ex(struct ibv_context *context,
+				    struct ibv_cq_init_attr_ex *cq_attr);
+void mlx4_cq_fill_pfns(struct mlx4_cq *cq, const struct ibv_cq_init_attr_ex *cq_attr);
 int mlx4_alloc_cq_buf(struct mlx4_device *dev, struct mlx4_buf *buf, int nent,
 		      int entry_size);
 int mlx4_resize_cq(struct ibv_cq *cq, int cqe);
diff --git a/providers/mlx4/verbs.c b/providers/mlx4/verbs.c
index 83c971d..4b00550 100644
--- a/providers/mlx4/verbs.c
+++ b/providers/mlx4/verbs.c
@@ -304,19 +304,103 @@ int align_queue_size(int req)
 	return nent;
 }
 
-struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe,
-			      struct ibv_comp_channel *channel,
-			      int comp_vector)
+enum {
+	CREATE_CQ_SUPPORTED_WC_FLAGS = IBV_WC_STANDARD_FLAGS |
+				       IBV_WC_EX_WITH_COMPLETION_TIMESTAMP
+};
+
+enum {
+	CREATE_CQ_SUPPORTED_COMP_MASK = IBV_CQ_INIT_ATTR_MASK_FLAGS
+};
+
+enum {
+	CREATE_CQ_SUPPORTED_FLAGS = IBV_CREATE_CQ_ATTR_SINGLE_THREADED
+};
+
+
+static int mlx4_cmd_create_cq(struct ibv_context *context,
+			      struct ibv_cq_init_attr_ex *cq_attr,
+			      struct mlx4_cq *cq)
+{
+	struct mlx4_create_cq      cmd = {};
+	struct mlx4_create_cq_resp resp = {};
+	int ret;
+
+	cmd.buf_addr = (uintptr_t) cq->buf.buf;
+	cmd.db_addr = (uintptr_t) cq->set_ci_db;
+
+	ret = ibv_cmd_create_cq(context, cq_attr->cqe, cq_attr->channel,
+				cq_attr->comp_vector,
+				ibv_cq_ex_to_cq(&cq->ibv_cq),
+				&cmd.ibv_cmd, sizeof(cmd),
+				&resp.ibv_resp, sizeof(resp));
+	if (!ret)
+		cq->cqn = resp.cqn;
+
+	return ret;
+
+}
+
+static int mlx4_cmd_create_cq_ex(struct ibv_context *context,
+				 struct ibv_cq_init_attr_ex *cq_attr,
+				 struct mlx4_cq *cq)
+{
+	struct mlx4_create_cq_ex      cmd = {};
+	struct mlx4_create_cq_resp_ex resp = {};
+	int ret;
+
+	cmd.buf_addr = (uintptr_t) cq->buf.buf;
+	cmd.db_addr = (uintptr_t) cq->set_ci_db;
+
+	ret = ibv_cmd_create_cq_ex(context, cq_attr,
+				   &cq->ibv_cq, &cmd.ibv_cmd,
+				   sizeof(cmd.ibv_cmd),
+				   sizeof(cmd),
+				   &resp.ibv_resp,
+				   sizeof(resp.ibv_resp),
+				   sizeof(resp));
+	if (!ret)
+		cq->cqn = resp.cqn;
+
+	return ret;
+}
+
+static struct ibv_cq_ex *create_cq(struct ibv_context *context,
+				   struct ibv_cq_init_attr_ex *cq_attr,
+				   int cq_alloc_flags)
 {
-	struct mlx4_create_cq      cmd;
-	struct mlx4_create_cq_resp resp;
-	struct mlx4_cq		  *cq;
-	int			   ret;
-	struct mlx4_context       *mctx = to_mctx(context);
+	struct mlx4_cq		*cq;
+	int			 ret;
+	struct mlx4_context	*mctx = to_mctx(context);
 
 	/* Sanity check CQ size before proceeding */
-	if (cqe > 0x3fffff)
+	if (cq_attr->cqe > 0x3fffff) {
+		errno = EINVAL;
+		return NULL;
+	}
+
+	if (cq_attr->comp_mask & ~CREATE_CQ_SUPPORTED_COMP_MASK) {
+		errno = ENOTSUP;
+		return NULL;
+	}
+
+	if (cq_attr->comp_mask & IBV_CQ_INIT_ATTR_MASK_FLAGS &&
+	    cq_attr->flags & ~CREATE_CQ_SUPPORTED_FLAGS) {
+		errno = ENOTSUP;
+		return NULL;
+	}
+
+	if (cq_attr->wc_flags & ~CREATE_CQ_SUPPORTED_WC_FLAGS)
+		return NULL;
+
+	/* mlx4 devices don't support slid and sl in cqe when completion
+	 * timestamp is enabled in the CQ
+	 */
+	if ((cq_attr->wc_flags & (IBV_WC_EX_WITH_SLID | IBV_WC_EX_WITH_SL)) &&
+	    (cq_attr->wc_flags & IBV_WC_EX_WITH_COMPLETION_TIMESTAMP)) {
+		errno = ENOTSUP;
 		return NULL;
+	}
 
 	cq = malloc(sizeof *cq);
 	if (!cq)
@@ -327,9 +411,9 @@ struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe,
 	if (pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE))
 		goto err;
 
-	cqe = align_queue_size(cqe + 1);
+	cq_attr->cqe = align_queue_size(cq_attr->cqe + 1);
 
-	if (mlx4_alloc_cq_buf(to_mdev(context->device), &cq->buf, cqe, mctx->cqe_size))
+	if (mlx4_alloc_cq_buf(to_mdev(context->device), &cq->buf, cq_attr->cqe, mctx->cqe_size))
 		goto err;
 
 	cq->cqe_size = mctx->cqe_size;
@@ -341,19 +425,26 @@ struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe,
 	*cq->arm_db = 0;
 	cq->arm_sn  = 1;
 	*cq->set_ci_db = 0;
+	cq->flags = cq_alloc_flags;
 
-	cmd.buf_addr = (uintptr_t) cq->buf.buf;
-	cmd.db_addr  = (uintptr_t) cq->set_ci_db;
+	if (cq_attr->comp_mask & IBV_CQ_INIT_ATTR_MASK_FLAGS &&
+	    cq_attr->flags & IBV_CREATE_CQ_ATTR_SINGLE_THREADED)
+		cq->flags |= MLX4_CQ_FLAGS_SINGLE_THREADED;
+
+	--cq_attr->cqe;
+	if (cq_alloc_flags & MLX4_CQ_FLAGS_EXTENDED)
+		ret = mlx4_cmd_create_cq_ex(context, cq_attr, cq);
+	else
+		ret = mlx4_cmd_create_cq(context, cq_attr, cq);
 
-	ret = ibv_cmd_create_cq(context, cqe - 1, channel, comp_vector,
-				ibv_cq_ex_to_cq(&cq->ibv_cq), &cmd.ibv_cmd, sizeof(cmd),
-				&resp.ibv_resp, sizeof(resp));
 	if (ret)
 		goto err_db;
 
-	cq->cqn = resp.cqn;
-
-	return ibv_cq_ex_to_cq(&cq->ibv_cq);
+	if (cq_alloc_flags & MLX4_CQ_FLAGS_EXTENDED)
+		mlx4_cq_fill_pfns(cq, cq_attr);
+
+	return &cq->ibv_cq;
 
 err_db:
 	mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_CQ, cq->set_ci_db);
@@ -367,6 +458,36 @@ err:
 	return NULL;
 }
 
+struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe,
+			      struct ibv_comp_channel *channel,
+			      int comp_vector)
+{
+	struct ibv_cq_ex *cq;
+	struct ibv_cq_init_attr_ex cq_attr = {.cqe = cqe, .channel = channel,
+					      .comp_vector = comp_vector,
+					      .wc_flags = IBV_WC_STANDARD_FLAGS};
+
+	cq = create_cq(context, &cq_attr, 0);
+	return cq ? ibv_cq_ex_to_cq(cq) : NULL;
+}
+
+struct ibv_cq_ex *mlx4_create_cq_ex(struct ibv_context *context,
+				    struct ibv_cq_init_attr_ex *cq_attr)
+{
+	/*
+	 * Make local copy since some attributes might be adjusted
+	 * for internal use.
+	 */
+	struct ibv_cq_init_attr_ex cq_attr_c = {.cqe = cq_attr->cqe,
+						.channel = cq_attr->channel,
+						.comp_vector = cq_attr->comp_vector,
+						.wc_flags = cq_attr->wc_flags,
+						.comp_mask = cq_attr->comp_mask,
+						.flags = cq_attr->flags};
+
+	return create_cq(context, &cq_attr_c, MLX4_CQ_FLAGS_EXTENDED);
+}
+
 int mlx4_resize_cq(struct ibv_cq *ibcq, int cqe)
 {
 	struct mlx4_cq *cq = to_mcq(ibcq);
-- 
1.8.3.1
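
For reference, this is roughly what the consumer side looks like once the
patch is in place. It is only an illustrative sketch, not part of the patch:
it assumes "ctx" is an already-opened struct ibv_context and that QP creation
and work-request posting happen elsewhere. It goes through the generic
libibverbs extended-CQ entry points (ibv_create_cq_ex(), ibv_start_poll()/
ibv_next_poll()/ibv_end_poll(), ibv_wc_read_completion_ts()), which dispatch
to the provider callbacks installed by mlx4_cq_fill_pfns() above:

#include <errno.h>
#include <inttypes.h>
#include <stdio.h>
#include <infiniband/verbs.h>

static int poll_ex_sketch(struct ibv_context *ctx)
{
	struct ibv_cq_init_attr_ex cq_attr = {
		.cqe		= 128,
		.comp_vector	= 0,
		/* Timestamps on top of the standard fields; note that
		 * this patch rejects SLID/SL combined with the timestamp. */
		.wc_flags	= IBV_WC_EX_WITH_COMPLETION_TIMESTAMP,
		.comp_mask	= IBV_CQ_INIT_ATTR_MASK_FLAGS,
		/* Exactly one thread polls this CQ, so the provider may
		 * install the lock-free start/end_poll variants. */
		.flags		= IBV_CREATE_CQ_ATTR_SINGLE_THREADED,
	};
	struct ibv_poll_cq_attr poll_attr = {};
	struct ibv_cq_ex *cq;

	cq = ibv_create_cq_ex(ctx, &cq_attr);
	if (!cq)
		return errno;

	/* ... create a QP on this CQ and post work requests ... */

	/* ibv_end_poll() may only be called if ibv_start_poll()
	 * returned 0, i.e. the CQ iterator is valid. */
	if (!ibv_start_poll(cq, &poll_attr)) {
		do {
			if (cq->status == IBV_WC_SUCCESS)
				printf("wr_id %" PRIu64 " done, ts %" PRIu64 "\n",
				       cq->wr_id,
				       ibv_wc_read_completion_ts(cq));
		} while (!ibv_next_poll(cq));
		ibv_end_poll(cq);
	}

	return ibv_destroy_cq(ibv_cq_ex_to_cq(cq));
}

The single-threaded attribute is what becomes MLX4_CQ_FLAGS_SINGLE_THREADED,
so mlx4_cq_fill_pfns() wires start_poll/end_poll to the lock-free variants
instead of mlx4_start_poll_lock()/mlx4_end_poll_lock(). Note also that the
value returned by ibv_wc_read_completion_ts() is in device clock units, not
wall-clock time.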