Introduce the mlx5dv_wr_mr_list() post send builder, to be used for
issuing a WR that registers a memory layout based on a list of
ibv_sge.

Reviewed-by: Artemy Kovalyov <artemyko@xxxxxxxxxxxx>
Signed-off-by: Yishai Hadas <yishaih@xxxxxxxxxxxx>
---
 providers/mlx5/man/mlx5dv_create_qp.3.md |  3 +
 providers/mlx5/man/mlx5dv_wr_post.3.md   | 20 +++++++
 providers/mlx5/mlx5dv.h                  | 15 +++++
 providers/mlx5/qp.c                      | 97 +++++++++++++++++++++++++++-----
 providers/mlx5/verbs.c                   |  3 +-
 5 files changed, 122 insertions(+), 16 deletions(-)

diff --git a/providers/mlx5/man/mlx5dv_create_qp.3.md b/providers/mlx5/man/mlx5dv_create_qp.3.md
index 74a2193..856c69a 100644
--- a/providers/mlx5/man/mlx5dv_create_qp.3.md
+++ b/providers/mlx5/man/mlx5dv_create_qp.3.md
@@ -104,6 +104,9 @@ struct mlx5dv_dc_init_attr {
 	MLX5DV_QP_EX_WITH_MR_INTERLEAVED:
 		Enables the mlx5dv_wr_mr_interleaved() work requset on this QP.
 
+	MLX5DV_QP_EX_WITH_MR_LIST:
+		Enables the mlx5dv_wr_mr_list() work request on this QP.
+
 # NOTES
 
 **mlx5dv_qp_ex_from_ibv_qp_ex()** is used to get *struct mlx5dv_qp_ex* for
diff --git a/providers/mlx5/man/mlx5dv_wr_post.3.md b/providers/mlx5/man/mlx5dv_wr_post.3.md
index 42e680c..0f7ff4e 100644
--- a/providers/mlx5/man/mlx5dv_wr_post.3.md
+++ b/providers/mlx5/man/mlx5dv_wr_post.3.md
@@ -36,6 +36,12 @@ static inline void mlx5dv_wr_mr_interleaved(struct mlx5dv_qp_ex *mqp,
 					    uint32_t repeat_count,
 					    uint16_t num_interleaved,
 					    struct mlx5dv_mr_interleaved *data);
+
+static inline void mlx5dv_wr_mr_list(struct mlx5dv_qp_ex *mqp,
+				     struct mlx5dv_mkey *mkey,
+				     uint32_t access_flags, /* use enum ibv_access_flags */
+				     uint16_t num_sges,
+				     struct ibv_sge *sge);
 ```
 
 # DESCRIPTION
@@ -80,6 +86,20 @@ man for ibv_wr_post and mlx5dv_qp with its available builders and setters.
   In case *ibv_qp_ex->wr_flags* turns on IBV_SEND_SIGNALED, the reported WC opcode will be MLX5DV_WC_UMR.
   Unregister the *mkey* to enable another pattern registration should be done via ibv_post_send with IBV_WR_LOCAL_INV opcode.
 
+: *mlx5dv_wr_mr_list()*
+
+    registers a memory layout based on a list of *ibv_sge*.
+    The layout of the memory pointed to by the *mkey* after its registration will be based on the list of *sge* entries, whose number is given by *num_sges*.
+    After a successful registration, RDMA operations can use this *mkey*; the hardware will scatter the data according to the pattern.
+    The *mkey* should be used in a zero-based mode: the *addr* field of its *ibv_sge* is an offset in the total data.
+
+    The current implementation requires the IBV_SEND_INLINE option to be turned on in the *ibv_qp_ex->wr_flags* field.
+    To allow more than 4 *num_sges* entries, the QP should be created with a WQE size large enough to fit them.
+    This should be done using the *max_inline_data* attribute of *struct ibv_qp_cap* upon QP creation.
+
+    In case *ibv_qp_ex->wr_flags* turns on IBV_SEND_SIGNALED, the reported WC opcode will be MLX5DV_WC_UMR.
+    Unregistering the *mkey*, to enable registration of another pattern, should be done via ibv_post_send with the IBV_WR_LOCAL_INV opcode.
+
 ## QP Specific setters
 
 *DCI* QPs
diff --git a/providers/mlx5/mlx5dv.h b/providers/mlx5/mlx5dv.h
index c5aae57..8b88026 100644
--- a/providers/mlx5/mlx5dv.h
+++ b/providers/mlx5/mlx5dv.h
@@ -204,6 +204,7 @@ struct mlx5dv_dc_init_attr {
 
 enum mlx5dv_qp_create_send_ops_flags {
 	MLX5DV_QP_EX_WITH_MR_INTERLEAVED	= 1 << 0,
+	MLX5DV_QP_EX_WITH_MR_LIST		= 1 << 1,
 };
 
 struct mlx5dv_qp_init_attr {
@@ -242,6 +243,11 @@ struct mlx5dv_qp_ex {
 			       uint32_t repeat_count,
 			       uint16_t num_interleaved,
 			       struct mlx5dv_mr_interleaved *data);
+	void (*wr_mr_list)(struct mlx5dv_qp_ex *mqp,
+			   struct mlx5dv_mkey *mkey,
+			   uint32_t access_flags, /* use enum ibv_access_flags */
+			   uint16_t num_sges,
+			   struct ibv_sge *sge);
 };
 
 struct mlx5dv_qp_ex *mlx5dv_qp_ex_from_ibv_qp_ex(struct ibv_qp_ex *qp);
@@ -265,6 +271,15 @@ static inline void mlx5dv_wr_mr_interleaved(struct mlx5dv_qp_ex *mqp,
 				num_interleaved, data);
 }
 
+static inline void mlx5dv_wr_mr_list(struct mlx5dv_qp_ex *mqp,
+				     struct mlx5dv_mkey *mkey,
+				     uint32_t access_flags,
+				     uint16_t num_sges,
+				     struct ibv_sge *sge)
+{
+	mqp->wr_mr_list(mqp, mkey, access_flags, num_sges, sge);
+}
+
 enum mlx5dv_flow_action_esp_mask {
 	MLX5DV_FLOW_ACTION_ESP_MASK_FLAGS	= 1 << 0,
 };
diff --git a/providers/mlx5/qp.c b/providers/mlx5/qp.c
index ecfe844..7707c2f 100644
--- a/providers/mlx5/qp.c
+++ b/providers/mlx5/qp.c
@@ -2059,6 +2059,40 @@ static uint8_t get_umr_mr_flags(uint32_t acc)
 		MLX5_WQE_MKEY_CONTEXT_ACCESS_FLAGS_LOCAL_WRITE  : 0));
 }
 
+static int umr_sg_list_create(struct mlx5_qp *qp,
+			      uint16_t num_sges,
+			      struct ibv_sge *sge,
+			      void *seg,
+			      void *qend, int *size, int *xlat_size,
+			      uint64_t *reglen)
+{
+	struct mlx5_wqe_data_seg *dseg;
+	int byte_count = 0;
+	int i;
+	size_t tmp;
+
+	dseg = seg;
+
+	for (i = 0; i < num_sges; i++, dseg++) {
+		if (unlikely(dseg == qend))
+			dseg = mlx5_get_send_wqe(qp, 0);
+
+		dseg->addr = htobe64(sge[i].addr);
+		dseg->lkey = htobe32(sge[i].lkey);
+		dseg->byte_count = htobe32(sge[i].length);
+		byte_count += sge[i].length;
+	}
+
+	tmp = align(num_sges, 4) - num_sges;
+	memset(dseg, 0, tmp * sizeof(*dseg));
+
+	*size = align(num_sges * sizeof(*dseg), 64);
+	*reglen = byte_count;
+	*xlat_size = num_sges * sizeof(*dseg);
+
+	return 0;
+}
+
 /* The strided block format is as the following:
  * | repeat_block | entry_block | entry_block |...| entry_block |
  * While the repeat entry contains details on the list of the block_entries.
@@ -2109,12 +2143,13 @@ static void umr_strided_seg_create(struct mlx5_qp *qp,
 	*xlat_size = (num_interleaved + 1) * sizeof(*eb);
 }
 
-static void mlx5_send_wr_mr_interleaved(struct mlx5dv_qp_ex *dv_qp,
-					struct mlx5dv_mkey *dv_mkey,
-					uint32_t access_flags,
-					uint32_t repeat_count,
-					uint16_t num_interleaved,
-					struct mlx5dv_mr_interleaved *data)
+static void mlx5_send_wr_mr(struct mlx5dv_qp_ex *dv_qp,
+			    struct mlx5dv_mkey *dv_mkey,
+			    uint32_t access_flags,
+			    uint32_t repeat_count,
+			    uint16_t num_entries,
+			    struct mlx5dv_mr_interleaved *data,
+			    struct ibv_sge *sge)
 {
 	struct mlx5_qp *mqp = mqp_from_mlx5dv_qp_ex(dv_qp);
 	struct ibv_qp_ex *ibqp = &mqp->verbs_qp.qp_ex;
@@ -2134,12 +2169,17 @@ static void mlx5_send_wr_mr_interleaved(struct mlx5dv_qp_ex *dv_qp,
 		return;
 	}
 
-	max_entries = min_t(size_t,
-			    (mqp->max_inline_data + sizeof(struct mlx5_wqe_inl_data_seg)) /
-			    sizeof(struct mlx5_wqe_umr_repeat_ent_seg) - 1,
-			    mkey->num_desc);
-
-	if (unlikely(num_interleaved > max_entries)) {
+	max_entries = data ?
+		min_t(size_t,
+		      (mqp->max_inline_data + sizeof(struct mlx5_wqe_inl_data_seg)) /
+		      sizeof(struct mlx5_wqe_umr_repeat_ent_seg) - 1,
+		      mkey->num_desc) :
+		min_t(size_t,
+		      (mqp->max_inline_data + sizeof(struct mlx5_wqe_inl_data_seg)) /
+		      sizeof(struct mlx5_wqe_data_seg),
+		      mkey->num_desc);
+
+	if (unlikely(num_entries > max_entries)) {
 		mqp->err = ENOMEM;
 		return;
 	}
@@ -2184,8 +2224,13 @@ static void mlx5_send_wr_mr_interleaved(struct mlx5dv_qp_ex *dv_qp,
 	if (unlikely(seg == qend))
 		seg = mlx5_get_send_wqe(mqp, 0);
 
-	umr_strided_seg_create(mqp, repeat_count, num_interleaved, data,
-			       seg, qend, &size, &xlat_size, &reglen);
+	if (data)
+		umr_strided_seg_create(mqp, repeat_count, num_entries, data,
+				       seg, qend, &size, &xlat_size, &reglen);
+	else
+		umr_sg_list_create(mqp, num_entries, sge, seg,
+				   qend, &size, &xlat_size, &reglen);
+
 	mk->len = htobe64(reglen);
 	umr_ctrl_seg->klm_octowords = htobe16(align(xlat_size, 64) / 16);
 	mqp->cur_size += size / 16;
@@ -2197,6 +2242,26 @@ static void mlx5_send_wr_mr_interleaved(struct mlx5dv_qp_ex *dv_qp,
 	_common_wqe_finilize(mqp);
 }
 
+static void mlx5_send_wr_mr_interleaved(struct mlx5dv_qp_ex *dv_qp,
+					struct mlx5dv_mkey *mkey,
+					uint32_t access_flags,
+					uint32_t repeat_count,
+					uint16_t num_interleaved,
+					struct mlx5dv_mr_interleaved *data)
+{
+	mlx5_send_wr_mr(dv_qp, mkey, access_flags, repeat_count,
+			num_interleaved, data, NULL);
+}
+
+static inline void mlx5_send_wr_mr_list(struct mlx5dv_qp_ex *dv_qp,
+					struct mlx5dv_mkey *mkey,
+					uint32_t access_flags,
+					uint16_t num_sges,
+					struct ibv_sge *sge)
+{
+	mlx5_send_wr_mr(dv_qp, mkey, access_flags, 0, num_sges, NULL, sge);
+}
+
 static void mlx5_send_wr_set_dc_addr(struct mlx5dv_qp_ex *dv_qp,
 				     struct ibv_ah *ah,
 				     uint32_t remote_dctn,
@@ -2343,11 +2408,13 @@ int mlx5_qp_fill_wr_pfns(struct mlx5_qp *mqp,
 
 		if (mlx5_ops) {
 			if (!check_comp_mask(mlx5_ops,
-					     MLX5DV_QP_EX_WITH_MR_INTERLEAVED))
+					     MLX5DV_QP_EX_WITH_MR_INTERLEAVED |
+					     MLX5DV_QP_EX_WITH_MR_LIST))
 				return EOPNOTSUPP;
 
 			dv_qp = &mqp->dv_qp;
 			dv_qp->wr_mr_interleaved = mlx5_send_wr_mr_interleaved;
+			dv_qp->wr_mr_list = mlx5_send_wr_mr_list;
 		}
 
 		break;
diff --git a/providers/mlx5/verbs.c b/providers/mlx5/verbs.c
index 136c0d2..831ea46 100644
--- a/providers/mlx5/verbs.c
+++ b/providers/mlx5/verbs.c
@@ -1153,7 +1153,8 @@ static int _sq_overhead(struct mlx5_qp *qp,
 			   sizeof(struct mlx5_wqe_atomic_seg);
 
 	if (ops & (IBV_QP_EX_WITH_BIND_MW | IBV_QP_EX_WITH_LOCAL_INV) ||
-	    (mlx5_ops & MLX5DV_QP_EX_WITH_MR_INTERLEAVED))
+	    (mlx5_ops & (MLX5DV_QP_EX_WITH_MR_INTERLEAVED |
+			 MLX5DV_QP_EX_WITH_MR_LIST)))
 		mw_size = sizeof(struct mlx5_wqe_ctrl_seg) +
 			  sizeof(struct mlx5_wqe_umr_ctrl_seg) +
 			  sizeof(struct mlx5_wqe_mkey_context_seg) +
-- 
1.8.3.1
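
For reviewers, a minimal QP-setup sketch, not part of the patch: it shows
one way a QP able to post mlx5dv_wr_mr_list() might be created. The helper
name setup_qp_for_mr_list() is hypothetical, and the sketch assumes the
send_ops_flags field of struct mlx5dv_qp_init_attr introduced earlier in
this series; error handling is omitted for brevity.

#include <infiniband/mlx5dv.h>

static struct ibv_qp *setup_qp_for_mr_list(struct ibv_context *ctx,
					   struct ibv_pd *pd,
					   struct ibv_cq *cq)
{
	struct ibv_qp_init_attr_ex attr = {
		.qp_type = IBV_QPT_RC,
		.send_cq = cq,
		.recv_cq = cq,
		.cap = {
			.max_send_wr = 16,
			.max_recv_wr = 16,
			.max_send_sge = 1,
			.max_recv_sge = 1,
			/* Enlarge the WQE so that more than 4 data
			 * segments fit inline, per the man page note. */
			.max_inline_data = 128,
		},
		.pd = pd,
		.comp_mask = IBV_QP_INIT_ATTR_PD |
			     IBV_QP_INIT_ATTR_SEND_OPS_FLAGS,
		.send_ops_flags = IBV_QP_EX_WITH_RDMA_WRITE,
	};
	struct mlx5dv_qp_init_attr dv_attr = {
		.comp_mask = MLX5DV_QP_INIT_ATTR_MASK_SEND_OPS_FLAGS,
		.send_ops_flags = MLX5DV_QP_EX_WITH_MR_LIST,
	};

	return mlx5dv_create_qp(ctx, &attr, &dv_attr);
}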
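
And a matching post sketch under the same assumptions: mkey is taken to be
an indirect mkey obtained via mlx5dv_create_mkey() from earlier in this
series, mr is an ibv_mr covering buf, and the chunk sizes are arbitrary.
The function name post_mr_list() is again hypothetical.

static int post_mr_list(struct ibv_qp *qp, struct mlx5dv_mkey *mkey,
			struct ibv_mr *mr, uint8_t *buf)
{
	struct ibv_qp_ex *qpx = ibv_qp_to_qp_ex(qp);
	struct mlx5dv_qp_ex *mqp = mlx5dv_qp_ex_from_ibv_qp_ex(qpx);
	/* addr/lkey refer to the underlying MR; later consumers of the
	 * mkey use zero-based offsets into the combined 8KB region. */
	struct ibv_sge sges[2] = {
		{ .addr = (uintptr_t)buf, .length = 4096,
		  .lkey = mr->lkey },
		{ .addr = (uintptr_t)(buf + 8192), .length = 4096,
		  .lkey = mr->lkey },
	};

	ibv_wr_start(qpx);
	qpx->wr_id = 0x1;
	/* IBV_SEND_INLINE is mandatory for this builder */
	qpx->wr_flags = IBV_SEND_INLINE | IBV_SEND_SIGNALED;
	mlx5dv_wr_mr_list(mqp, mkey,
			  IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE,
			  2, sges);
	return ibv_wr_complete(qpx);
}

Since IBV_SEND_SIGNALED is set here, the registration completes with a WC
whose opcode is MLX5DV_WC_UMR, as the man page change describes.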