From: Moni Shoua <monis@xxxxxxxxxxxx> When allocating the WQE buffer, try to allocate more space than required. The extra space will serve as a place for WQEs that were recently switched from HW ownership to SW ownership to cool down before being posted again. This is useful for WQEs with ODP buffers that were consumed by HW but haven't yet been handled by the page-fault handler in the kernel. The policy of the wait queue is FIFO, so a WQE gets out of the wait queue after N-1 WQEs get in, where N is the size of the wait queue. WQEs in the wait queue are considered to be in SW ownership, except that they are not counted as candidates for posting. This means that WQEs in the wait queue aren't in HW ownership while there. Putting a WQE in the wait queue means that it's no longer available for posting. When that happens, another WQE in the wait queue needs to be taken out of there to replace it. Having a wait queue is not mandatory. If the extra resources that are required for the wait queue are beyond the limits of the system, then the SRQ will operate without a wait queue. 
Signed-off-by: Moni Shoua <monis@xxxxxxxxxxxx> Reviewed-by: Artemy Kovalyov <artemyko@xxxxxxxxxxxx> Signed-off-by: Yishai Hadas <yishaih@xxxxxxxxxxxx> --- providers/mlx5/mlx5.h | 15 +++++++++++- providers/mlx5/srq.c | 63 +++++++++++++++++++++++++++++++++++++++----------- providers/mlx5/verbs.c | 33 ++++++++++++++++++-------- 3 files changed, 87 insertions(+), 24 deletions(-) diff --git a/providers/mlx5/mlx5.h b/providers/mlx5/mlx5.h index 75d599a..f315f63 100644 --- a/providers/mlx5/mlx5.h +++ b/providers/mlx5/mlx5.h @@ -415,6 +415,8 @@ struct mlx5_srq { int wqe_shift; int head; int tail; + int waitq_head; + int waitq_tail; __be32 *db; uint16_t counter; int wq_sig; @@ -807,7 +809,8 @@ int mlx5_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr, int mlx5_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr); int mlx5_destroy_srq(struct ibv_srq *srq); -int mlx5_alloc_srq_buf(struct ibv_context *context, struct mlx5_srq *srq); +int mlx5_alloc_srq_buf(struct ibv_context *context, struct mlx5_srq *srq, + uint32_t nwr); void mlx5_free_srq_wqe(struct mlx5_srq *srq, int ind); int mlx5_post_srq_recv(struct ibv_srq *ibsrq, struct ibv_recv_wr *wr, @@ -1017,4 +1020,14 @@ static inline uint8_t calc_sig(void *wqe, int size) return ~res; } +static inline int align_queue_size(long long req) +{ + return mlx5_round_up_power_of_two(req); +} + +static inline bool srq_has_waitq(struct mlx5_srq *srq) +{ + return srq->waitq_head >= 0; +} + #endif /* MLX5_H */ diff --git a/providers/mlx5/srq.c b/providers/mlx5/srq.c index 94528bb..a2d37d0 100644 --- a/providers/mlx5/srq.c +++ b/providers/mlx5/srq.c @@ -145,13 +145,29 @@ int mlx5_post_srq_recv(struct ibv_srq *ibsrq, return err; } -int mlx5_alloc_srq_buf(struct ibv_context *context, struct mlx5_srq *srq) +/* Build a linked list on an array of SRQ WQEs. + * Since WQEs are always added to the tail and taken from the head + * it doesn't matter where the last WQE points to. 
+ */ +static void set_srq_buf_ll(struct mlx5_srq *srq, int start, int end) { struct mlx5_wqe_srq_next_seg *next; + int i; + + for (i = start; i < end; ++i) { + next = get_wqe(srq, i); + next->next_wqe_index = htobe16(i + 1); + } +} + +int mlx5_alloc_srq_buf(struct ibv_context *context, struct mlx5_srq *srq, + uint32_t max_wr) +{ int size; int buf_size; - int i; struct mlx5_context *ctx; + uint32_t orig_max_wr = max_wr; + bool have_wq = true; ctx = to_mctx(context); @@ -160,9 +176,18 @@ int mlx5_alloc_srq_buf(struct ibv_context *context, struct mlx5_srq *srq) return -1; } - srq->wrid = malloc(srq->max * sizeof *srq->wrid); - if (!srq->wrid) - return -1; + /* At first, try to allocate more WQEs than requested so the extra will + * be used for the wait queue. + */ + max_wr = orig_max_wr * 2 + 1; + + if (max_wr > ctx->max_srq_recv_wr) { + /* Device limits are smaller than required + * to provide a wait queue, continue without. + */ + max_wr = orig_max_wr + 1; + have_wq = false; + } size = sizeof(struct mlx5_wqe_srq_next_seg) + srq->max_gs * sizeof(struct mlx5_wqe_data_seg); @@ -179,14 +204,28 @@ int mlx5_alloc_srq_buf(struct ibv_context *context, struct mlx5_srq *srq) srq->wqe_shift = mlx5_ilog2(size); + srq->max = align_queue_size(max_wr); buf_size = srq->max * size; if (mlx5_alloc_buf(&srq->buf, buf_size, - to_mdev(context->device)->page_size)) { - free(srq->wrid); + to_mdev(context->device)->page_size)) return -1; + + srq->head = 0; + srq->tail = align_queue_size(orig_max_wr + 1) - 1; + if (have_wq) { + srq->waitq_head = srq->tail + 1; + srq->waitq_tail = srq->max - 1; + } else { + srq->waitq_head = -1; + srq->waitq_tail = -1; } + srq->wrid = malloc(srq->max * sizeof(*srq->wrid)); + if (!srq->wrid) { + mlx5_free_buf(&srq->buf); + return -1; + } memset(srq->buf.buf, 0, buf_size); /* @@ -194,13 +233,9 @@ int mlx5_alloc_srq_buf(struct ibv_context *context, struct mlx5_srq *srq) * linked into the list of free WQEs. 
*/ - for (i = 0; i < srq->max; ++i) { - next = get_wqe(srq, i); - next->next_wqe_index = htobe16((i + 1) & (srq->max - 1)); - } - - srq->head = 0; - srq->tail = srq->max - 1; + set_srq_buf_ll(srq, srq->head, srq->tail); + if (have_wq) + set_srq_buf_ll(srq, srq->waitq_head, srq->waitq_tail); return 0; } diff --git a/providers/mlx5/verbs.c b/providers/mlx5/verbs.c index 7e1c125..2bccdf8 100644 --- a/providers/mlx5/verbs.c +++ b/providers/mlx5/verbs.c @@ -553,11 +553,6 @@ int mlx5_round_up_power_of_two(long long sz) return (int)ret; } -static int align_queue_size(long long req) -{ - return mlx5_round_up_power_of_two(req); -} - static int get_cqe_size(struct mlx5dv_cq_init_attr *mlx5cq_attr) { char *env; @@ -1016,11 +1011,10 @@ struct ibv_srq *mlx5_create_srq(struct ibv_pd *pd, goto err; } - srq->max = align_queue_size(attr->attr.max_wr + 1); srq->max_gs = attr->attr.max_sge; srq->counter = 0; - if (mlx5_alloc_srq_buf(pd->context, srq)) { + if (mlx5_alloc_srq_buf(pd->context, srq, attr->attr.max_wr)) { fprintf(stderr, "%s-%d:\n", __func__, __LINE__); goto err; } @@ -1041,11 +1035,22 @@ struct ibv_srq *mlx5_create_srq(struct ibv_pd *pd, attr->attr.max_sge = srq->max_gs; pthread_mutex_lock(&ctx->srq_table_mutex); + + /* Override max_wr to let kernel know about extra WQEs for the + * wait queue. + */ + attr->attr.max_wr = srq->max - 1; + ret = ibv_cmd_create_srq(pd, ibsrq, attr, &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, sizeof(resp)); if (ret) goto err_db; + /* Override kernel response that includes the wait queue with the real + * number of WQEs that are applicable for the application. 
+ */ + attr->attr.max_wr = srq->tail; + ret = mlx5_store_srq(ctx, resp.srqn, srq); if (ret) goto err_destroy; @@ -2707,11 +2712,10 @@ struct ibv_srq *mlx5_create_srq_ex(struct ibv_context *context, goto err; } - msrq->max = align_queue_size(attr->attr.max_wr + 1); msrq->max_gs = attr->attr.max_sge; msrq->counter = 0; - if (mlx5_alloc_srq_buf(context, msrq)) { + if (mlx5_alloc_srq_buf(context, msrq, attr->attr.max_wr)) { fprintf(stderr, "%s-%d:\n", __func__, __LINE__); goto err; } @@ -2743,9 +2747,20 @@ struct ibv_srq *mlx5_create_srq_ex(struct ibv_context *context, pthread_mutex_lock(&ctx->srq_table_mutex); } + /* Override max_wr to let kernel know about extra WQEs for the + * wait queue. + */ + attr->attr.max_wr = msrq->max - 1; + err = ibv_cmd_create_srq_ex(context, &msrq->vsrq, sizeof(msrq->vsrq), attr, &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, sizeof(resp)); + + /* Override kernel response that includes the wait queue with the real + * number of WQEs that are applicable for the application. + */ + attr->attr.max_wr = msrq->tail; + if (err) goto err_free_uidx; -- 1.8.3.1