[PATCH rdma-core 4/6] mlx5: Handle ODP fault completion in SRQ

From: Moni Shoua <monis@xxxxxxxxxxxx>

An SRQ WQE that has ODP buffers might be completed with an error and a
special syndrome. This indicates that the HW couldn't scatter the data
to the WQE buffers but still had to consume the WQE. This type of error
shouldn't be raised to the poller of the CQ but handled inside the
driver.
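
For context, an SRQ receive WQE references ODP buffers when the
application registers memory with IBV_ACCESS_ON_DEMAND and posts it to
the SRQ. A minimal sketch of that setup, assuming 'pd', 'srq' and the
buffer already exist (the helper name and error handling are
illustrative only, not part of this patch):

	#include <stddef.h>
	#include <stdint.h>
	#include <infiniband/verbs.h>

	/* Register an ODP-capable MR and post it as an SRQ receive buffer. */
	static int post_odp_recv(struct ibv_pd *pd, struct ibv_srq *srq,
				 void *buf, size_t len, uint64_t wr_id)
	{
		struct ibv_sge sge;
		struct ibv_recv_wr wr, *bad_wr;
		struct ibv_mr *mr;

		mr = ibv_reg_mr(pd, buf, len,
				IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
		if (!mr)
			return -1;

		sge.addr = (uintptr_t)buf;
		sge.length = len;
		sge.lkey = mr->lkey;

		wr.wr_id = wr_id;
		wr.next = NULL;
		wr.sg_list = &sge;
		wr.num_sge = 1;

		return ibv_post_srq_recv(srq, &wr, &bad_wr);
	}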

The WQE for which the completion arrived needs to be re-posted to
preserve the integrity of the SRQ from the application's point of view.
The re-posted WQE is taken from the SRQ head, which means that the
completed WQE becomes free. To prevent it from being posted again with
different addresses, which would interfere with the page-fault handler
in the kernel, this WQE is put in the wait queue to cool down.
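
The cooldown itself is a swap between the two rings the provider keeps
per SRQ: the completed index is appended to the wait queue, and the
wait-queue head takes its place among the free WQEs. A simplified model
of that bookkeeping, using a plain 'next' array instead of the linked
next_wqe_index fields in the real WQE buffer (names and sizes here are
illustrative only):

	#include <stdbool.h>
	#include <stdint.h>

	struct srq_model {
		uint16_t next[256];	/* stands in for next_wqe_index links */
		int tail;		/* last free WQE in the SRQ ring */
		int waitq_head;		/* -1 when there is no wait queue */
		int waitq_tail;
	};

	/* Park the completed index at the wait-queue tail and pull the
	 * wait-queue head back into the free list, mirroring what
	 * srq_cooldown_wqe() below does on the real WQE buffer.
	 */
	static bool cooldown_wqe(struct srq_model *s, int ind)
	{
		if (s->waitq_head < 0)
			return false;

		/* the completed WQE goes last in the wait queue */
		s->next[s->waitq_tail] = ind;
		s->waitq_tail = ind;

		/* the wait-queue head becomes the new free tail */
		s->next[s->tail] = s->waitq_head;
		s->tail = s->waitq_head;
		s->waitq_head = s->next[s->waitq_head];

		return true;
	}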

Signed-off-by: Moni Shoua <monis@xxxxxxxxxxxx>
Reviewed-by: Artemy Kovalyov <artemyko@xxxxxxxxxxxx>
Signed-off-by: Yishai Hadas <yishaih@xxxxxxxxxxxx>
---
 providers/mlx5/cq.c     | 33 +++++++++++++++---
 providers/mlx5/mlx5.h   |  3 ++
 providers/mlx5/mlx5dv.h |  4 +++
 providers/mlx5/srq.c    | 89 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 124 insertions(+), 5 deletions(-)

diff --git a/providers/mlx5/cq.c b/providers/mlx5/cq.c
index 6f5c9f1..b9b47df 100644
--- a/providers/mlx5/cq.c
+++ b/providers/mlx5/cq.c
@@ -49,7 +49,8 @@
 enum {
 	CQ_OK					=  0,
 	CQ_EMPTY				= -1,
-	CQ_POLL_ERR				= -2
+	CQ_POLL_ERR				= -2,
+	CQ_POLL_NODATA				= ENOENT
 };
 
 enum {
@@ -659,6 +660,12 @@ static int handle_tag_matching(struct mlx5_cq *cq,
 	return CQ_OK;
 }
 
+static inline int is_odp_pfault_err(struct mlx5_err_cqe *ecqe)
+{
+	return ecqe->syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR &&
+	       ecqe->vendor_err_synd == MLX5_CQE_VENDOR_SYNDROME_ODP_PFAULT;
+}
+
 static inline int mlx5_parse_cqe(struct mlx5_cq *cq,
 				 struct mlx5_cqe64 *cqe64,
 				 void *cqe,
@@ -682,10 +689,14 @@ static inline int mlx5_parse_cqe(struct mlx5_cq *cq,
 	int idx;
 	uint8_t opcode;
 	struct mlx5_err_cqe *ecqe;
-	int err = 0;
+	int err;
 	struct mlx5_qp *mqp;
 	struct mlx5_context *mctx;
-	uint8_t is_srq = 0;
+	uint8_t is_srq;
+
+again:
+	is_srq = 0;
+	err = 0;
 
 	mctx = to_mctx(ibv_cq_ex_to_cq(&cq->ibv_cq)->context);
 	qpn = be32toh(cqe64->sop_drop_qpn) & 0xffffff;
@@ -811,7 +822,8 @@ static inline int mlx5_parse_cqe(struct mlx5_cq *cq,
 			wc->vendor_err = ecqe->vendor_err_synd;
 
 		if (unlikely(ecqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR &&
-			     ecqe->syndrome != MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR)) {
+			     ecqe->syndrome != MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR &&
+			     !is_odp_pfault_err(ecqe))) {
 			FILE *fp = mctx->dbg_fp;
 			fprintf(fp, PFX "%s: got completion with error:\n",
 				mctx->hostname);
@@ -844,6 +856,17 @@ static inline int mlx5_parse_cqe(struct mlx5_cq *cq,
 
 			if (is_srq) {
 				wqe_ctr = be16toh(cqe64->wqe_counter);
+				if (is_odp_pfault_err(ecqe)) {
+					mlx5_complete_odp_fault(*cur_srq, wqe_ctr);
+					err = mlx5_get_next_cqe(cq, &cqe64, &cqe);
+					/* CQ_POLL_NODATA indicates that the CQ was not empty but the polled CQE
+					 * was handled internally and should not be processed by the caller.
+					 */
+					if (err == CQ_EMPTY)
+						return CQ_POLL_NODATA;
+					goto again;
+				}
+
 				if (lazy)
 					cq->ibv_cq.wr_id = (*cur_srq)->wrid[wqe_ctr];
 				else
@@ -1060,7 +1083,7 @@ static inline int mlx5_start_poll(struct ibv_cq_ex *ibcq, struct ibv_poll_cq_att
 	if (lock && err)
 		mlx5_spin_unlock(&cq->lock);
 
-	if (stall && err) {
+	if (stall && err == CQ_POLL_ERR) {
 		if (stall == POLLING_MODE_STALL_ADAPTIVE) {
 			cq->stall_cycles = max(cq->stall_cycles - mlx5_stall_cq_dec_step,
 						mlx5_stall_cq_poll_min);
diff --git a/providers/mlx5/mlx5.h b/providers/mlx5/mlx5.h
index f315f63..9129c0f 100644
--- a/providers/mlx5/mlx5.h
+++ b/providers/mlx5/mlx5.h
@@ -811,6 +811,7 @@ int mlx5_query_srq(struct ibv_srq *srq,
 int mlx5_destroy_srq(struct ibv_srq *srq);
 int mlx5_alloc_srq_buf(struct ibv_context *context, struct mlx5_srq *srq,
 		       uint32_t nwr);
+void mlx5_complete_odp_fault(struct mlx5_srq *srq, int ind);
 void mlx5_free_srq_wqe(struct mlx5_srq *srq, int ind);
 int mlx5_post_srq_recv(struct ibv_srq *ibsrq,
 		       struct ibv_recv_wr *wr,
@@ -1030,4 +1031,6 @@ static inline bool srq_has_waitq(struct mlx5_srq *srq)
 	return srq->waitq_head >= 0;
 }
 
+bool srq_cooldown_wqe(struct mlx5_srq *srq, int ind);
+
 #endif /* MLX5_H */
diff --git a/providers/mlx5/mlx5dv.h b/providers/mlx5/mlx5dv.h
index 796ea7b..e2788d8 100644
--- a/providers/mlx5/mlx5dv.h
+++ b/providers/mlx5/mlx5dv.h
@@ -512,6 +512,10 @@ enum {
 };
 
 enum {
+	MLX5_CQE_VENDOR_SYNDROME_ODP_PFAULT		= 0x93,
+};
+
+enum {
 	MLX5_CQE_L2_OK = 1 << 0,
 	MLX5_CQE_L3_OK = 1 << 1,
 	MLX5_CQE_L4_OK = 1 << 2,
diff --git a/providers/mlx5/srq.c b/providers/mlx5/srq.c
index a2d37d0..71d74a7 100644
--- a/providers/mlx5/srq.c
+++ b/providers/mlx5/srq.c
@@ -82,6 +82,95 @@ void mlx5_free_srq_wqe(struct mlx5_srq *srq, int ind)
 	mlx5_spin_unlock(&srq->lock);
 }
 
+/* Take an index and put it at the tail of the wait queue */
+static void srq_put_in_waitq(struct mlx5_srq *srq, int ind)
+{
+	struct mlx5_wqe_srq_next_seg *waitq_tail;
+
+	waitq_tail = get_wqe(srq, srq->waitq_tail);
+	waitq_tail->next_wqe_index = htobe16(ind);
+	srq->waitq_tail = ind;
+}
+
+/* Take the first index in the wait queue and put it at the tail of the SRQ */
+static void srq_get_from_waitq(struct mlx5_srq *srq)
+{
+	struct mlx5_wqe_srq_next_seg *tail;
+	struct mlx5_wqe_srq_next_seg *waitq_head;
+
+	tail = get_wqe(srq, srq->tail);
+	waitq_head = get_wqe(srq, srq->waitq_head);
+
+	tail->next_wqe_index = htobe16(srq->waitq_head);
+	srq->tail = srq->waitq_head;
+	srq->waitq_head = be16toh(waitq_head->next_wqe_index);
+}
+
+/* Put the given WQE that is in SW ownership at the end of the wait queue.
+ * Take a WQE from the wait queue and add it to WQEs in SW ownership instead.
+ */
+bool srq_cooldown_wqe(struct mlx5_srq *srq, int ind)
+{
+	if (!srq_has_waitq(srq))
+		return false;
+
+	srq_put_in_waitq(srq, ind);
+	srq_get_from_waitq(srq);
+	return true;
+}
+
+/* Post a WQE internally, based on a previous application post.
+ * Copy a given WQE's data segments to the SRQ head, advance the head
+ * and ring the HW doorbell.
+ */
+static void srq_repost(struct mlx5_srq *srq, int ind)
+{
+	struct mlx5_wqe_srq_next_seg *src, *dst;
+	struct mlx5_wqe_data_seg *src_scat, *dst_scat;
+	int i;
+
+	srq->wrid[srq->head] = srq->wrid[ind];
+
+	src = get_wqe(srq, ind);
+	dst = get_wqe(srq, srq->head);
+	src_scat = (struct mlx5_wqe_data_seg *)(src + 1);
+	dst_scat = (struct mlx5_wqe_data_seg *)(dst + 1);
+
+	for (i = 0; i < srq->max_gs; ++i) {
+		dst_scat[i] = src_scat[i];
+
+		if (dst_scat[i].lkey == htobe32(MLX5_INVALID_LKEY))
+			break;
+	}
+
+	srq->head = be16toh(dst->next_wqe_index);
+	srq->counter++;
+	/* Flush descriptors */
+	udma_to_device_barrier();
+	*srq->db = htobe32(srq->counter);
+}
+
+void mlx5_complete_odp_fault(struct mlx5_srq *srq, int ind)
+{
+	mlx5_spin_lock(&srq->lock);
+
+	if (!srq_cooldown_wqe(srq, ind)) {
+		struct mlx5_wqe_srq_next_seg *tail = get_wqe(srq, srq->tail);
+
+		/* Without a wait queue, put the page-faulted WQE
+		 * back at the SRQ tail. The repost is still possible, but
+		 * the risk of overwriting the page-faulted WQE with a future
+		 * post_srq_recv() is now higher.
+		 */
+		tail->next_wqe_index = htobe16(ind);
+		srq->tail = ind;
+	}
+
+	srq_repost(srq, ind);
+
+	mlx5_spin_unlock(&srq->lock);
+}
+
 int mlx5_post_srq_recv(struct ibv_srq *ibsrq,
 		       struct ibv_recv_wr *wr,
 		       struct ibv_recv_wr **bad_wr)
-- 
1.8.3.1



