This patch adds support for reporting flush completions. The following
is an overview of the algorithm used:

Step-1: Poll a completion from the h/w CQ.
Step-2: Check the status. If it is an error, go to Step-3; otherwise
        report the completion to the consumer based on the con_idx
        reported.
Step-3: Report the completion with the actual error to the consumer,
        and, without regard to the con_idx reported in the completion,
        do the following:
        3a. Add this QP to the CQ flush list if it is not there
            already. If this is a req-error, add the QP to the
            send-flush list, else add it to the recv-flush list.
        3b. Change the QP soft-state to ERROR if it is not in error
            already.
Step-4: If the next CQE is a TERM CQE, extract it and make sure it is
        not reported to the consumer. Then do the following as further
        processing:
        4a. Add this QP to both the send-flush and recv-flush lists if
            it is absent from either of them.
        4b. Change the QP soft-state to ERROR if it is not in error
            already.
Step-5: Continue polling from both the h/w CQ and the flush lists
        until all the queues are empty. The QP is removed from the
        flush lists during destroy-qp.

Further, this patch adds network-to-host format conversion for the
received immediate data. It also takes care of the hardware-specific
requirement to skip reporting h/w flush-error CQEs to the consumer
while still ringing the CQ doorbell for them.

v1->v2
-- Used ccan/list.h instead.

Signed-off-by: Sriharsha Basavapatna <sriharsha.basavapatna@xxxxxxxxxxxx>
Signed-off-by: Somnath Kotur <somnath.kotur@xxxxxxxxxxxx>
Signed-off-by: Selvin Xavier <selvin.xavier@xxxxxxxxxxxx>
Signed-off-by: Devesh Sharma <devesh.sharma@xxxxxxxxxxxx>
---
 providers/bnxt_re/flush.h  |  85 ++++++++++++++++
 providers/bnxt_re/main.c   |   5 +
 providers/bnxt_re/main.h   |   6 ++
 providers/bnxt_re/memory.h |   5 +
 providers/bnxt_re/verbs.c  | 243 ++++++++++++++++++++++++++++++++++++++++-----
 5 files changed, 320 insertions(+), 24 deletions(-)
 create mode 100644 providers/bnxt_re/flush.h

diff --git a/providers/bnxt_re/flush.h b/providers/bnxt_re/flush.h
new file mode 100644
index 0000000..a39ea71
--- /dev/null
+++ b/providers/bnxt_re/flush.h
@@ -0,0 +1,85 @@
+/*
+ * Broadcom NetXtreme-E User Space RoCE driver
+ *
+ * Copyright (c) 2015-2017, Broadcom. All rights reserved. The term
+ * Broadcom refers to Broadcom Limited and/or its subsidiaries.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS''
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Description: A few wrappers for flush queue management + */ + +#ifndef __FLUSH_H__ +#define __FLUSH_H__ + +#include <ccan/list.h> + +struct bnxt_re_fque_node { + uint8_t valid; + struct list_node list; +}; + +static inline void fque_init_node(struct bnxt_re_fque_node *node) +{ + list_node_init(&node->list); + node->valid = false; +} + +static inline void fque_add_node_tail(struct list_head *head, + struct bnxt_re_fque_node *new) +{ + list_add_tail(head, &new->list); + new->valid = true; +} + +static inline void fque_del_node(struct bnxt_re_fque_node *entry) +{ + entry->valid = false; + list_del(&entry->list); +} + +static inline uint8_t _fque_node_valid(struct bnxt_re_fque_node *node) +{ + return node->valid; +} + +static inline void bnxt_re_fque_add_node(struct list_head *head, + struct bnxt_re_fque_node *node) +{ + if (!_fque_node_valid(node)) + fque_add_node_tail(head, node); +} + +static inline void bnxt_re_fque_del_node(struct bnxt_re_fque_node *node) +{ + if (_fque_node_valid(node)) + fque_del_node(node); +} +#endif /* __FLUSH_H__ */ diff --git a/providers/bnxt_re/main.c b/providers/bnxt_re/main.c index 60ad653..b33ea35 100644 --- a/providers/bnxt_re/main.c +++ b/providers/bnxt_re/main.c @@ -128,6 +128,7 @@ static int bnxt_re_init_context(struct verbs_device *vdev, dev->pg_size = resp.pg_size; dev->cqe_size = resp.cqe_size; dev->max_cq_depth = resp.max_cqd; + pthread_spin_init(&cntx->fqlock, PTHREAD_PROCESS_PRIVATE); ibvctx->ops = bnxt_re_cntx_ops; return 0; @@ -136,7 +137,11 @@ static int bnxt_re_init_context(struct verbs_device *vdev, static void bnxt_re_uninit_context(struct verbs_device *vdev, struct ibv_context *ibvctx) { + struct bnxt_re_context *cntx; + + cntx = to_bnxt_re_context(ibvctx); /* Unmap if anything device specific was mapped in init_context. 
*/ + pthread_spin_destroy(&cntx->fqlock); } static struct verbs_device_ops bnxt_re_dev_ops = { diff --git a/providers/bnxt_re/main.h b/providers/bnxt_re/main.h index bfe7089..d324ef6 100644 --- a/providers/bnxt_re/main.h +++ b/providers/bnxt_re/main.h @@ -50,6 +50,7 @@ #include "bnxt_re-abi.h" #include "memory.h" +#include "flush.h" #define DEV "bnxtre : " @@ -69,6 +70,8 @@ struct bnxt_re_cq { uint32_t cqid; struct bnxt_re_queue cqq; struct bnxt_re_dpi *udpi; + struct list_head sfhead; + struct list_head rfhead; uint32_t cqe_size; uint8_t phase; }; @@ -104,6 +107,8 @@ struct bnxt_re_qp { struct bnxt_re_cq *rcq; struct bnxt_re_dpi *udpi; struct bnxt_re_qpcap cap; + struct bnxt_re_fque_node snode; + struct bnxt_re_fque_node rnode; uint32_t qpid; uint32_t tbl_indx; uint32_t sq_psn; @@ -133,6 +138,7 @@ struct bnxt_re_context { uint32_t max_qp; uint32_t max_srq; struct bnxt_re_dpi udpi; + pthread_spinlock_t fqlock; }; /* DB ring functions used internally*/ diff --git a/providers/bnxt_re/memory.h b/providers/bnxt_re/memory.h index f812eb8..debb31a 100644 --- a/providers/bnxt_re/memory.h +++ b/providers/bnxt_re/memory.h @@ -89,6 +89,11 @@ static inline uint32_t bnxt_re_is_que_full(struct bnxt_re_queue *que) return (((que->tail + 1) & (que->depth - 1)) == que->head); } +static inline uint32_t bnxt_re_is_que_empty(struct bnxt_re_queue *que) +{ + return que->tail == que->head; +} + static inline uint32_t bnxt_re_incr(uint32_t val, uint32_t max) { return (++val & (max - 1)); diff --git a/providers/bnxt_re/verbs.c b/providers/bnxt_re/verbs.c index 8230daf..1617af2 100644 --- a/providers/bnxt_re/verbs.c +++ b/providers/bnxt_re/verbs.c @@ -202,6 +202,9 @@ struct ibv_cq *bnxt_re_create_cq(struct ibv_context *ibvctx, int ncqe, cq->cqq.tail = resp.tail; cq->udpi = &cntx->udpi; + list_head_init(&cq->sfhead); + list_head_init(&cq->rfhead); + return &cq->ibvcq; cmdfail: bnxt_re_free_aligned(&cq->cqq); @@ -230,6 +233,47 @@ int bnxt_re_destroy_cq(struct ibv_cq *ibvcq) return 0; } +static uint8_t bnxt_re_poll_err_scqe(struct bnxt_re_qp *qp, + struct ibv_wc *ibvwc, + struct bnxt_re_bcqe *hdr, + struct bnxt_re_req_cqe *scqe, int *cnt) +{ + struct bnxt_re_queue *sq = qp->sqq; + struct bnxt_re_context *cntx; + struct bnxt_re_wrid *swrid; + struct bnxt_re_psns *spsn; + struct bnxt_re_cq *scq; + uint32_t head = sq->head; + uint8_t status; + + scq = to_bnxt_re_cq(qp->ibvqp.send_cq); + cntx = to_bnxt_re_context(scq->ibvcq.context); + swrid = &qp->swrid[head]; + spsn = swrid->psns; + + *cnt = 1; + status = (hdr->flg_st_typ_ph >> BNXT_RE_BCQE_STATUS_SHIFT) & + BNXT_RE_BCQE_STATUS_MASK; + ibvwc->status = bnxt_re_to_ibv_wc_status(status, true); + ibvwc->wc_flags = 0; + ibvwc->wr_id = swrid->wrid; + ibvwc->qp_num = qp->qpid; + ibvwc->opcode = (le32toh(spsn->opc_spsn) >> + BNXT_RE_PSNS_OPCD_SHIFT) & + BNXT_RE_PSNS_OPCD_MASK; + ibvwc->byte_len = 0; + + bnxt_re_incr_head(qp->sqq); + + if (qp->qpst != IBV_QPS_ERR) + qp->qpst = IBV_QPS_ERR; + pthread_spin_lock(&cntx->fqlock); + bnxt_re_fque_add_node(&scq->sfhead, &qp->snode); + pthread_spin_unlock(&cntx->fqlock); + + return false; +} + static uint8_t bnxt_re_poll_success_scqe(struct bnxt_re_qp *qp, struct ibv_wc *ibvwc, struct bnxt_re_bcqe *hdr, @@ -284,21 +328,53 @@ static uint8_t bnxt_re_poll_scqe(struct bnxt_re_qp *qp, struct ibv_wc *ibvwc, status = (hdr->flg_st_typ_ph >> BNXT_RE_BCQE_STATUS_SHIFT) & BNXT_RE_BCQE_STATUS_MASK; - if (status == BNXT_RE_REQ_ST_OK) { + if (status == BNXT_RE_REQ_ST_OK) pcqe = bnxt_re_poll_success_scqe(qp, ibvwc, hdr, scqe, cnt); - } else { - /* 
TODO: Handle error completion properly. */ - fprintf(stderr, "%s(): swc with error, vendor status = %d\n", - __func__, status); - *cnt = 1; - ibvwc->status = bnxt_re_to_ibv_wc_status(status, true); - ibvwc->wr_id = qp->swrid[qp->sqq->head].wrid; - bnxt_re_incr_head(qp->sqq); - } + else + pcqe = bnxt_re_poll_err_scqe(qp, ibvwc, hdr, scqe, cnt); return pcqe; } +static int bnxt_re_poll_err_rcqe(struct bnxt_re_qp *qp, + struct ibv_wc *ibvwc, + struct bnxt_re_bcqe *hdr, + struct bnxt_re_rc_cqe *rcqe) +{ + struct bnxt_re_queue *rq = qp->rqq; + struct bnxt_re_wrid *rwrid; + struct bnxt_re_cq *rcq; + struct bnxt_re_context *cntx; + uint32_t head = rq->head; + uint8_t status; + + rcq = to_bnxt_re_cq(qp->ibvqp.recv_cq); + cntx = to_bnxt_re_context(rcq->ibvcq.context); + + rwrid = &qp->rwrid[head]; + status = (hdr->flg_st_typ_ph >> BNXT_RE_BCQE_STATUS_SHIFT) & + BNXT_RE_BCQE_STATUS_MASK; + /* skip h/w flush errors */ + if (status == BNXT_RE_RSP_ST_HW_FLUSH) + return 0; + ibvwc->status = bnxt_re_to_ibv_wc_status(status, false); + /* TODO: Add SRQ Processing here */ + if (qp->rqq) { + ibvwc->wr_id = rwrid->wrid; + ibvwc->qp_num = qp->qpid; + ibvwc->opcode = IBV_WC_RECV; + ibvwc->byte_len = 0; + bnxt_re_incr_head(qp->rqq); + if (qp->qpst != IBV_QPS_ERR) + qp->qpst = IBV_QPS_ERR; + pthread_spin_lock(&cntx->fqlock); + bnxt_re_fque_add_node(&rcq->rfhead, &qp->rnode); + pthread_spin_unlock(&cntx->fqlock); + } + + return 1; +} + static void bnxt_re_poll_success_rcqe(struct bnxt_re_qp *qp, struct ibv_wc *ibvwc, struct bnxt_re_bcqe *hdr, @@ -346,18 +422,37 @@ static uint8_t bnxt_re_poll_rcqe(struct bnxt_re_qp *qp, struct ibv_wc *ibvwc, status = (hdr->flg_st_typ_ph >> BNXT_RE_BCQE_STATUS_SHIFT) & BNXT_RE_BCQE_STATUS_MASK; - if (status == BNXT_RE_RSP_ST_OK) { + *cnt = 1; + if (status == BNXT_RE_RSP_ST_OK) bnxt_re_poll_success_rcqe(qp, ibvwc, hdr, rcqe); - *cnt = 1; - } else { - /* TODO: Process error completions properly.*/ - *cnt = 1; - ibvwc->status = bnxt_re_to_ibv_wc_status(status, false); - if (qp->rqq) { - ibvwc->wr_id = qp->rwrid[qp->rqq->head].wrid; - bnxt_re_incr_head(qp->rqq); - } - } + else + *cnt = bnxt_re_poll_err_rcqe(qp, ibvwc, hdr, rcqe); + + return pcqe; +} + +static uint8_t bnxt_re_poll_term_cqe(struct bnxt_re_qp *qp, + struct ibv_wc *ibvwc, void *cqe, int *cnt) +{ + struct bnxt_re_context *cntx; + struct bnxt_re_cq *scq, *rcq; + uint8_t pcqe = false; + + scq = to_bnxt_re_cq(qp->ibvqp.send_cq); + rcq = to_bnxt_re_cq(qp->ibvqp.recv_cq); + cntx = to_bnxt_re_context(scq->ibvcq.context); + /* For now just add the QP to flush list without + * considering the index reported in the CQE. + * Continue reporting flush completions until the + * SQ and RQ are empty. 
+ */ + *cnt = 0; + if (qp->qpst != IBV_QPS_ERR) + qp->qpst = IBV_QPS_ERR; + pthread_spin_lock(&cntx->fqlock); + bnxt_re_fque_add_node(&rcq->rfhead, &qp->rnode); + bnxt_re_fque_add_node(&scq->sfhead, &qp->snode); + pthread_spin_unlock(&cntx->fqlock); return pcqe; } @@ -410,6 +505,12 @@ static int bnxt_re_poll_one(struct bnxt_re_cq *cq, int nwc, struct ibv_wc *wc) case BNXT_RE_WC_TYPE_RECV_RAW: break; case BNXT_RE_WC_TYPE_TERM: + scqe = cqe; + qp_handle = (uint64_t *)&scqe->qp_handle; + qp = (struct bnxt_re_qp *)(uintptr_t)scqe->qp_handle; + if (!qp) + break; + pcqe = bnxt_re_poll_term_cqe(qp, wc, cqe, &cnt); break; case BNXT_RE_WC_TYPE_COFF: break; @@ -442,22 +543,107 @@ bail: return dqed; } +static int bnxt_re_poll_flush_wcs(struct bnxt_re_queue *que, + struct bnxt_re_wrid *wridp, + struct ibv_wc *ibvwc, uint32_t qpid, + int nwc) +{ + struct bnxt_re_wrid *wrid; + struct bnxt_re_psns *psns; + uint32_t cnt = 0, head; + uint8_t opcode = IBV_WC_RECV; + + while (nwc) { + if (bnxt_re_is_que_empty(que)) + break; + head = que->head; + wrid = &wridp[head]; + if (wrid->psns) { + psns = wrid->psns; + opcode = (psns->opc_spsn >> BNXT_RE_PSNS_OPCD_SHIFT) & + BNXT_RE_PSNS_OPCD_MASK; + } + + ibvwc->status = IBV_WC_WR_FLUSH_ERR; + ibvwc->opcode = opcode; + ibvwc->wr_id = wrid->wrid; + ibvwc->qp_num = qpid; + ibvwc->byte_len = 0; + ibvwc->wc_flags = 0; + + bnxt_re_incr_head(que); + nwc--; + cnt++; + ibvwc++; + } + + return cnt; +} + +static int bnxt_re_poll_flush_lists(struct bnxt_re_cq *cq, uint32_t nwc, + struct ibv_wc *ibvwc) +{ + struct bnxt_re_fque_node *cur, *tmp; + struct bnxt_re_qp *qp; + struct bnxt_re_queue *que; + int dqed = 0, left; + + /* Check if flush Qs are empty */ + if (list_empty(&cq->sfhead) && list_empty(&cq->rfhead)) + return 0; + + if (!list_empty(&cq->sfhead)) { + list_for_each_safe(&cq->sfhead, cur, tmp, list) { + qp = container_of(cur, struct bnxt_re_qp, snode); + que = qp->sqq; + if (bnxt_re_is_que_empty(que)) + continue; + dqed = bnxt_re_poll_flush_wcs(que, qp->swrid, ibvwc, + qp->qpid, nwc); + } + } + + left = nwc - dqed; + if (!left) + return dqed; + + if (!list_empty(&cq->rfhead)) { + list_for_each_safe(&cq->rfhead, cur, tmp, list) { + qp = container_of(cur, struct bnxt_re_qp, rnode); + que = qp->rqq; + if (!que || bnxt_re_is_que_empty(que)) + continue; + dqed += bnxt_re_poll_flush_wcs(que, qp->rwrid, + ibvwc + dqed, qp->qpid, + left); + } + } + + return dqed; +} + int bnxt_re_poll_cq(struct ibv_cq *ibvcq, int nwc, struct ibv_wc *wc) { struct bnxt_re_cq *cq = to_bnxt_re_cq(ibvcq); - int dqed; + struct bnxt_re_context *cntx = to_bnxt_re_context(ibvcq->context); + int dqed, left = 0; pthread_spin_lock(&cq->cqq.qlock); dqed = bnxt_re_poll_one(cq, nwc, wc); pthread_spin_unlock(&cq->cqq.qlock); - - /* TODO: Flush Management*/ + /* Check if anything is there to flush. 
*/
+	pthread_spin_lock(&cntx->fqlock);
+	left = nwc - dqed;
+	if (left)
+		dqed += bnxt_re_poll_flush_lists(cq, left, (wc + dqed));
+	pthread_spin_unlock(&cntx->fqlock);

 	return dqed;
 }

 static void bnxt_re_cleanup_cq(struct bnxt_re_qp *qp, struct bnxt_re_cq *cq)
 {
+	struct bnxt_re_context *cntx;
 	struct bnxt_re_queue *que = &cq->cqq;
 	struct bnxt_re_bcqe *hdr;
 	struct bnxt_re_req_cqe *scqe;
@@ -465,6 +651,8 @@ static void bnxt_re_cleanup_cq(struct bnxt_re_qp *qp, struct bnxt_re_cq *cq)
 	void *cqe;
 	int indx, type;

+	cntx = to_bnxt_re_context(cq->ibvcq.context);
+
 	pthread_spin_lock(&que->qlock);
 	for (indx = 0; indx < que->depth; indx++) {
 		cqe = que->va + indx * bnxt_re_get_cqe_sz();
@@ -487,6 +675,11 @@ static void bnxt_re_cleanup_cq(struct bnxt_re_qp *qp, struct bnxt_re_cq *cq)
 	}
 	pthread_spin_unlock(&que->qlock);
+
+	pthread_spin_lock(&cntx->fqlock);
+	bnxt_re_fque_del_node(&qp->snode);
+	bnxt_re_fque_del_node(&qp->rnode);
+	pthread_spin_unlock(&cntx->fqlock);
 }

 void bnxt_re_cq_event(struct ibv_cq *ibvcq)
@@ -679,6 +872,8 @@ struct ibv_qp *bnxt_re_create_qp(struct ibv_pd *ibvpd,
 	cap->max_rsge = attr->cap.max_recv_sge;
 	cap->max_inline = attr->cap.max_inline_data;
 	cap->sqsig = attr->sq_sig_all;
+	fque_init_node(&qp->snode);
+	fque_init_node(&qp->rnode);

 	return &qp->ibvqp;
 failcmd:
-- 
1.8.3.1
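
For context, here is a minimal consumer-side sketch (not part of this
patch) of what the algorithm above delivers through the standard verbs
API: once a QP hits an error, an application draining the CQ should see
the first failed WQE reported with its actual error status, followed by
IBV_WC_WR_FLUSH_ERR completions for every remaining outstanding WQE on
the SQ and RQ. The helper name drain_cq and the batch size are
hypothetical; only documented libibverbs calls are used.

#include <stdio.h>
#include <inttypes.h>
#include <infiniband/verbs.h>

/*
 * Busy-poll 'cq' until 'expected' work completions have been reaped.
 * With flush reporting in place, a QP error surfaces as one completion
 * carrying the real error status, then flush completions for the rest.
 */
static int drain_cq(struct ibv_cq *cq, int expected)
{
	struct ibv_wc wc[16];
	int reaped = 0;

	while (reaped < expected) {
		int i, n = ibv_poll_cq(cq, 16, wc);

		if (n < 0)
			return n;	/* provider reported a poll failure */
		for (i = 0; i < n; i++, reaped++) {
			if (wc[i].status == IBV_WC_SUCCESS)
				continue;
			/* first error is the real status; the rest are
			 * IBV_WC_WR_FLUSH_ERR until the queues drain */
			fprintf(stderr, "wr_id %" PRIu64 ": %s\n",
				wc[i].wr_id,
				ibv_wc_status_str(wc[i].status));
		}
	}
	return reaped;
}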