From: Kaike Wan <kaike.wan@xxxxxxxxx> This patch adds the capability to integrate TID RDMA WRITE requests into the normal RDMA verbs framework. The TID RDMA WRITE protocol is an end-to-end protocol between the hfi1 drivers on two OPA nodes that converts a qualified RDMA WRITE request into a TID RDMA WRITE request to avoid data copying on the responder side. Reviewed-by: Mike Marciniszyn <mike.marciniszyn@xxxxxxxxx> Signed-off-by: Mitko Haralanov <mitko.haralanov@xxxxxxxxx> Signed-off-by: Kaike Wan <kaike.wan@xxxxxxxxx> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@xxxxxxxxx> --- drivers/infiniband/hw/hfi1/qp.c | 10 + drivers/infiniband/hw/hfi1/qp.h | 4 drivers/infiniband/hw/hfi1/rc.c | 504 ++++++++++++++++++++++++++++--- drivers/infiniband/hw/hfi1/ruc.c | 4 drivers/infiniband/hw/hfi1/sdma_txreq.h | 3 drivers/infiniband/hw/hfi1/tid_rdma.c | 45 +++ drivers/infiniband/hw/hfi1/tid_rdma.h | 19 + drivers/infiniband/hw/hfi1/user_sdma.c | 4 drivers/infiniband/hw/hfi1/verbs.c | 93 ++++-- drivers/infiniband/hw/hfi1/verbs.h | 28 ++ drivers/infiniband/sw/rdmavt/qp.c | 2 include/rdma/rdmavt_qp.h | 1 12 files changed, 631 insertions(+), 86 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 0be3def..c762a98 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -140,6 +140,12 @@ static int iowait_sleep( .flags = RVT_OPERATION_USE_RESERVE, }, +[IB_WR_TID_RDMA_WRITE] = { + .length = sizeof(struct ib_rdma_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_IGN_RNR_CNT, +}, + }; static void flush_list_head(struct list_head *l) @@ -748,6 +754,7 @@ void flush_qp_waiters(struct rvt_qp *qp) { lockdep_assert_held(&qp->s_lock); flush_iowait(qp); + tid_rdma_flush_wait(qp); } void stop_send_queue(struct rvt_qp *qp) @@ -763,6 +770,8 @@ void quiesce_qp(struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; + hfi1_del_tid_reap_timer(qp); + hfi1_del_tid_retry_timer(qp); iowait_sdma_drain(&priv->s_iowait); qp_pio_drain(qp); flush_tx_list(qp); @@ -770,6 +779,7 @@ void quiesce_qp(struct rvt_qp *qp) void notify_qp_reset(struct rvt_qp *qp) { + hfi1_qp_kern_exp_rcv_clear_all(qp); qp->r_adefered = 0; clear_ahg(qp); diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h index 3cb1041..b670321 100644 --- a/drivers/infiniband/hw/hfi1/qp.h +++ b/drivers/infiniband/hw/hfi1/qp.h @@ -64,12 +64,16 @@ * HFI1_S_AHG_CLEAR - have send engine clear ahg state * HFI1_S_WAIT_PIO_DRAIN - qp waiting for PIOs to drain * HFI1_S_WAIT_TID_SPACE - a QP is waiting for TID resource + * HFI1_S_WAIT_TID_RESP - waiting for a TID RDMA WRITE response + * HFI1_S_WAIT_HALT - halt the first leg send engine * HFI1_S_MIN_BIT_MASK - the lowest bit that can be used by hfi1 */ #define HFI1_S_AHG_VALID 0x80000000 #define HFI1_S_AHG_CLEAR 0x40000000 #define HFI1_S_WAIT_PIO_DRAIN 0x20000000 #define HFI1_S_WAIT_TID_SPACE 0x10000000 +#define HFI1_S_WAIT_TID_RESP 0x08000000 +#define HFI1_S_WAIT_HALT 0x04000000 #define HFI1_S_MIN_BIT_MASK 0x01000000 /* diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 3bbf0f4..7bc635b 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -111,13 +111,14 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, struct hfi1_pkt_state *ps) { struct rvt_ack_entry *e; - u32 hwords; + u32 hwords, hdrlen; u32 len = 0; u32 bth0 = 0, bth2 = 0; u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT); int middle = 0; u32 pmtu = qp->pmtu; - struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_rdma_request *req; u8 next = qp->s_tail_ack_queue; bool last_pkt; u32 delta; @@ -127,7 +128,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) goto bail; - if (priv->hdr_type == HFI1_PKT_TYPE_9B) + if (qpriv->hdr_type == HFI1_PKT_TYPE_9B) /* header size in 32-bit words LRH+BTH = (8+12)/4. */ hwords = 5; else @@ -205,6 +206,21 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, hwords++; qp->s_ack_rdma_psn = e->psn; bth2 = mask_psn(qp->s_ack_rdma_psn++); + } else if (e->opcode == TID_OP(WRITE_REQ)) { + /* + * If a TID RDMA WRITE RESP is being resent, we have to + * wait for the actual request. All requests that are to + * be resent will have their state set to + * TID_REQUEST_RESEND. When the new request arrives, the + * state will be changed to TID_REQUEST_RESEND_ACTIVE. + */ + req = ack_to_tid_req(e); + if (req->state == TID_REQUEST_RESEND || + req->state == TID_REQUEST_INIT_RESEND) + goto bail; + qp->s_ack_state = TID_OP(WRITE_RESP); + qp->s_ack_rdma_psn = mask_psn(e->psn + req->cur_seg); + goto write_resp; } else if (e->opcode == TID_OP(READ_REQ)) { /* * If a TID RDMA read response is being resent and @@ -240,7 +256,8 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, bth2 = mask_psn(e->psn); e->sent = 1; } - bth0 = qp->s_ack_state << 24; + if (!bth0) + bth0 = qp->s_ack_state << 24; break; case OP(RDMA_READ_RESPONSE_FIRST): @@ -266,6 +283,59 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, bth2 = mask_psn(qp->s_ack_rdma_psn++); break; + case TID_OP(WRITE_RESP): +write_resp: + /* + * 1. Check if RVT_S_ACK_PENDING is set. If yes, + * goto normal. + * 2. Attempt to allocate TID resources. + * 3. Remove RVT_S_RESP_PENDING flags from s_flags + * 4. If resources not available: + * 4.1 Set RVT_S_WAIT_TID_SPACE + * 4.2 Queue QP on RCD TID queue + * 4.3 Put QP on iowait list. + * 4.4 Build IB RNR NAK with appropriate timeout value + * 4.5 Return indication progress made. + * 5. If resources are available: + * 5.1 Program HW flow CSRs + * 5.2 Build TID RDMA WRITE RESP packet + * 5.3 If more resources needed, do 2.1 - 2.3. + * 5.4 Wake up next QP on RCD TID queue. + * 5.5 Return indication progress made. + */ + + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + req = ack_to_tid_req(e); + + /* + * Send scheduled RNR NAK's. RNR NAK's need to be sent at + * segment boundaries, not at request boundaries. Don't change + * s_ack_state because we are still in the middle of a request + */ + if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND && + qp->s_tail_ack_queue == qpriv->r_tid_alloc && + req->cur_seg == req->alloc_seg) { + qpriv->rnr_nak_state = TID_RNR_NAK_SENT; + goto normal_no_state; + } + + bth2 = mask_psn(qp->s_ack_rdma_psn); + hdrlen = hfi1_build_tid_rdma_write_resp(qp, e, ohdr, &bth1, + bth2, &len, + &ps->s_txreq->ss); + if (!hdrlen) + return 0; + + hwords += hdrlen; + bth0 = qp->s_ack_state << 24; + qp->s_ack_rdma_psn++; + if (req->cur_seg != req->total_segs) + break; + + e->sent = 1; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); + break; + case TID_OP(READ_RESP): read_resp: e = &qp->s_ack_queue[qp->s_tail_ack_queue]; @@ -290,6 +360,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, default: normal: + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; /* * Send a regular ACK. * Set the s_ack_state so we wait until after sending @@ -297,22 +368,24 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, * (see above). */ qp->s_ack_state = OP(SEND_ONLY); - qp->s_flags &= ~RVT_S_ACK_PENDING; - ps->s_txreq->ss = NULL; +normal_no_state: if (qp->s_nak_state) ohdr->u.aeth = - cpu_to_be32((qp->r_msn & IB_MSN_MASK) | + cpu_to_be32((qp->r_msn & + IB_MSN_MASK) | (qp->s_nak_state << IB_AETH_CREDIT_SHIFT)); else ohdr->u.aeth = rvt_compute_aeth(qp); hwords++; - len = 0; bth0 = OP(ACKNOWLEDGE) << 24; bth2 = mask_psn(qp->s_ack_psn); + len = 0; + qp->s_flags &= ~RVT_S_ACK_PENDING; + ps->s_txreq->ss = NULL; } qp->s_rdma_ack_cnt++; - ps->s_txreq->sde = priv->s_sde; + ps->s_txreq->sde = qpriv->s_sde; ps->s_txreq->s_cur_size = len; ps->s_txreq->hdr_dwords = hwords; hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps); @@ -355,6 +428,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) struct rvt_swqe *wqe; struct hfi1_swqe_priv *wpriv; struct tid_rdma_request *req = NULL; + struct tid_rdma_params *remote; /* header size in 32-bit words LRH+BTH = (8+12)/4. */ u32 hwords = 5; u32 len = 0; @@ -412,7 +486,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) goto done_free_tx; } - if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK)) + if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK | HFI1_S_WAIT_HALT)) goto bail; if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) { @@ -575,6 +649,108 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) qp->s_cur = 0; break; + case IB_WR_TID_RDMA_WRITE: + if (newreq) { + /* + * Limit the number of TID RDMA WRITE requests. + */ + if (atomic_read(&priv->n_tid_requests) >= + HFI1_TID_RDMA_WRITE_CNT) + goto bail; + + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + } + + hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, + &bth1, &bth2, + &len); + ss = NULL; + if (priv->s_tid_cur == HFI1_QP_WQE_INVALID) { + priv->s_tid_cur = qp->s_cur; + if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) { + priv->s_tid_tail = qp->s_cur; + priv->s_state = TID_OP(WRITE_RESP); + } + } else if (priv->s_tid_cur == priv->s_tid_head) { + struct rvt_swqe *__w; + struct tid_rdma_request *__r; + + __w = rvt_get_swqe_ptr(qp, priv->s_tid_cur); + __r = wqe_to_tid_req(__w); + + /* + * The s_tid_cur pointer is advanced to s_cur if + * any of the following conditions about the WQE + * to which s_ti_cur currently points to are + * satisfied: + * 1. The request is not a TID RDMA WRITE + * request, + * 2. The request is in the INACTIVE or + * COMPLETE states (TID RDMA READ requests + * stay at INACTIVE and TID RDMA WRITE + * transition to COMPLETE when done), + * 3. The request is in the ACTIVE or SYNC + * state and the number of completed + * segments is equal to the total segment + * count. + * (If ACTIVE, the request is waiting for + * ACKs. If SYNC, the request has not + * received any responses because it's + * waiting on a sync point.) + */ + if (__w->wr.opcode != IB_WR_TID_RDMA_WRITE || + __r->state == TID_REQUEST_INACTIVE || + __r->state == TID_REQUEST_COMPLETE || + ((__r->state == TID_REQUEST_ACTIVE || + __r->state == TID_REQUEST_SYNC) && + __r->comp_seg == __r->total_segs)) { + if (priv->s_tid_tail == + priv->s_tid_cur && + priv->s_state == + TID_OP(WRITE_DATA_LAST)) { + priv->s_tid_tail = qp->s_cur; + priv->s_state = + TID_OP(WRITE_RESP); + } + priv->s_tid_cur = qp->s_cur; + } + /* + * A corner case: when the last TID RDMA WRITE + * request was completed, s_tid_head, + * s_tid_cur, and s_tid_tail all point to the + * same location. Other requests are posted and + * s_cur wraps around to the same location, + * where a new TID RDMA WRITE is posted. In + * this case, none of the indices need to be + * updated. However, the priv->s_state should. + */ + if (priv->s_tid_tail == qp->s_cur && + priv->s_state == TID_OP(WRITE_DATA_LAST)) + priv->s_state = TID_OP(WRITE_RESP); + } + req = wqe_to_tid_req(wqe); + if (newreq) { + priv->s_tid_head = qp->s_cur; + priv->pending_tid_w_resp += req->total_segs; + atomic_inc(&priv->n_tid_requests); + atomic_dec(&priv->n_requests); + } else { + req->state = TID_REQUEST_RESEND; + req->comp_seg = delta_psn(bth2, wqe->psn); + /* + * Pull back any segments since we are going + * to re-receive them. + */ + req->setup_head = req->clear_tail; + priv->pending_tid_w_resp += + delta_psn(wqe->lpsn, bth2) + 1; + } + + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + case IB_WR_RDMA_READ: /* * Don't allow more operations to be started @@ -729,7 +905,8 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) if (qp->s_tail >= qp->s_size) qp->s_tail = 0; } - if (wqe->wr.opcode == IB_WR_RDMA_READ) + if (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) qp->s_psn = wqe->lpsn + 1; else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) qp->s_psn = req->s_next_psn; @@ -850,6 +1027,32 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) qp->s_cur = 0; break; + case TID_OP(WRITE_RESP): + /* + * This value for s_state is used for restarting a TID RDMA + * WRITE request. See comment in OP(RDMA_READ_RESPONSE_MIDDLE + * for more). + */ + req = wqe_to_tid_req(wqe); + req->state = TID_REQUEST_RESEND; + rcu_read_lock(); + remote = rcu_dereference(priv->tid_rdma.remote); + req->comp_seg = delta_psn(qp->s_psn, wqe->psn); + len = wqe->length - (req->comp_seg * remote->max_len); + rcu_read_unlock(); + + bth2 = mask_psn(qp->s_psn); + hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, &bth1, + &bth2, &len); + qp->s_psn = wqe->lpsn + 1; + ss = NULL; + qp->s_state = TID_OP(WRITE_REQ); + priv->pending_tid_w_resp += delta_psn(wqe->lpsn, bth2) + 1; + priv->s_tid_cur = qp->s_cur; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + case TID_OP(READ_RESP): if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) goto bail; @@ -946,7 +1149,8 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) } qp->s_sending_hpsn = bth2; delta = delta_psn(bth2, wqe->psn); - if (delta && delta % HFI1_PSN_CREDIT == 0) + if (delta && delta % HFI1_PSN_CREDIT == 0 && + wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) bth2 |= IB_BTH_REQ_ACK; if (qp->s_flags & RVT_S_SEND_ONE) { qp->s_flags &= ~RVT_S_SEND_ONE; @@ -979,6 +1183,14 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) bail_no_tx: ps->s_txreq = NULL; qp->s_flags &= ~RVT_S_BUSY; + /* + * If we didn't get a txreq, the QP will be woken up later to try + * again, set the flags to the the wake up which work item to wake + * up. + * (A better algorithm should be found to do this and generalize the + * sleep/wakeup flags.) + */ + iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB); return 0; } @@ -1266,6 +1478,7 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) lockdep_assert_held(&qp->s_lock); qp->s_cur = n; priv->pending_tid_r_segs = 0; + priv->pending_tid_w_resp = 0; qp->s_num_rd_atomic = 0; /* @@ -1323,6 +1536,10 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) qp->s_state = OP(RDMA_READ_RESPONSE_LAST); break; + case IB_WR_TID_RDMA_WRITE: + qp->s_state = TID_OP(WRITE_RESP); + break; + case IB_WR_RDMA_READ: qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE); break; @@ -1413,7 +1630,7 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR | RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN | - RVT_S_WAIT_ACK); + RVT_S_WAIT_ACK | HFI1_S_WAIT_TID_RESP); if (wait) qp->s_flags |= RVT_S_SEND_ONE; reset_psn(qp, psn); @@ -1421,9 +1638,10 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) /* * Set qp->s_sending_psn to the next PSN after the given one. - * This would be psn+1 except when RDMA reads are present. + * This would be psn+1 except when RDMA reads or TID RDMA ops + * are present. */ -static void reset_sending_psn(struct rvt_qp *qp, u32 psn) +static void reset_sending_psn(struct rvt_qp *qp, u32 psn, u32 opcode) { struct rvt_swqe *wqe; u32 n = qp->s_last; @@ -1434,7 +1652,8 @@ static void reset_sending_psn(struct rvt_qp *qp, u32 psn) wqe = rvt_get_swqe_ptr(qp, n); if (cmp_psn(psn, wqe->lpsn) <= 0) { if (wqe->wr.opcode == IB_WR_RDMA_READ || - wqe->wr.opcode == IB_WR_TID_RDMA_READ) + wqe->wr.opcode == IB_WR_TID_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) qp->s_sending_psn = wqe->lpsn + 1; else qp->s_sending_psn = psn + 1; @@ -1457,8 +1676,9 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) struct rvt_swqe *wqe; struct ib_header *hdr = NULL; struct hfi1_16b_header *hdr_16b = NULL; - u32 opcode; + u32 opcode, head, tail; u32 psn; + struct tid_rdma_request *req; lockdep_assert_held(&qp->s_lock); if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK)) @@ -1485,6 +1705,7 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) opcode = ib_bth_get_opcode(ohdr); if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) && opcode <= OP(ATOMIC_ACKNOWLEDGE)) || + opcode == TID_OP(WRITE_RESP) || opcode == TID_OP(READ_RESP)) { WARN_ON(!qp->s_rdma_ack_cnt); qp->s_rdma_ack_cnt--; @@ -1492,22 +1713,72 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) } psn = ib_bth_get_psn(ohdr); - reset_sending_psn(qp, psn); + /* + * Don't attempt to reset the sending PSN for packets in the + * KDETH PSN space since the PSN does not match anything. + */ + if (opcode != TID_OP(WRITE_DATA) && + opcode != TID_OP(WRITE_DATA_LAST) && + opcode != TID_OP(ACK) && opcode != TID_OP(RESYNC)) + reset_sending_psn(qp, psn, opcode); + + /* Handle TID RDMA WRITE packets differently */ + if (opcode >= TID_OP(WRITE_REQ) && + opcode <= TID_OP(WRITE_DATA_LAST)) { + head = priv->s_tid_head; + tail = priv->s_tid_cur; + /* + * s_tid_cur is set to s_tid_head in the case, where + * a new TID RDMA request is being started and all + * previous ones have been completed. + * Therefore, we need to do a secondary check in order + * to properly determine whether we should start the + * RC timer. + */ + wqe = rvt_get_swqe_ptr(qp, tail); + req = wqe_to_tid_req(wqe); + if (head == tail && req->comp_seg < req->total_segs) + tail -= 1; + } else { + head = qp->s_tail; + tail = qp->s_acked; + } /* * Start timer after a packet requesting an ACK has been sent and * there are still requests that haven't been acked. */ - if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail && + if ((psn & IB_BTH_REQ_ACK) && tail != head && + opcode != TID_OP(WRITE_DATA) && opcode != TID_OP(WRITE_DATA_LAST) && + opcode != TID_OP(RESYNC) && !(qp->s_flags & - (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) && - (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) && + (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { if (opcode == TID_OP(READ_REQ)) rvt_add_retry_timer_ext(qp, priv->timeout_shift); else rvt_add_retry_timer(qp); } + /* Start TID RDMA ACK timer */ + if ((opcode == TID_OP(WRITE_DATA) || + opcode == TID_OP(WRITE_DATA_LAST) || + opcode == TID_OP(RESYNC)) && + (psn & IB_BTH_REQ_ACK) && + !(priv->s_flags & HFI1_S_TID_RETRY_TIMER) && + (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + /* + * The TID RDMA ACK packet could be received before this + * function is called. Therefore, add the timer only if TID + * RDMA ACK packets are actually pending. + */ + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + req = wqe_to_tid_req(wqe); + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && + req->ack_seg < req->cur_seg) + hfi1_add_tid_retry_timer(qp); + } + while (qp->s_last != qp->s_acked) { u32 s_last; @@ -1515,6 +1786,7 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 && cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) break; + rvt_qp_wqe_unreserve(qp, wqe); s_last = qp->s_last; trace_hfi1_qp_send_completion(qp, wqe, s_last); if (++s_last >= qp->s_size) @@ -1567,6 +1839,7 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, u32 s_last; rvt_put_swqe(wqe); + rvt_qp_wqe_unreserve(qp, wqe); s_last = qp->s_last; trace_hfi1_qp_send_completion(qp, wqe, s_last); if (++s_last >= qp->s_size) @@ -1599,7 +1872,16 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, } qp->s_retry = qp->s_retry_cnt; - update_last_psn(qp, wqe->lpsn); + /* + * Don't update the last PSN if the request being completed is + * a TID RDMA WRITE request. + * Completion of the TID RDMA WRITE requests are done by the + * TID RDMA ACKs and as such could be for a request that has + * already been ACKed as far as the IB state machine is + * concerned. + */ + if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) + update_last_psn(qp, wqe->lpsn); /* * If we are completing a request which is in the process of @@ -1625,6 +1907,54 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, return wqe; } +static void set_restart_qp(struct rvt_qp *qp, struct hfi1_ctxtdata *rcd) +{ + /* Retry this request. */ + if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) { + qp->r_flags |= RVT_R_RDMAR_SEQ; + hfi1_restart_rc(qp, qp->s_last_psn + 1, 0); + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_SEND; + rvt_get_qp(qp); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } + } +} + +/** + * update_qp_retry_state - Update qp retry state. + * @qp: the QP + * @psn: the packet sequence number of the TID RDMA WRITE RESP. + * @spsn: The start psn for the given TID RDMA WRITE swqe. + * @lpsn: The last psn for the given TID RDMA WRITE swqe. + * + * This function is called to update the qp retry state upon + * receiving a TID WRITE RESP after the qp is scheduled to retry + * a request. + */ +static void update_qp_retry_state(struct rvt_qp *qp, u32 psn, u32 spsn, + u32 lpsn) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + + qp->s_psn = psn + 1; + /* + * If this is the first TID RDMA WRITE RESP packet for the current + * request, change the s_state so that the retry will be processed + * correctly. Similarly, if this is the last TID RDMA WRITE RESP + * packet, change the s_state and advance the s_cur. + */ + if (cmp_psn(psn, lpsn) >= 0) { + qp->s_cur = qpriv->s_tid_cur + 1; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + qp->s_state = TID_OP(WRITE_REQ); + } else if (!cmp_psn(psn, spsn)) { + qp->s_cur = qpriv->s_tid_cur; + qp->s_state = TID_OP(WRITE_RESP); + } +} + /** * do_rc_ack - process an incoming RC ACK * @qp: the QP the ACK came in on @@ -1636,8 +1966,8 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, * May be called at interrupt level, with the QP s_lock held. * Returns 1 if OK, 0 if current operation should be aborted (NAK). */ -int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, - u64 val, struct hfi1_ctxtdata *rcd) +int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, u64 val, + struct hfi1_ctxtdata *rcd) { struct hfi1_ibport *ibp; enum ib_wc_status status; @@ -1646,6 +1976,7 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, int ret = 0; u32 ack_psn; int diff; + struct rvt_dev_info *rdi; lockdep_assert_held(&qp->s_lock); /* @@ -1692,18 +2023,10 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, (opcode != TID_OP(READ_RESP) || diff != 0)) || ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) && - (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) { - /* Retry this request. */ - if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) { - qp->r_flags |= RVT_R_RDMAR_SEQ; - hfi1_restart_rc(qp, qp->s_last_psn + 1, 0); - if (list_empty(&qp->rspwait)) { - qp->r_flags |= RVT_R_RSP_SEND; - rvt_get_qp(qp); - list_add_tail(&qp->rspwait, - &rcd->qp_wait_list); - } - } + (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0)) || + (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && + (delta_psn(psn, qp->s_last_psn) != 1))) { + set_restart_qp(qp, rcd); /* * No need to process the ACK/NAK since we are * restarting an earlier request. @@ -1735,6 +2058,14 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, hfi1_schedule_send(qp); } } + + /* + * TID RDMA WRITE requests will be completed by the TID RDMA + * ACK packet handler (see tid_rdma.c). + */ + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) + break; + wqe = do_rc_completion(qp, wqe, ibp); if (qp->s_acked == qp->s_tail) break; @@ -1750,17 +2081,60 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, else rvt_stop_rc_timers(qp); } else if (qp->s_acked != qp->s_tail) { + struct rvt_swqe *__w = NULL; + + if (qpriv->s_tid_cur != HFI1_QP_WQE_INVALID) + __w = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur); + /* - * We are expecting more ACKs so - * mod the retry timer. - */ - rvt_mod_retry_timer(qp); - /* - * We can stop re-sending the earlier packets and - * continue with the next packet the receiver wants. + * Stop timers if we've received all of the TID RDMA + * WRITE * responses. */ - if (cmp_psn(qp->s_psn, psn) <= 0) - reset_psn(qp, psn + 1); + if (__w && __w->wr.opcode == IB_WR_TID_RDMA_WRITE && + opcode == TID_OP(WRITE_RESP)) { + /* + * Normally, the loop above would correctly + * process all WQEs from s_acked onward and + * either complete them or check for correct + * PSN sequencing. + * However, for TID RDMA, due to pipelining, + * the response may not be for the request at + * s_acked so the above look would just be + * skipped. This does not allow for checking + * the PSN sequencing. It has to be done + * separately. + */ + if (cmp_psn(psn, qp->s_last_psn + 1)) { + set_restart_qp(qp, rcd); + goto bail_stop; + } + /* + * If the psn is being resent, stop the + * resending. + */ + if (qp->s_cur != qp->s_tail && + cmp_psn(qp->s_psn, psn) <= 0) + update_qp_retry_state(qp, psn, + __w->psn, + __w->lpsn); + else if (--qpriv->pending_tid_w_resp) + rvt_mod_retry_timer(qp); + else + rvt_stop_rc_timers(qp); + } else { + /* + * We are expecting more ACKs so + * mod the retry timer. + */ + rvt_mod_retry_timer(qp); + /* + * We can stop re-sending the earlier packets + * and continue with the next packet the + * receiver wants. + */ + if (cmp_psn(qp->s_psn, psn) <= 0) + reset_psn(qp, psn + 1); + } } else { /* No more acks - kill all timers */ rvt_stop_rc_timers(qp); @@ -1773,9 +2147,20 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, qp->s_flags &= ~RVT_S_WAIT_ACK; hfi1_schedule_send(qp); } + rvt_get_credit(qp, aeth); qp->s_rnr_retry = qp->s_rnr_retry_cnt; qp->s_retry = qp->s_retry_cnt; + /* + * If the current request is a TID RDMA WRITE request and the + * response is not a TID RDMA WRITE RESP packet, s_last_psn + * can't be advanced. + */ + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && + opcode != TID_OP(WRITE_RESP) && + cmp_psn(psn, wqe->psn) >= 0) + return 1; + update_last_psn(qp, psn); return 1; @@ -1785,20 +2170,31 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, goto bail_stop; if (qp->s_flags & RVT_S_WAIT_RNR) goto bail_stop; - if (qp->s_rnr_retry == 0) { + rdi = ib_to_rvt(qp->ibqp.device); + if (qp->s_rnr_retry == 0 && + !((rdi->post_parms[wqe->wr.opcode].flags & + RVT_OPERATION_IGN_RNR_CNT) && + qp->s_rnr_retry_cnt == 0)) { status = IB_WC_RNR_RETRY_EXC_ERR; goto class_b; } - if (qp->s_rnr_retry_cnt < 7) + if (qp->s_rnr_retry_cnt < 7 && qp->s_rnr_retry_cnt > 0) qp->s_rnr_retry--; - /* The last valid PSN is the previous PSN. */ - update_last_psn(qp, psn - 1); + /* + * The last valid PSN is the previous PSN. For TID RDMA WRITE + * request, s_last_psn should be incremented only when a TID + * RDMA WRITE RESP is received to avoid skipping lost TID RDMA + * WRITE RESP packets. + */ + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) { + reset_psn(qp, qp->s_last_psn + 1); + } else { + update_last_psn(qp, psn - 1); + reset_psn(qp, psn); + } ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn); - - reset_psn(qp, psn); - qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK); rvt_stop_rc_timers(qp); rvt_add_rnr_timer(qp, aeth); @@ -1883,6 +2279,7 @@ static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn, while (cmp_psn(psn, wqe->lpsn) > 0) { if (wqe->wr.opcode == IB_WR_RDMA_READ || wqe->wr.opcode == IB_WR_TID_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_WRITE || wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) break; @@ -2375,6 +2772,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) u32 tlen = packet->tlen; struct rvt_qp *qp = packet->qp; struct hfi1_ibport *ibp = rcd_to_iport(rcd); + struct hfi1_qp_priv *qpriv = qp->priv; struct ib_other_headers *ohdr = packet->ohdr; u32 opcode = packet->opcode; u32 hdrsize = packet->hlen; @@ -2682,6 +3080,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) qp->r_state = opcode; qp->r_nak_state = 0; qp->r_head_ack_queue = next; + qpriv->r_tid_alloc = qp->r_head_ack_queue; /* Schedule the send engine. */ qp->s_flags |= RVT_S_RESP_PENDING; @@ -2755,6 +3154,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) qp->r_state = opcode; qp->r_nak_state = 0; qp->r_head_ack_queue = next; + qpriv->r_tid_alloc = qp->r_head_ack_queue; /* Schedule the send engine. */ qp->s_flags |= RVT_S_RESP_PENDING; diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index 081a7cf..fe8261e 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -785,6 +785,7 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr, * @qp: a pointer to QP * @ps: a pointer to a structure with commonly lookup values for * the the send engine progress + * @tid - true if the tid leg * * This routine checks if the time slice for the QP has expired * for RC QPs, if so an additional work entry is queued. At this @@ -792,7 +793,7 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr, * returns true if a yield is required, otherwise, false * is returned. */ -bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps, +bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps, bool tid) { ps->pkts_sent = true; @@ -921,6 +922,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) if (priv->s_flags & HFI1_S_TID_BUSY_SET) qp->s_flags |= RVT_S_BUSY; spin_unlock_irqrestore(&qp->s_lock, ps.flags); + /* * If the packet cannot be sent now, return and * the send engine will be woken up later. diff --git a/drivers/infiniband/hw/hfi1/sdma_txreq.h b/drivers/infiniband/hw/hfi1/sdma_txreq.h index bf7d777..6a06242 100644 --- a/drivers/infiniband/hw/hfi1/sdma_txreq.h +++ b/drivers/infiniband/hw/hfi1/sdma_txreq.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -91,6 +91,7 @@ struct sdma_desc { #define SDMA_TXREQ_F_URGENT 0x0001 #define SDMA_TXREQ_F_AHG_COPY 0x0002 #define SDMA_TXREQ_F_USE_AHG 0x0004 +#define SDMA_TXREQ_F_SGE_CORRUPT 0x0008 struct sdma_txreq; typedef void (*callback_t)(struct sdma_txreq *, int); diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 50dc4af..8887025 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -70,6 +70,20 @@ * -- Each local scatter-gather entry should be a multiple of 4K page size; */ +/** + * DOC: TID RDMA WRITE protocol + * + * This is an end-to-end protocol at the hfi1 level between two nodes that + * improves performance by avoiding data copy on the responder side. It + * converts a qualified RDMA WRITE request into a TID RDMA WRITE request on + * the requester side and thereafter handles the request and response + * differently. To be qualified, the RDMA WRITE request should meet the + * following: + * -- The total data length should be greater than 256K; + * -- The total data length should be a multiple of 4K page size; + * -- The remote address should be 4K page aligned; + */ + #define MAX_EXPECTED_PAGES (MAX_EXPECTED_BUFFER / PAGE_SIZE) #define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK BIT_ULL(32) @@ -193,6 +207,18 @@ static int hfi1_send_tid_ok(struct rvt_qp *qp) !(qp->s_flags & HFI1_S_ANY_TID_WAIT_SEND)); } +void hfi1_del_tid_reap_timer(struct rvt_qp *qp) +{ +} + +void hfi1_add_tid_retry_timer(struct rvt_qp *qp) +{ +} + +void hfi1_del_tid_retry_timer(struct rvt_qp *qp) +{ +} + static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p) { return @@ -2257,6 +2283,10 @@ void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp) } } +void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp) +{ +} + u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry, void *context, int vl, int mode, u64 data) { @@ -2408,6 +2438,21 @@ static void hfi1_do_tid_send(struct rvt_qp *qp) spin_unlock_irqrestore(&qp->s_lock, ps.flags); } +u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, + u32 *bth1, u32 *bth2, u32 *len) +{ + return 0; +} + +u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, + struct ib_other_headers *ohdr, u32 *bth1, + u32 bth2, u32 *len, + struct rvt_sge_state **ss) +{ + return 0; +} + u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe, struct ib_other_headers *ohdr, u32 *bth1, u32 *bth2, u32 *len) diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 77b0ea9..89384fb 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -228,6 +228,12 @@ struct tid_rdma_flow { u8 resync_npkts; }; +enum tid_rnr_nak_state { + TID_RNR_NAK_INIT = 0, + TID_RNR_NAK_SEND, + TID_RNR_NAK_SENT, +}; + /** * kern_tid_node - used for managing TID's in TID groups * @@ -285,11 +291,17 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd, struct hfi1_packet *packet); +void hfi1_del_tid_reap_timer(struct rvt_qp *qp); + +void hfi1_add_tid_retry_timer(struct rvt_qp *qp); +void hfi1_del_tid_retry_timer(struct rvt_qp *qp); + bool hfi1_schedule_tid_send(struct rvt_qp *qp); int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, struct ib_qp_init_attr *init_attr); void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp); +void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp); void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, u32 *bth2); @@ -297,6 +309,13 @@ void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, void _hfi1_do_tid_send(struct work_struct *work); void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p); +u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, + u32 *bth1, u32 *bth2, u32 *len); +u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, + struct ib_other_headers *ohdr, u32 *bth1, + u32 bth2, u32 *len, + struct rvt_sge_state **ss); u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe, struct ib_other_headers *ohdr, u32 *bth1, u32 *bth2, u32 *len); diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index 3936690..af859a4 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -64,7 +64,6 @@ #include "hfi.h" #include "sdma.h" -#include "mmu_rb.h" #include "user_sdma.h" #include "verbs.h" /* for the headers */ #include "common.h" /* for struct hfi1_tid_info */ @@ -1130,7 +1129,8 @@ static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags) 0xffffffull), psn = val & mask; if (expct) - psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK); + psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) | + ((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK); else psn = psn + frags; return psn & mask; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 028c057..8bebf6b 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -311,6 +311,7 @@ static inline bool wss_exceeds_threshold(void) */ const enum ib_wc_opcode ib_hfi1_wc_opcode[] = { [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE, + [IB_WR_TID_RDMA_WRITE] = IB_WC_RDMA_WRITE, [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE, [IB_WR_SEND] = IB_WC_SEND, [IB_WR_SEND_WITH_IMM] = IB_WC_SEND, @@ -555,7 +556,7 @@ static inline opcode_handler qp_ok(struct hfi1_packet *packet) static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc) { #ifdef CONFIG_FAULT_INJECTION - if ((opcode & IB_OPCODE_MSP) == IB_OPCODE_MSP) + if ((opcode & IB_OPCODE_MSP) == IB_OPCODE_MSP) { /* * In order to drop non-IB traffic we * set PbcInsertHrc to NONE (0x2). @@ -566,8 +567,9 @@ static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc) * packet will not be delivered to the * correct context. */ + pbc &= ~PBC_INSERT_HCRC_SMASK; pbc |= (u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT; - else + } else { /* * In order to drop regular verbs * traffic we set the PbcTestEbp @@ -577,6 +579,7 @@ static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc) * triggered and will be dropped. */ pbc |= PBC_TEST_EBP; + } #endif return pbc; } @@ -909,46 +912,54 @@ static int wait_kmem(struct hfi1_ibdev *dev, return ret; } +static noinline int handle_corrupted_sge(struct sdma_engine *sde, + struct verbs_txreq *tx) +{ + tx->txreq.flags |= SDMA_TXREQ_F_SGE_CORRUPT; + return -EINVAL; +} + /* * This routine calls txadds for each sg entry. * * Add failures will revert the sge cursor */ -static noinline int build_verbs_ulp_payload( - struct sdma_engine *sde, - u32 length, - struct verbs_txreq *tx) +static noinline int build_verbs_ulp_payload(struct sdma_engine *sde, + u32 length, + struct verbs_txreq *tx) { - struct rvt_sge_state *ss = tx->ss; - struct rvt_sge *sg_list = ss->sg_list; - struct rvt_sge sge = ss->sge; - u8 num_sge = ss->num_sge; + struct rvt_sge *sg_list = tx->ss->sg_list; + struct rvt_sge sge = tx->ss->sge; + u8 num_sge = tx->ss->num_sge; u32 len; int ret = 0; while (length) { - len = ss->sge.length; + len = tx->ss->sge.length; if (len > length) len = length; - if (len > ss->sge.sge_length) - len = ss->sge.sge_length; - WARN_ON_ONCE(len == 0); + if (len > tx->ss->sge.sge_length) + len = tx->ss->sge.sge_length; + if (WARN_ON_ONCE(len == 0)) { + ret = handle_corrupted_sge(sde, tx); + goto bail_txadd; + } ret = sdma_txadd_kvaddr( sde->dd, &tx->txreq, - ss->sge.vaddr, + tx->ss->sge.vaddr, len); if (ret) goto bail_txadd; - rvt_update_sge(ss, len, false); + rvt_update_sge(tx->ss, len, false); length -= len; } return ret; bail_txadd: /* unwind cursor */ - ss->sge = sge; - ss->num_sge = num_sge; - ss->sg_list = sg_list; + tx->ss->sge = sge; + tx->ss->num_sge = num_sge; + tx->ss->sg_list = sg_list; return ret; } @@ -1097,6 +1108,14 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, qp->srate_mbps, vl, plen); + + /* Update HCRC based on packet opcode */ + if ((ps->opcode & IB_OPCODE_TID_RDMA) == + IB_OPCODE_TID_RDMA) { + pbc &= ~PBC_INSERT_HCRC_SMASK; + pbc |= (u64)PBC_IHCRC_LKDETH << + PBC_INSERT_HCRC_SHIFT; + } } tx->wqe = qp->s_wqe; ret = build_verbs_tx_desc(tx->sde, len, tx, ahg_info, pbc); @@ -1119,8 +1138,11 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, /* The current one got "sent" */ return 0; bail_build: + if (unlikely(tx->txreq.flags & SDMA_TXREQ_F_SGE_CORRUPT)) + goto put_txreq; ret = wait_kmem(dev, qp, ps); if (!ret) { +put_txreq: /* free txreq - bad state */ hfi1_put_txreq(ps->s_txreq); ps->s_txreq = NULL; @@ -1193,7 +1215,6 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, { struct hfi1_qp_priv *priv = qp->priv; u32 hdrwords = ps->s_txreq->hdr_dwords; - struct rvt_sge_state *ss = ps->s_txreq->ss; u32 len = ps->s_txreq->s_cur_size; u32 dwords; u32 plen; @@ -1246,6 +1267,12 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode))) pbc = hfi1_fault_tx(qp, ps->opcode, pbc); pbc = create_pbc(ppd, pbc, qp->srate_mbps, vl, plen); + + /* Update HCRC based on packet opcode */ + if ((ps->opcode & IB_OPCODE_TID_RDMA) == IB_OPCODE_TID_RDMA) { + pbc &= ~PBC_INSERT_HCRC_SMASK; + pbc |= (u64)PBC_IHCRC_LKDETH << PBC_INSERT_HCRC_SHIFT; + } } if (cb) iowait_pio_inc(&priv->s_iowait); @@ -1286,14 +1313,14 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, } else { seg_pio_copy_start(pbuf, pbc, hdr, hdrwords * 4); - if (ss) { + if (ps->s_txreq->ss) { while (len) { - void *addr = ss->sge.vaddr; - u32 slen = ss->sge.length; + void *addr = ps->s_txreq->ss->sge.vaddr; + u32 slen = ps->s_txreq->ss->sge.length; if (slen > len) slen = len; - rvt_update_sge(ss, slen, false); + rvt_update_sge(ps->s_txreq->ss, slen, false); seg_pio_copy_mid(pbuf, addr, slen); len -= slen; } @@ -1437,16 +1464,21 @@ static inline send_routine get_send_routine(struct rvt_qp *qp, case IB_QPT_GSI: case IB_QPT_UD: break; + case IB_QPT_RC: case IB_QPT_UC: - case IB_QPT_RC: { + /* + * RC QPs which support TID RDMA could use PIO for + * TID RDMA WRITE REQ packets. The opcode test should + * allow both valid RC opcodes and TID RDMA WRITE REQ. + */ if (piothreshold && tx->s_cur_size <= min(piothreshold, qp->pmtu) && - (BIT(ps->opcode & OPMASK) & pio_opmask[ps->opcode >> 5]) && + ((BIT(ps->opcode & OPMASK) & + pio_opmask[ps->opcode >> 5])) && iowait_sdma_pending(&priv->s_iowait) == 0 && !sdma_txreq_built(&tx->txreq)) return dd->process_pio_send; break; - } default: break; } @@ -1529,7 +1561,8 @@ int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps) ps->s_txreq->psc, ps, HFI1_S_WAIT_PIO_DRAIN); - return sr(qp, ps, 0); + ret = sr(qp, ps, 0); + return ret; } /** @@ -1561,7 +1594,9 @@ static void hfi1_fill_device_attr(struct hfi1_devdata *dd) rdi->dparms.props.max_mr_size = U64_MAX; rdi->dparms.props.max_fast_reg_page_list_len = UINT_MAX; rdi->dparms.props.max_qp = hfi1_max_qps; - rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs; + rdi->dparms.props.max_qp_wr = + (hfi1_max_qp_wrs >= HFI1_QP_WQE_INVALID ? + HFI1_QP_WQE_INVALID - 1 : hfi1_max_qp_wrs); rdi->dparms.props.max_send_sge = hfi1_max_sges; rdi->dparms.props.max_recv_sge = hfi1_max_sges; rdi->dparms.props.max_sge_rd = hfi1_max_sges; diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index d3aba66..6d11e5c 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -164,25 +164,51 @@ struct hfi1_qp_priv { u32 tid_enqueue; /* saved when tid waited */ u8 s_sc; /* SC[0..4] for next packet */ struct iowait s_iowait; + struct timer_list s_tid_timer; /* for timing tid wait */ + struct timer_list s_tid_retry_timer; /* for timing tid ack */ struct list_head tid_wait; /* for queueing tid space */ struct hfi1_opfn_data opfn; struct tid_flow_state flow_state; struct tid_rdma_qp_params tid_rdma; struct rvt_qp *owner; u8 hdr_type; /* 9B or 16B */ + struct rvt_sge_state tid_ss; /* SGE state pointer for 2nd leg */ + atomic_t n_requests; /* # of TID RDMA requests in the */ + /* queue */ + atomic_t n_tid_requests; /* # of sent TID RDMA requests */ unsigned long tid_timer_timeout_jiffies; unsigned long tid_retry_timeout_jiffies; /* variables for the TID RDMA SE state machine */ + u8 s_state; + u8 s_nak_state; u8 s_retry; u32 s_flags; + u32 s_tid_cur; + u32 s_tid_head; + u32 s_tid_tail; + u32 r_tid_head; /* Most recently added TID RDMA request */ + u32 r_tid_tail; /* the last completed TID RDMA request */ + u32 r_tid_ack; /* the TID RDMA request to be ACK'ed */ + u32 r_tid_alloc; /* Request for which we are allocating resources */ + u32 pending_tid_w_segs; /* Num of pending tid write segments */ + u32 pending_tid_w_resp; /* Num of pending tid write responses */ + u32 alloc_w_segs; /* Number of segments for which write */ + /* resources have been allocated for this QP */ /* For TID RDMA READ */ u32 tid_r_reqs; /* Num of tid reads requested */ u32 tid_r_comp; /* Num of tid reads completed */ u32 pending_tid_r_segs; /* Num of pending tid read segments */ u16 pkts_ps; /* packets per segment */ u8 timeout_shift; /* account for number of packets per segment */ + u8 rnr_nak_state; /* RNR NAK state */ + u32 r_next_psn_ib; + u32 r_next_psn_kdeth; + bool sync_pt; /* Set when QP reaches sync point */ + bool resync; }; +#define HFI1_QP_WQE_INVALID ((u32)-1) + struct hfi1_swqe_priv { struct tid_rdma_request tid_req; u32 flags; @@ -190,6 +216,8 @@ struct hfi1_swqe_priv { }; struct hfi1_ack_priv { + struct rvt_sge_state ss; /* used for TID WRITE RESP */ + struct rvt_sge sge; /* used for TID WRITE RESP */ struct tid_rdma_request tid_req; }; diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 7f86b4c..67fbe72 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1465,11 +1465,11 @@ int rvt_destroy_qp(struct ib_qp *ibqp) kref_put(&qp->ip->ref, rvt_release_mmap_info); else vfree(qp->r_rq.wq); - vfree(qp->s_wq); rdi->driver_f.qp_priv_free(rdi, qp); kfree(qp->s_ack_queue); rdma_destroy_ah_attr(&qp->remote_ah_attr); rdma_destroy_ah_attr(&qp->alt_ah_attr); + vfree(qp->s_wq); kfree(qp); return 0; } diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 8a80526..6fc662c 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -246,6 +246,7 @@ struct rvt_ack_entry { #define RVT_OPERATION_ATOMIC_SGE 0x00000004 #define RVT_OPERATION_LOCAL 0x00000008 #define RVT_OPERATION_USE_RESERVE 0x00000010 +#define RVT_OPERATION_IGN_RNR_CNT 0x00000020 #define RVT_OPERATION_MAX (IB_WR_RESERVED10 + 1) -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html