Currently the rxe driver sets a timer to handle response timeouts from lost packets or other failures. However this timer is never deleted so it will always fire in the period it was set for. Under heavy load this means that there will be retry flows attempted every few msec depending on the setting for retry timeout. This patch modifies the driver in several ways to reduce the number of spurious timeouts: - The psn of the current packet is saved when the timer is set for non-read requests and for read requests a psn in the expected response is saved. When a response packet is received by the completer tasklet that has a psn >= to the saved psn the timer is deleted. - As soon as there is no pending timer any new request will reset the timer. For long read replies the timer is reset as blocks of data arrive so that the response is covered by a timer. For typical heavy loads (e.g. ib_send_bw etc.) the number of retries is reduced from ~40 retries per second to one every several seconds. These are legitimate timeouts where the responder does not reply in the given time. Signed-off-by: Bob Pearson <rpearsonhpe@xxxxxxxxx> --- drivers/infiniband/sw/rxe/rxe_comp.c | 50 ++++++++++++++++++--------- drivers/infiniband/sw/rxe/rxe_req.c | 26 ++++++++++++-- drivers/infiniband/sw/rxe/rxe_verbs.h | 1 + 3 files changed, 57 insertions(+), 20 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index 3c77201c01d1..a8ef1f781a24 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -170,12 +170,44 @@ static inline void reset_retry_counters(struct rxe_qp *qp) qp->comp.started_retry = 0; } +static void update_retransmit_timer(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + /* reset the retry timer for long read responses + * if there is more data expected + */ + if ((pkt->opcode & 0x1f) == IB_OPCODE_RDMA_READ_REQUEST) { + int psn = (pkt->psn + RXE_MAX_PKT_PER_ACK) & 0xffffff; + + if (psn_compare(psn, wqe->last_psn) > 0) + psn = wqe->last_psn; + + if (psn_compare(psn, pkt->psn) > 0) { + atomic_set(&qp->timeout_psn, psn); + mod_timer(&qp->retrans_timer, + jiffies + qp->timeout_jiffies); + return; + } + } + + /* else just delete the timer */ + del_timer_sync(&qp->retrans_timer); +} + static inline enum comp_state check_psn(struct rxe_qp *qp, struct rxe_pkt_info *pkt, struct rxe_send_wqe *wqe) { s32 diff; + /* if this packet psn matches or exceeds the request that set the + * retry timer update the timer. + */ + if ((psn_compare(pkt->psn, atomic_read(&qp->timeout_psn)) >= 0) && + timer_pending(&qp->retrans_timer)) + update_retransmit_timer(qp, pkt, wqe); + /* check to see if response is past the oldest WQE. if it is, complete * send/write or error read/atomic */ @@ -663,20 +695,6 @@ int rxe_completer(void *arg) break; } - /* re reset the timeout counter if - * (1) QP is type RC - * (2) the QP is alive - * (3) there is a packet sent by the requester that - * might be acked (we still might get spurious - * timeouts but try to keep them as few as possible) - * (4) the timeout parameter is set - */ - if ((qp_type(qp) == IB_QPT_RC) && - (qp->req.state == QP_STATE_READY) && - (psn_compare(qp->req.psn, qp->comp.psn) > 0) && - qp->timeout_jiffies) - mod_timer(&qp->retrans_timer, - jiffies + qp->timeout_jiffies); ret = -EAGAIN; goto done; @@ -684,9 +702,7 @@ int rxe_completer(void *arg) /* we come here if the retry timer fired and we did * not receive a response packet. try to retry the send * queue if that makes sense and the limits have not - * been exceeded. remember that some timeouts are - * spurious since we do not reset the timer but kick - * it down the road or let it expire + * been exceeded. */ /* there is nothing to retry in this case */ diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index aa9066ff5257..5e031661bc49 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -397,7 +397,9 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp, qp->attr.dest_qp_num; ack_req = ((pkt->mask & RXE_END_MASK) || - (qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK)); + (qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK) || + (qp->timeout_jiffies && + !timer_pending(&qp->retrans_timer))); if (ack_req) { qp->req.noack_pkts = 0; pkt->mask |= RXE_ACK_REQ_MASK; @@ -545,10 +547,28 @@ static void update_state(struct rxe_qp *qp, struct rxe_send_wqe *wqe, qp->need_req_skb = 0; - if ((pkt->mask & RXE_ACK_REQ_MASK) && qp->timeout_jiffies && - !timer_pending(&qp->retrans_timer)) + /* reset the retry timer if the packet has the ackreq bit set, + * we are using the retry timer and there is no timer set + * currently + */ + if (pkt->mask & RXE_ACK_REQ_MASK && qp->timeout_jiffies && + !timer_pending(&qp->retrans_timer)) { + int psn = pkt->psn; + + /* for read ops delay matching psn by RXE_MAX_PKT_PER_ACK + * up to the last psn. The completer will also reset the + * retry timer as required + */ + if ((pkt->opcode & 0x1f) == IB_OPCODE_RDMA_READ_REQUEST) { + psn = (psn + RXE_MAX_PKT_PER_ACK) & 0xffffff; + if (psn_compare(psn, wqe->last_psn) > 0) + psn = wqe->last_psn; + } + + atomic_set(&qp->timeout_psn, psn); mod_timer(&qp->retrans_timer, jiffies + qp->timeout_jiffies); + } } static int rxe_do_local_ops(struct rxe_qp *qp, struct rxe_send_wqe *wqe) diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index a6c6f0d786c7..c41092d790f0 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -252,6 +252,7 @@ struct rxe_qp { */ struct timer_list retrans_timer; u64 timeout_jiffies; + atomic_t timeout_psn; /* Timer for handling RNR NAKS. */ struct timer_list rnr_nak_timer; -- 2.34.1