[PATCH for-rc v2 4/4] RDMA/rxe: Reduce spurious retransmit timeouts

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Currently the rxe driver sets a timer to handle response timeouts
from lost packets or other failures. However this timer is never
deleted so it will always fire in the period it was set for.
Under heavy load this means that there will be retry flows attempted
every few msec depending on the setting for retry timeout.

This patch modifies the driver in several ways to reduce the
number of spurious timeouts:

 - The psn of the current packet is saved when the timer is set
   for non-read requests and for read requests a psn in the
   expected response is saved. When a response packet is received
   by the completer tasklet that has a psn >= to the saved psn
   the timer is deleted.
 - As soon as there is no pending timer any new request will
   reset the timer. For long read replies the timer is reset
   as blocks of data arrive so that the response is covered by
   a timer.

For typical heavy loads (e.g. ib_send_bw etc.) the number of
retries is reduced from ~40 retries per second to one every
several seconds. These are legitimate timeouts where the responder
does not reply in the given time.

Signed-off-by: Bob Pearson <rpearsonhpe@xxxxxxxxx>
---
 drivers/infiniband/sw/rxe/rxe_comp.c  | 50 ++++++++++++++++++---------
 drivers/infiniband/sw/rxe/rxe_req.c   | 26 ++++++++++++--
 drivers/infiniband/sw/rxe/rxe_verbs.h |  1 +
 3 files changed, 57 insertions(+), 20 deletions(-)

diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c
index 3c77201c01d1..a8ef1f781a24 100644
--- a/drivers/infiniband/sw/rxe/rxe_comp.c
+++ b/drivers/infiniband/sw/rxe/rxe_comp.c
@@ -170,12 +170,44 @@ static inline void reset_retry_counters(struct rxe_qp *qp)
 	qp->comp.started_retry = 0;
 }
 
+static void update_retransmit_timer(struct rxe_qp *qp,
+				    struct rxe_pkt_info *pkt,
+				    struct rxe_send_wqe *wqe)
+{
+	/* reset the retry timer for long read responses
+	 * if there is more data expected
+	 */
+	if ((pkt->opcode & 0x1f) == IB_OPCODE_RDMA_READ_REQUEST) {
+		int psn = (pkt->psn + RXE_MAX_PKT_PER_ACK) & 0xffffff;
+
+		if (psn_compare(psn, wqe->last_psn) > 0)
+			psn = wqe->last_psn;
+
+		if (psn_compare(psn, pkt->psn) > 0) {
+			atomic_set(&qp->timeout_psn, psn);
+			mod_timer(&qp->retrans_timer,
+				  jiffies + qp->timeout_jiffies);
+			return;
+		}
+	}
+
+	/* else just delete the timer */
+	del_timer_sync(&qp->retrans_timer);
+}
+
 static inline enum comp_state check_psn(struct rxe_qp *qp,
 					struct rxe_pkt_info *pkt,
 					struct rxe_send_wqe *wqe)
 {
 	s32 diff;
 
+	/* if this packet psn matches or exceeds the request that set the
+	 * retry timer update the timer.
+	 */
+	if ((psn_compare(pkt->psn, atomic_read(&qp->timeout_psn)) >= 0) &&
+	    timer_pending(&qp->retrans_timer))
+		update_retransmit_timer(qp, pkt, wqe);
+
 	/* check to see if response is past the oldest WQE. if it is, complete
 	 * send/write or error read/atomic
 	 */
@@ -663,20 +695,6 @@ int rxe_completer(void *arg)
 				break;
 			}
 
-			/* re reset the timeout counter if
-			 * (1) QP is type RC
-			 * (2) the QP is alive
-			 * (3) there is a packet sent by the requester that
-			 *     might be acked (we still might get spurious
-			 *     timeouts but try to keep them as few as possible)
-			 * (4) the timeout parameter is set
-			 */
-			if ((qp_type(qp) == IB_QPT_RC) &&
-			    (qp->req.state == QP_STATE_READY) &&
-			    (psn_compare(qp->req.psn, qp->comp.psn) > 0) &&
-			    qp->timeout_jiffies)
-				mod_timer(&qp->retrans_timer,
-					  jiffies + qp->timeout_jiffies);
 			ret = -EAGAIN;
 			goto done;
 
@@ -684,9 +702,7 @@ int rxe_completer(void *arg)
 			/* we come here if the retry timer fired and we did
 			 * not receive a response packet. try to retry the send
 			 * queue if that makes sense and the limits have not
-			 * been exceeded. remember that some timeouts are
-			 * spurious since we do not reset the timer but kick
-			 * it down the road or let it expire
+			 * been exceeded.
 			 */
 
 			/* there is nothing to retry in this case */
diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
index aa9066ff5257..5e031661bc49 100644
--- a/drivers/infiniband/sw/rxe/rxe_req.c
+++ b/drivers/infiniband/sw/rxe/rxe_req.c
@@ -397,7 +397,9 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp,
 					 qp->attr.dest_qp_num;
 
 	ack_req = ((pkt->mask & RXE_END_MASK) ||
-		(qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK));
+		   (qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK) ||
+		   (qp->timeout_jiffies &&
+				!timer_pending(&qp->retrans_timer)));
 	if (ack_req) {
 		qp->req.noack_pkts = 0;
 		pkt->mask |= RXE_ACK_REQ_MASK;
@@ -545,10 +547,28 @@ static void update_state(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
 
 	qp->need_req_skb = 0;
 
-	if ((pkt->mask & RXE_ACK_REQ_MASK) && qp->timeout_jiffies &&
-	    !timer_pending(&qp->retrans_timer))
+	/* reset the retry timer if the packet has the ackreq bit set,
+	 * we are using the retry timer and there is no timer set
+	 * currently
+	 */
+	if (pkt->mask & RXE_ACK_REQ_MASK && qp->timeout_jiffies &&
+	    !timer_pending(&qp->retrans_timer)) {
+		int psn = pkt->psn;
+
+		/* for read ops delay matching psn by RXE_MAX_PKT_PER_ACK
+		 * up to the last psn. The completer will also reset the
+		 * retry timer as required
+		 */
+		if ((pkt->opcode & 0x1f) == IB_OPCODE_RDMA_READ_REQUEST) {
+			psn = (psn + RXE_MAX_PKT_PER_ACK) & 0xffffff;
+			if (psn_compare(psn, wqe->last_psn) > 0)
+				psn = wqe->last_psn;
+		}
+
+		atomic_set(&qp->timeout_psn, psn);
 		mod_timer(&qp->retrans_timer,
 			  jiffies + qp->timeout_jiffies);
+	}
 }
 
 static int rxe_do_local_ops(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index a6c6f0d786c7..c41092d790f0 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -252,6 +252,7 @@ struct rxe_qp {
 	 */
 	struct timer_list	retrans_timer;
 	u64			timeout_jiffies;
+	atomic_t		timeout_psn;
 
 	/* Timer for handling RNR NAKS. */
 	struct timer_list	rnr_nak_timer;
-- 
2.34.1




[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux