Re: [PATCH for-rc v2 4/4] RDMA/rxe: Reduce spurious retransmit timeouts

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 5/13/22 22:04, Bob Pearson wrote:
> Currently the rxe driver sets a timer to handle response timeouts
> from lost packets or other failures. However this timer is never
> deleted so it will always fire in the period it was set for.
> Under heavy load this means that there will be retry flows attempted
> every few msec depending on the setting for retry timeout.
> 
> This patch modifies the driver in several ways to reduce the
> number of spurious timeouts:
> 
>  - The psn of the current packet is saved when the timer is set
>    for non-read requests and for read requests a psn in the
>    expected response is saved. When a response packet is received
>    by the completer tasklet that has a psn >= to the saved psn
>    the timer is deleted.
>  - As soon as there is no pending timer any new request will
>    reset the timer. For long read replies the timer is reset
>    as blocks of data arrive so that the response is covered by
>    a timer.
> 
> For typical heavy loads (e.g. ib_send_bw etc.) the number of
> retries is reduced from ~40 retries per second to one every
> several seconds. These are legitimate timeouts where the responder
> does not reply in the given time.
> 
> Signed-off-by: Bob Pearson <rpearsonhpe@xxxxxxxxx>
> ---
>  drivers/infiniband/sw/rxe/rxe_comp.c  | 50 ++++++++++++++++++---------
>  drivers/infiniband/sw/rxe/rxe_req.c   | 26 ++++++++++++--
>  drivers/infiniband/sw/rxe/rxe_verbs.h |  1 +
>  3 files changed, 57 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c
> index 3c77201c01d1..a8ef1f781a24 100644
> --- a/drivers/infiniband/sw/rxe/rxe_comp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_comp.c
> @@ -170,12 +170,44 @@ static inline void reset_retry_counters(struct rxe_qp *qp)
>  	qp->comp.started_retry = 0;
>  }
>  
> +static void update_retransmit_timer(struct rxe_qp *qp,
> +				    struct rxe_pkt_info *pkt,
> +				    struct rxe_send_wqe *wqe)
> +{
> +	/* reset the retry timer for long read responses
> +	 * if there is more data expected
> +	 */
> +	if ((pkt->opcode & 0x1f) == IB_OPCODE_RDMA_READ_REQUEST) {

Oops. This is the wrong opcode. Should be one of the read replies.
Bob

> +		int psn = (pkt->psn + RXE_MAX_PKT_PER_ACK) & 0xffffff;
> +
> +		if (psn_compare(psn, wqe->last_psn) > 0)
> +			psn = wqe->last_psn;
> +
> +		if (psn_compare(psn, pkt->psn) > 0) {
> +			atomic_set(&qp->timeout_psn, psn);
> +			mod_timer(&qp->retrans_timer,
> +				  jiffies + qp->timeout_jiffies);
> +			return;
> +		}
> +	}
> +
> +	/* else just delete the timer */
> +	del_timer_sync(&qp->retrans_timer);
> +}
> +
>  static inline enum comp_state check_psn(struct rxe_qp *qp,
>  					struct rxe_pkt_info *pkt,
>  					struct rxe_send_wqe *wqe)
>  {
>  	s32 diff;
>  
> +	/* if this packet psn matches or exceeds the request that set the
> +	 * retry timer update the timer.
> +	 */
> +	if ((psn_compare(pkt->psn, atomic_read(&qp->timeout_psn)) >= 0) &&
> +	    timer_pending(&qp->retrans_timer))
> +		update_retransmit_timer(qp, pkt, wqe);
> +
>  	/* check to see if response is past the oldest WQE. if it is, complete
>  	 * send/write or error read/atomic
>  	 */
> @@ -663,20 +695,6 @@ int rxe_completer(void *arg)
>  				break;
>  			}
>  
> -			/* re reset the timeout counter if
> -			 * (1) QP is type RC
> -			 * (2) the QP is alive
> -			 * (3) there is a packet sent by the requester that
> -			 *     might be acked (we still might get spurious
> -			 *     timeouts but try to keep them as few as possible)
> -			 * (4) the timeout parameter is set
> -			 */
> -			if ((qp_type(qp) == IB_QPT_RC) &&
> -			    (qp->req.state == QP_STATE_READY) &&
> -			    (psn_compare(qp->req.psn, qp->comp.psn) > 0) &&
> -			    qp->timeout_jiffies)
> -				mod_timer(&qp->retrans_timer,
> -					  jiffies + qp->timeout_jiffies);
>  			ret = -EAGAIN;
>  			goto done;
>  
> @@ -684,9 +702,7 @@ int rxe_completer(void *arg)
>  			/* we come here if the retry timer fired and we did
>  			 * not receive a response packet. try to retry the send
>  			 * queue if that makes sense and the limits have not
> -			 * been exceeded. remember that some timeouts are
> -			 * spurious since we do not reset the timer but kick
> -			 * it down the road or let it expire
> +			 * been exceeded.
>  			 */
>  
>  			/* there is nothing to retry in this case */
> diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
> index aa9066ff5257..5e031661bc49 100644
> --- a/drivers/infiniband/sw/rxe/rxe_req.c
> +++ b/drivers/infiniband/sw/rxe/rxe_req.c
> @@ -397,7 +397,9 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp,
>  					 qp->attr.dest_qp_num;
>  
>  	ack_req = ((pkt->mask & RXE_END_MASK) ||
> -		(qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK));
> +		   (qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK) ||
> +		   (qp->timeout_jiffies &&
> +				!timer_pending(&qp->retrans_timer)));
>  	if (ack_req) {
>  		qp->req.noack_pkts = 0;
>  		pkt->mask |= RXE_ACK_REQ_MASK;
> @@ -545,10 +547,28 @@ static void update_state(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
>  
>  	qp->need_req_skb = 0;
>  
> -	if ((pkt->mask & RXE_ACK_REQ_MASK) && qp->timeout_jiffies &&
> -	    !timer_pending(&qp->retrans_timer))
> +	/* reset the retry timer if the packet has the ackreq bit set,
> +	 * we are using the retry timer and there is no timer set
> +	 * currently
> +	 */
> +	if (pkt->mask & RXE_ACK_REQ_MASK && qp->timeout_jiffies &&
> +	    !timer_pending(&qp->retrans_timer)) {
> +		int psn = pkt->psn;
> +
> +		/* for read ops delay matching psn by RXE_MAX_PKT_PER_ACK
> +		 * up to the last psn. The completer will also reset the
> +		 * retry timer as required
> +		 */
> +		if ((pkt->opcode & 0x1f) == IB_OPCODE_RDMA_READ_REQUEST) {
> +			psn = (psn + RXE_MAX_PKT_PER_ACK) & 0xffffff;
> +			if (psn_compare(psn, wqe->last_psn) > 0)
> +				psn = wqe->last_psn;
> +		}
> +
> +		atomic_set(&qp->timeout_psn, psn);
>  		mod_timer(&qp->retrans_timer,
>  			  jiffies + qp->timeout_jiffies);
> +	}
>  }
>  
>  static int rxe_do_local_ops(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
> index a6c6f0d786c7..c41092d790f0 100644
> --- a/drivers/infiniband/sw/rxe/rxe_verbs.h
> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
> @@ -252,6 +252,7 @@ struct rxe_qp {
>  	 */
>  	struct timer_list	retrans_timer;
>  	u64			timeout_jiffies;
> +	atomic_t		timeout_psn;
>  
>  	/* Timer for handling RNR NAKS. */
>  	struct timer_list	rnr_nak_timer;




[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux