On 5/13/22 22:04, Bob Pearson wrote: > Currently the rxe driver sets a timer to handle response timeouts > from lost packets or other failures. However this timer is never > deleted so it will always fire in the period it was set for. > Under heavy load this means that there will be retry flows attempted > every few msec depending on the setting for retry timeout. > > This patch modifies the driver in several ways to reduce the > number of spurious timeouts: > > - The psn of the current packet is saved when the timer is set > for non-read requests and for read requests a psn in the > expected response is saved. When a response packet is received > by the completer tasklet that has a psn >= to the saved psn > the timer is deleted. > - As soon as there is no pending timer any new request will > reset the timer. For long read replies the timer is reset > as blocks of data arrive so that the response is covered by > a timer. > > For typical heavy loads (e.g. ib_send_bw etc.) the number of > retries is reduced from ~40 retries per second to one every > several seconds. These are legitimate timeouts where the responder > does not reply in the given time. > > Signed-off-by: Bob Pearson <rpearsonhpe@xxxxxxxxx> > --- > drivers/infiniband/sw/rxe/rxe_comp.c | 50 ++++++++++++++++++--------- > drivers/infiniband/sw/rxe/rxe_req.c | 26 ++++++++++++-- > drivers/infiniband/sw/rxe/rxe_verbs.h | 1 + > 3 files changed, 57 insertions(+), 20 deletions(-) > > diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c > index 3c77201c01d1..a8ef1f781a24 100644 > --- a/drivers/infiniband/sw/rxe/rxe_comp.c > +++ b/drivers/infiniband/sw/rxe/rxe_comp.c > @@ -170,12 +170,44 @@ static inline void reset_retry_counters(struct rxe_qp *qp) > qp->comp.started_retry = 0; > } > > +static void update_retransmit_timer(struct rxe_qp *qp, > + struct rxe_pkt_info *pkt, > + struct rxe_send_wqe *wqe) > +{ > + /* reset the retry timer for long read responses > + * if there is more data expected > + */ > + if ((pkt->opcode & 0x1f) == IB_OPCODE_RDMA_READ_REQUEST) { Oops. This is the wrong opcode. Should be one of the read replies. Bob > + int psn = (pkt->psn + RXE_MAX_PKT_PER_ACK) & 0xffffff; > + > + if (psn_compare(psn, wqe->last_psn) > 0) > + psn = wqe->last_psn; > + > + if (psn_compare(psn, pkt->psn) > 0) { > + atomic_set(&qp->timeout_psn, psn); > + mod_timer(&qp->retrans_timer, > + jiffies + qp->timeout_jiffies); > + return; > + } > + } > + > + /* else just delete the timer */ > + del_timer_sync(&qp->retrans_timer); > +} > + > static inline enum comp_state check_psn(struct rxe_qp *qp, > struct rxe_pkt_info *pkt, > struct rxe_send_wqe *wqe) > { > s32 diff; > > + /* if this packet psn matches or exceeds the request that set the > + * retry timer update the timer. > + */ > + if ((psn_compare(pkt->psn, atomic_read(&qp->timeout_psn)) >= 0) && > + timer_pending(&qp->retrans_timer)) > + update_retransmit_timer(qp, pkt, wqe); > + > /* check to see if response is past the oldest WQE. if it is, complete > * send/write or error read/atomic > */ > @@ -663,20 +695,6 @@ int rxe_completer(void *arg) > break; > } > > - /* re reset the timeout counter if > - * (1) QP is type RC > - * (2) the QP is alive > - * (3) there is a packet sent by the requester that > - * might be acked (we still might get spurious > - * timeouts but try to keep them as few as possible) > - * (4) the timeout parameter is set > - */ > - if ((qp_type(qp) == IB_QPT_RC) && > - (qp->req.state == QP_STATE_READY) && > - (psn_compare(qp->req.psn, qp->comp.psn) > 0) && > - qp->timeout_jiffies) > - mod_timer(&qp->retrans_timer, > - jiffies + qp->timeout_jiffies); > ret = -EAGAIN; > goto done; > > @@ -684,9 +702,7 @@ int rxe_completer(void *arg) > /* we come here if the retry timer fired and we did > * not receive a response packet. try to retry the send > * queue if that makes sense and the limits have not > - * been exceeded. remember that some timeouts are > - * spurious since we do not reset the timer but kick > - * it down the road or let it expire > + * been exceeded. > */ > > /* there is nothing to retry in this case */ > diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c > index aa9066ff5257..5e031661bc49 100644 > --- a/drivers/infiniband/sw/rxe/rxe_req.c > +++ b/drivers/infiniband/sw/rxe/rxe_req.c > @@ -397,7 +397,9 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp, > qp->attr.dest_qp_num; > > ack_req = ((pkt->mask & RXE_END_MASK) || > - (qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK)); > + (qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK) || > + (qp->timeout_jiffies && > + !timer_pending(&qp->retrans_timer))); > if (ack_req) { > qp->req.noack_pkts = 0; > pkt->mask |= RXE_ACK_REQ_MASK; > @@ -545,10 +547,28 @@ static void update_state(struct rxe_qp *qp, struct rxe_send_wqe *wqe, > > qp->need_req_skb = 0; > > - if ((pkt->mask & RXE_ACK_REQ_MASK) && qp->timeout_jiffies && > - !timer_pending(&qp->retrans_timer)) > + /* reset the retry timer if the packet has the ackreq bit set, > + * we are using the retry timer and there is no timer set > + * currently > + */ > + if (pkt->mask & RXE_ACK_REQ_MASK && qp->timeout_jiffies && > + !timer_pending(&qp->retrans_timer)) { > + int psn = pkt->psn; > + > + /* for read ops delay matching psn by RXE_MAX_PKT_PER_ACK > + * up to the last psn. The completer will also reset the > + * retry timer as required > + */ > + if ((pkt->opcode & 0x1f) == IB_OPCODE_RDMA_READ_REQUEST) { > + psn = (psn + RXE_MAX_PKT_PER_ACK) & 0xffffff; > + if (psn_compare(psn, wqe->last_psn) > 0) > + psn = wqe->last_psn; > + } > + > + atomic_set(&qp->timeout_psn, psn); > mod_timer(&qp->retrans_timer, > jiffies + qp->timeout_jiffies); > + } > } > > static int rxe_do_local_ops(struct rxe_qp *qp, struct rxe_send_wqe *wqe) > diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h > index a6c6f0d786c7..c41092d790f0 100644 > --- a/drivers/infiniband/sw/rxe/rxe_verbs.h > +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h > @@ -252,6 +252,7 @@ struct rxe_qp { > */ > struct timer_list retrans_timer; > u64 timeout_jiffies; > + atomic_t timeout_psn; > > /* Timer for handling RNR NAKS. */ > struct timer_list rnr_nak_timer;