This patch modifies read_reply() in rxe_resp.c to retry the send when rxe_xmit_packet() fails with err == -EAGAIN. When the IP layer drops a packet, it needs more time to recover than an immediate retry allows, so a subroutine, read_retry_delay(), is added that dynamically estimates the time required for this recovery and inserts a delay before the retry. Signed-off-by: Bob Pearson <rpearsonhpe@xxxxxxxxx> --- drivers/infiniband/sw/rxe/rxe_resp.c | 62 +++++++++++++++++++++++++-- drivers/infiniband/sw/rxe/rxe_verbs.h | 9 ++++ 2 files changed, 68 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index cd2d88de287c..4e2fa2d72e70 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -863,6 +863,57 @@ static struct rxe_mr *rxe_recheck_mr(struct rxe_qp *qp, u32 rkey) return mr; } +/* Compute the delay to insert before retrying sending a + * dropped read reply packet in microseconds. Compute as half + * the average burst delay over the last 128 delay bursts. + * Slowly decay the delay if many good packets are seen. + */ +static int read_retry_delay(struct rxe_qp *qp, int err) +{ + struct tune_read_drop *tune = &qp->resp.tune_read_drop; + u32 delay = tune->delay; + u32 num = tune->num_bursts; + u32 good = tune->num_good_pkts; + u32 burst = tune->burst_delay; + u32 tot = tune->total_delay; + + if (err == -EAGAIN) { + burst += delay; + good = 0; + } else if (burst) { + tot += burst; + burst = 0; + num++; + } else { + good++; + } + + if (num >= (1 << 7)) { + delay = tot >> 8; + tot = 0; + num = 0; + rxe_dbg_qp(qp, "delay = %d", delay); + } + + if (delay > 1 && good > 512) { + good = 0; + delay--; + } + + /* make sure delay is at least 1 else algorithm breaks + * with tot = burst = 0 -> delay = 0 + */ + delay = delay ?: 1; + + tune->delay = delay; + tune->num_bursts = num; + tune->num_good_pkts = good; + tune->burst_delay = burst; + tune->total_delay = tot; + + return delay; +} + /* RDMA read response. 
If res is not NULL, then we have a current RDMA request * being processed or replayed. */ @@ -878,6 +929,7 @@ static enum resp_states read_reply(struct rxe_qp *qp, int err; struct resp_res *res = qp->resp.res; struct rxe_mr *mr; + int delay; if (!res) { res = rxe_prepare_res(qp, req_pkt, RXE_READ_MASK); @@ -909,8 +961,6 @@ static enum resp_states read_reply(struct rxe_qp *qp, opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST; } - res->state = rdatm_res_state_next; - payload = min_t(int, res->read.resid, mtu); skb = prepare_ack_packet(qp, &ack_pkt, opcode, payload, @@ -937,9 +987,15 @@ static enum resp_states read_reply(struct rxe_qp *qp, } err = rxe_xmit_packet(qp, &ack_pkt, skb); - if (err) + delay = read_retry_delay(qp, err); + if (err == -EAGAIN) { + udelay(delay); + return RESPST_READ_REPLY; + } else if (err) { return RESPST_ERR_RNR; + } + res->state = rdatm_res_state_next; res->read.va += payload; res->read.resid -= payload; res->cur_psn = (res->cur_psn + 1) & BTH_PSN_MASK; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index c269ae2a3224..84994a474e9a 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -203,6 +203,15 @@ struct rxe_resp_info { struct ib_sge sge[RXE_MAX_SGE]; } srq_wqe; + /* dynamic delay tuning for read reply drops */ + struct tune_read_drop { + u32 total_delay; + u32 burst_delay; + u32 num_bursts; + u32 num_good_pkts; + u32 delay; + } tune_read_drop; + /* Responder resources. It's a circular list where the oldest * resource is dropped first. */ -- 2.37.2