From: Kaike Wan <kaike.wan@xxxxxxxxx> This locking mechanism is designed to provent vavious memory corruption scenarios from occurring when requests are pipelined, especially when RDMA READ/WRITE requests are interleaved with TID RDMA READ/WRITE requests: 1. READ-AFTER-READ; 2. READ-AFTER-WRITE; 3. WRITE-AFTER-READ; 4. WRITE-AFTER-WRITE. When memory corruption is likely, a request will be held back until previous requests have been completed. Reviewed-by: Mike Marciniszyn <mike.marciniszyn@xxxxxxxxx> Signed-off-by: Mitko Haralanov <mitko.haralanov@xxxxxxxxx> Signed-off-by: Kaike Wan <kaike.wan@xxxxxxxxx> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@xxxxxxxxx> --- drivers/infiniband/hw/hfi1/rc.c | 22 ++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.c | 76 +++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 3 + 3 files changed, 101 insertions(+), 0 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 7bc635b..a65dc46 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -172,6 +172,12 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, } e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + /* Check for tid write fence */ + if ((qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) || + hfi1_tid_rdma_ack_interlock(qp, e)) { + iowait_set_flag(&qpriv->s_iowait, IOWAIT_PENDING_IB); + goto bail; + } if (e->opcode == OP(RDMA_READ_REQUEST)) { /* * If a RDMA read response is being resent and @@ -573,6 +579,15 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) len = wqe->length; ss = &qp->s_sge; bth2 = mask_psn(qp->s_psn); + + /* + * Interlock between various IB requests and TID RDMA + * if necessary. + */ + if ((priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) || + hfi1_tid_rdma_wqe_interlock(qp, wqe)) + goto bail; + switch (wqe->wr.opcode) { case IB_WR_SEND: case IB_WR_SEND_WITH_IMM: @@ -1556,6 +1571,7 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) qp->s_state = OP(SEND_LAST); } done: + priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK; qp->s_psn = psn; /* * Set RVT_S_WAIT_PSN as rc_complete() may start the timer @@ -1828,6 +1844,8 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, struct rvt_swqe *wqe, struct hfi1_ibport *ibp) { + struct hfi1_qp_priv *priv = qp->priv; + lockdep_assert_held(&qp->s_lock); /* * Don't decrement refcount and don't generate a @@ -1904,6 +1922,10 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, qp->s_draining = 0; wqe = rvt_get_swqe_ptr(qp, qp->s_acked); } + if (priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) { + priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK; + hfi1_schedule_send(qp); + } return wqe; } diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 0fd823c..932ef05 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -3895,6 +3895,82 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, goto unlock; } +bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe) +{ + struct rvt_swqe *prev; + struct tid_rdma_request *req; + struct hfi1_qp_priv *priv = qp->priv; + u32 s_prev; + + s_prev = (qp->s_cur == 0 ? qp->s_size : qp->s_cur) - 1; + prev = rvt_get_swqe_ptr(qp, s_prev); + + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + case IB_WR_SEND_WITH_INV: + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + case IB_WR_RDMA_WRITE: + switch (prev->wr.opcode) { + case IB_WR_TID_RDMA_WRITE: + req = wqe_to_tid_req(prev); + if (req->ack_seg != req->total_segs) + goto interlock; + default: + break; + } + case IB_WR_RDMA_READ: + if (prev->wr.opcode != IB_WR_TID_RDMA_WRITE) + break; + /* fall through */ + case IB_WR_TID_RDMA_READ: + switch (prev->wr.opcode) { + case IB_WR_RDMA_READ: + if (qp->s_acked != qp->s_cur) + goto interlock; + break; + case IB_WR_TID_RDMA_WRITE: + req = wqe_to_tid_req(prev); + if (req->ack_seg != req->total_segs) + goto interlock; + default: + break; + } + default: + break; + } + return false; + +interlock: + priv->s_flags |= HFI1_S_TID_WAIT_INTERLCK; + return true; +} + +bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e) +{ + struct rvt_ack_entry *prev; + struct tid_rdma_request *req; + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + struct hfi1_qp_priv *priv = qp->priv; + u32 s_prev; + + s_prev = qp->s_tail_ack_queue == 0 ? rvt_size_atomic(&dev->rdi) : + (qp->s_tail_ack_queue - 1); + prev = &qp->s_ack_queue[s_prev]; + + if ((e->opcode == TID_OP(READ_REQ) || + e->opcode == OP(RDMA_READ_REQUEST)) && + prev->opcode == TID_OP(WRITE_REQ)) { + req = ack_to_tid_req(prev); + if (req->ack_seg != req->total_segs) { + priv->s_flags |= HFI1_R_TID_WAIT_INTERLCK; + return true; + } + } + return false; +} + static void hfi1_tid_retry_timeout(struct timer_list *t) { struct hfi1_qp_priv *priv = from_timer(priv, t, s_tid_retry_timer); diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 89384fb..9a0c353 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -291,6 +291,9 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd, struct hfi1_packet *packet); +bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe); +bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e); + void hfi1_del_tid_reap_timer(struct rvt_qp *qp); void hfi1_add_tid_retry_timer(struct rvt_qp *qp); -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html