From: Kaike Wan <kaike.wan@xxxxxxxxx> This patch enables TID RDMA protocol by converting a qualified RDMA READ/WRITE request into a TID RDMA READ/WRITE request internally: (1) The TID RDMA cability must be enabled; (2) The request must start on a 4K page boundary; For read, all receiving buffers must start on 4K page boundaries; (3) The request length must be a multiple of 4K and must be larger or equal to 256K. For read, each receiving buffer length must be a multiple of 4K. Signed-off-by: Mitko Haralanov <mitko.haralanov@xxxxxxxxx> Signed-off-by: Mike Marciniszyn <mike.marciniszyn@xxxxxxxxx> Signed-off-by: Kaike Wan <kaike.wan@xxxxxxxxx> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@xxxxxxxxx> --- drivers/infiniband/hw/hfi1/qp.c | 16 ++++-- drivers/infiniband/hw/hfi1/tid_rdma.c | 82 +++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 11 ++++ drivers/infiniband/hw/hfi1/verbs.c | 2 - drivers/infiniband/hw/hfi1/verbs.h | 2 - drivers/infiniband/hw/qib/qib_verbs.c | 2 - drivers/infiniband/sw/rdmavt/qp.c | 32 +++++++++---- include/rdma/rdma_vt.h | 18 ++++++- 8 files changed, 142 insertions(+), 23 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index c762a98..bf2275e 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -304,26 +304,28 @@ void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, } /** - * hfi1_check_send_wqe - validate wqe + * hfi1_setup_wqe - setup the wqe * @qp - The qp * @wqe - The built wqe * - * validate wqe. This is called - * prior to inserting the wqe into - * the ring but after the wqe has been - * setup. + * Perform setup of the wqe. This is called + * prior to inserting the wqe into the ring but after + * the wqe has been setup by RDMAVT. This function + * allow the driver the opportunity to perform + * validation and additional setup of the wqe. * * Returns 0 on success, -EINVAL on failure * */ -int hfi1_check_send_wqe(struct rvt_qp *qp, - struct rvt_swqe *wqe) +int hfi1_setup_wqe(struct rvt_qp *qp, + struct rvt_swqe *wqe) { struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); struct rvt_ah *ah; switch (qp->ibqp.qp_type) { case IB_QPT_RC: + hfi1_setup_tid_rdma_wqe(qp, wqe); case IB_QPT_UC: if (wqe->length > 0x80000000U) return -EINVAL; diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 932ef05..d27338d 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -4009,6 +4009,88 @@ static void hfi1_tid_retry_timeout(struct timer_list *t) spin_unlock_irqrestore(&qp->r_lock, flags); } +/* Does @sge meet the alignment requirements for tid rdma? */ +static inline bool hfi1_check_sge_align(struct rvt_sge *sge, int num_sge) +{ + int i; + + for (i = 0; i < num_sge; i++, sge++) + if ((u64)sge->vaddr & ~PAGE_MASK || + sge->sge_length & ~PAGE_MASK) + return false; + return true; +} + +void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe) +{ + struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv; + struct hfi1_swqe_priv *priv = wqe->priv; + struct tid_rdma_params *remote; + bool do_tid_rdma = false; + + rcu_read_lock(); + remote = rcu_dereference(qpriv->tid_rdma.remote); + /* + * If TID RDMA is disabled by the negotiation, don't + * use it. + */ + if (!remote) + goto exit; + + if (wqe->wr.opcode == IB_WR_RDMA_READ) { + if (hfi1_check_sge_align(&wqe->sg_list[0], wqe->wr.num_sge) && + qpriv->tid_rdma.n_read < remote->max_read) { + wqe->wr.opcode = IB_WR_TID_RDMA_READ; + do_tid_rdma = true; + priv->tid_req.n_flows = remote->max_read; + qpriv->tid_r_reqs++; + } + } else if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { + /* + * TID RDMA is enabled for this RDMA WRITE request iff: + * 1. The remote address is page-aligned, + * 2. The length is larger than the minimum segment size, + * 3. The length is page-multiple, and + * 4. There are available TID RDMA WRITEs + * on the remote end. + */ + if (!(wqe->rdma_wr.remote_addr & ~PAGE_MASK) && + !(wqe->length & ~PAGE_MASK) && + qpriv->tid_rdma.n_write < remote->max_write) { + wqe->wr.opcode = IB_WR_TID_RDMA_WRITE; + do_tid_rdma = true; + } + } + + if (do_tid_rdma) { + priv->tid_req.seg_len = + min_t(u32, remote->max_len, wqe->length); + priv->tid_req.total_segs = + DIV_ROUND_UP(wqe->length, priv->tid_req.seg_len); + /* Compute the last PSN of the request */ + wqe->lpsn = wqe->psn; + if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { + wqe->lpsn += rvt_div_round_up_mtu(qp, wqe->length) - 1; + } else { + wqe->lpsn += priv->tid_req.total_segs - 1; + atomic_inc(&qpriv->n_requests); + } + priv->tid_req.cur_seg = 0; + priv->tid_req.comp_seg = 0; + priv->tid_req.ack_seg = 0; + priv->tid_req.state = TID_REQUEST_INACTIVE; + /* + * Reset acked_tail. + * TID RDMA READ does not have ACKs so it does not + * update the pointer. We have to reset it so TID RDMA + * WRITE does not get confused. + */ + priv->tid_req.acked_tail = priv->tid_req.setup_head; + } +exit: + rcu_read_unlock(); +} + static bool _hfi1_schedule_tid_send(struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 9a0c353..85696f6 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -299,6 +299,17 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, void hfi1_add_tid_retry_timer(struct rvt_qp *qp); void hfi1_del_tid_retry_timer(struct rvt_qp *qp); +void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe); +static inline void hfi1_setup_tid_rdma_wqe(struct rvt_qp *qp, + struct rvt_swqe *wqe) +{ + if (wqe->priv && + (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_RDMA_WRITE) && + wqe->length >= TID_RDMA_MIN_SEGMENT_SIZE) + setup_tid_rdma_wqe(qp, wqe); +} + bool hfi1_schedule_tid_send(struct rvt_qp *qp); int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 8bebf6b..d0eb34d 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -2134,9 +2134,9 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) dd->verbs_dev.rdi.driver_f.check_modify_qp = hfi1_check_modify_qp; dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp; dd->verbs_dev.rdi.driver_f.notify_restart_rc = hfi1_restart_rc; - dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe; dd->verbs_dev.rdi.driver_f.comp_vect_cpu_lookup = hfi1_comp_vect_mappings_lookup; + dd->verbs_dev.rdi.driver_f.setup_wqe = hfi1_setup_wqe; /* completeion queue */ dd->verbs_dev.rdi.ibdev.num_comp_vectors = dd->comp_vect_possible_cpus; diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 8ef9ae6..28a7610 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -433,7 +433,7 @@ int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait); -int hfi1_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe); +int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe); extern const u32 rc_only_opcode; extern const u32 uc_only_opcode; diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index a679940..26ab78e 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1588,7 +1588,7 @@ int qib_register_ib_device(struct qib_devdata *dd) dd->verbs_dev.rdi.driver_f.port_callback = qib_create_port_files; dd->verbs_dev.rdi.driver_f.get_pci_dev = qib_get_pci_dev; dd->verbs_dev.rdi.driver_f.check_ah = qib_check_ah; - dd->verbs_dev.rdi.driver_f.check_send_wqe = qib_check_send_wqe; + dd->verbs_dev.rdi.driver_f.setup_wqe = qib_check_send_wqe; dd->verbs_dev.rdi.driver_f.notify_new_ah = qib_notify_new_ah; dd->verbs_dev.rdi.driver_f.alloc_qpn = qib_alloc_qpn; dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qib_qp_priv_alloc; diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 67fbe72..f0fe058 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1831,15 +1831,11 @@ static int rvt_post_one_wr(struct rvt_qp *qp, wqe->wr.num_sge = j; } - /* general part of wqe valid - allow for driver checks */ - if (rdi->driver_f.check_send_wqe) { - ret = rdi->driver_f.check_send_wqe(qp, wqe); - if (ret < 0) - goto bail_inval_free; - if (ret) - *call_send = ret; - } - + /* + * Calculate and set SWQE PSN values prior to handing it off + * to the driver's check routine. This give the driver the + * opportunity to adjust PSN values based on internal checks. + */ log_pmtu = qp->log_pmtu; if (qp->ibqp.qp_type != IB_QPT_UC && qp->ibqp.qp_type != IB_QPT_RC) { @@ -1864,8 +1860,20 @@ static int rvt_post_one_wr(struct rvt_qp *qp, (wqe->length ? ((wqe->length - 1) >> log_pmtu) : 0); - qp->s_next_psn = wqe->lpsn + 1; } + + /* general part of wqe valid - allow for driver checks */ + if (rdi->driver_f.setup_wqe) { + ret = rdi->driver_f.setup_wqe(qp, wqe); + if (ret < 0) + goto bail_inval_free_ref; + if (ret) + *call_send = ret; + } + + if (!(rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL)) + qp->s_next_psn = wqe->lpsn + 1; + if (unlikely(reserved_op)) { wqe->wr.send_flags |= RVT_SEND_RESERVE_USED; rvt_qp_wqe_reserve(qp, wqe); @@ -1879,6 +1887,10 @@ static int rvt_post_one_wr(struct rvt_qp *qp, return 0; +bail_inval_free_ref: + if (qp->ibqp.qp_type != IB_QPT_UC && + qp->ibqp.qp_type != IB_QPT_RC) + atomic_dec(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount); bail_inval_free: /* release mr holds */ while (j) { diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index b1c3448..762bf52 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -203,7 +203,11 @@ struct rvt_driver_provided { * check_support() for details. */ - /* hot path calldowns in a single cacheline */ + /* + * hot path calldowns in a single cacheline + * + * Add data path calls below + */ /* * Give the driver a notice that there is send work to do. It is up to @@ -215,8 +219,14 @@ struct rvt_driver_provided { bool (*schedule_send)(struct rvt_qp *qp); bool (*schedule_send_no_lock)(struct rvt_qp *qp); - /* Driver specific work request checking */ - int (*check_send_wqe)(struct rvt_qp *qp, struct rvt_swqe *wqe); + /* + * Driver specific work request setup and checking. + * This function is allowed to perform any setup, checks, or + * adjustments required to the SWQE in order to be usable by + * underlying protocols. This includes private data structure + * allocations. + */ + int (*setup_wqe)(struct rvt_qp *qp, struct rvt_swqe *wqe); /* * Sometimes rdmavt needs to kick the driver's send progress. That is @@ -224,6 +234,8 @@ struct rvt_driver_provided { */ void (*do_send)(struct rvt_qp *qp); + /* end of hot path calldowns */ + /* Passed to ib core registration. Callback to create syfs files */ int (*port_callback)(struct ib_device *, u8, struct kobject *); -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html