[PATCH for-next 23/24] IB/hfi1: Enable TID RDMA protocol

From: Kaike Wan <kaike.wan@xxxxxxxxx>

This patch enables the TID RDMA protocol by internally converting a
qualified RDMA READ/WRITE request into a TID RDMA READ/WRITE request:
(1) The TID RDMA capability must be enabled;
(2) The request must start on a 4K page boundary; for reads, all
receive buffers must also start on 4K page boundaries;
(3) The request length must be a multiple of 4K and must be greater
than or equal to 256K; for reads, each receive buffer length must be a
multiple of 4K. An illustrative sketch of checks (2) and (3) follows.
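
The sketch below is illustrative only and is not part of the patch: a
standalone C helper expressing rules (2) and (3) above. The function
and constant names (tid_rdma_qualifies, TID_PAGE_SIZE, TID_MIN_REQ_LEN,
struct tid_sge) are assumptions made up for this example; the in-tree
checks live in hfi1_check_sge_align() and setup_tid_rdma_wqe() in the
diff below.

#include <stdbool.h>
#include <stdint.h>

#define TID_PAGE_SIZE   4096U          /* 4K page assumed by TID RDMA */
#define TID_MIN_REQ_LEN (256U * 1024U) /* 256K minimum request length */

/* Hypothetical stand-in for the driver's rvt_sge */
struct tid_sge {
        uint64_t vaddr;
        uint32_t length;
};

static bool tid_rdma_qualifies(uint64_t remote_addr, uint32_t req_len,
                               const struct tid_sge *sge, int num_sge,
                               bool is_read)
{
        int i;

        /* (2) the request must start on a 4K page boundary */
        if (remote_addr & (TID_PAGE_SIZE - 1))
                return false;
        /* (3) the length must be a 4K multiple of at least 256K */
        if ((req_len & (TID_PAGE_SIZE - 1)) || req_len < TID_MIN_REQ_LEN)
                return false;
        /*
         * For reads, every receive buffer must start on a 4K
         * boundary and be a 4K multiple in length.
         */
        if (is_read) {
                for (i = 0; i < num_sge; i++, sge++)
                        if ((sge->vaddr & (TID_PAGE_SIZE - 1)) ||
                            (sge->length & (TID_PAGE_SIZE - 1)))
                                return false;
        }
        return true;
}

A request that fails any of these tests simply continues as an
ordinary IB_WR_RDMA_READ/IB_WR_RDMA_WRITE, so the conversion is
transparent to the poster of the work request.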

Signed-off-by: Mitko Haralanov <mitko.haralanov@xxxxxxxxx>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@xxxxxxxxx>
Signed-off-by: Kaike Wan <kaike.wan@xxxxxxxxx>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@xxxxxxxxx>
---
 drivers/infiniband/hw/hfi1/qp.c       |   16 ++++--
 drivers/infiniband/hw/hfi1/tid_rdma.c |   82 +++++++++++++++++++++++++++++++++
 drivers/infiniband/hw/hfi1/tid_rdma.h |   11 ++++
 drivers/infiniband/hw/hfi1/verbs.c    |    2 -
 drivers/infiniband/hw/hfi1/verbs.h    |    2 -
 drivers/infiniband/hw/qib/qib_verbs.c |    2 -
 drivers/infiniband/sw/rdmavt/qp.c     |   32 +++++++++----
 include/rdma/rdma_vt.h                |   18 ++++++-
 8 files changed, 142 insertions(+), 23 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
index c762a98..bf2275e 100644
--- a/drivers/infiniband/hw/hfi1/qp.c
+++ b/drivers/infiniband/hw/hfi1/qp.c
@@ -304,26 +304,28 @@ void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
 }
 
 /**
- * hfi1_check_send_wqe - validate wqe
+ * hfi1_setup_wqe - setup the wqe
  * @qp - The qp
  * @wqe - The built wqe
  *
- * validate wqe.  This is called
- * prior to inserting the wqe into
- * the ring but after the wqe has been
- * setup.
+ * Perform setup of the wqe.  This is called
+ * prior to inserting the wqe into the ring, but after
+ * the wqe has been set up by RDMAVT. This function
+ * allows the driver the opportunity to perform
+ * validation and additional setup of the wqe.
  *
  * Returns 0 on success, -EINVAL on failure
  *
  */
-int hfi1_check_send_wqe(struct rvt_qp *qp,
-			struct rvt_swqe *wqe)
+int hfi1_setup_wqe(struct rvt_qp *qp,
+		   struct rvt_swqe *wqe)
 {
 	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
 	struct rvt_ah *ah;
 
 	switch (qp->ibqp.qp_type) {
 	case IB_QPT_RC:
+		hfi1_setup_tid_rdma_wqe(qp, wqe);
 	case IB_QPT_UC:
 		if (wqe->length > 0x80000000U)
 			return -EINVAL;
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
index 932ef05..d27338d 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.c
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@@ -4009,6 +4009,88 @@ static void hfi1_tid_retry_timeout(struct timer_list *t)
 	spin_unlock_irqrestore(&qp->r_lock, flags);
 }
 
+/* Does @sge meet the alignment requirements for tid rdma? */
+static inline bool hfi1_check_sge_align(struct rvt_sge *sge, int num_sge)
+{
+	int i;
+
+	for (i = 0; i < num_sge; i++, sge++)
+		if ((u64)sge->vaddr & ~PAGE_MASK ||
+		    sge->sge_length & ~PAGE_MASK)
+			return false;
+	return true;
+}
+
+void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+	struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
+	struct hfi1_swqe_priv *priv = wqe->priv;
+	struct tid_rdma_params *remote;
+	bool do_tid_rdma = false;
+
+	rcu_read_lock();
+	remote = rcu_dereference(qpriv->tid_rdma.remote);
+	/*
+	 * If TID RDMA is disabled by the negotiation, don't
+	 * use it.
+	 */
+	if (!remote)
+		goto exit;
+
+	if (wqe->wr.opcode == IB_WR_RDMA_READ) {
+		if (hfi1_check_sge_align(&wqe->sg_list[0], wqe->wr.num_sge) &&
+		    qpriv->tid_rdma.n_read < remote->max_read) {
+			wqe->wr.opcode = IB_WR_TID_RDMA_READ;
+			do_tid_rdma = true;
+			priv->tid_req.n_flows = remote->max_read;
+			qpriv->tid_r_reqs++;
+		}
+	} else if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
+		/*
+		 * TID RDMA is enabled for this RDMA WRITE request iff:
+		 *   1. The remote address is page-aligned,
+		 *   2. The length is larger than the minimum segment size,
+		 *   3. The length is page-multiple, and
+		 *   4. There are available TID RDMA WRITEs
+		 *      on the remote end.
+		 */
+		if (!(wqe->rdma_wr.remote_addr & ~PAGE_MASK) &&
+		    !(wqe->length & ~PAGE_MASK) &&
+		    qpriv->tid_rdma.n_write < remote->max_write) {
+			wqe->wr.opcode = IB_WR_TID_RDMA_WRITE;
+			do_tid_rdma = true;
+		}
+	}
+
+	if (do_tid_rdma) {
+		priv->tid_req.seg_len =
+			min_t(u32, remote->max_len, wqe->length);
+		priv->tid_req.total_segs =
+			DIV_ROUND_UP(wqe->length, priv->tid_req.seg_len);
+		/* Compute the last PSN of the request */
+		wqe->lpsn = wqe->psn;
+		if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+			wqe->lpsn += rvt_div_round_up_mtu(qp, wqe->length) - 1;
+		} else {
+			wqe->lpsn += priv->tid_req.total_segs - 1;
+			atomic_inc(&qpriv->n_requests);
+		}
+		priv->tid_req.cur_seg = 0;
+		priv->tid_req.comp_seg = 0;
+		priv->tid_req.ack_seg = 0;
+		priv->tid_req.state = TID_REQUEST_INACTIVE;
+		/*
+		 * Reset acked_tail.
+		 * TID RDMA READ does not have ACKs so it does not
+		 * update the pointer. We have to reset it so TID RDMA
+		 * WRITE does not get confused.
+		 */
+		priv->tid_req.acked_tail = priv->tid_req.setup_head;
+	}
+exit:
+	rcu_read_unlock();
+}
+
 static bool _hfi1_schedule_tid_send(struct rvt_qp *qp)
 {
 	struct hfi1_qp_priv *priv = qp->priv;
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h
index 9a0c353..85696f6 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.h
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.h
@@ -299,6 +299,17 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
 void hfi1_add_tid_retry_timer(struct rvt_qp *qp);
 void hfi1_del_tid_retry_timer(struct rvt_qp *qp);
 
+void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
+static inline void hfi1_setup_tid_rdma_wqe(struct rvt_qp *qp,
+					   struct rvt_swqe *wqe)
+{
+	if (wqe->priv &&
+	    (wqe->wr.opcode == IB_WR_RDMA_READ ||
+	     wqe->wr.opcode == IB_WR_RDMA_WRITE) &&
+	    wqe->length >= TID_RDMA_MIN_SEGMENT_SIZE)
+		setup_tid_rdma_wqe(qp, wqe);
+}
+
 bool hfi1_schedule_tid_send(struct rvt_qp *qp);
 
 int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
index 8bebf6b..d0eb34d 100644
--- a/drivers/infiniband/hw/hfi1/verbs.c
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -2134,9 +2134,9 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
 	dd->verbs_dev.rdi.driver_f.check_modify_qp = hfi1_check_modify_qp;
 	dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp;
 	dd->verbs_dev.rdi.driver_f.notify_restart_rc = hfi1_restart_rc;
-	dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe;
 	dd->verbs_dev.rdi.driver_f.comp_vect_cpu_lookup =
 						hfi1_comp_vect_mappings_lookup;
+	dd->verbs_dev.rdi.driver_f.setup_wqe = hfi1_setup_wqe;
 
 	/* completeion queue */
 	dd->verbs_dev.rdi.ibdev.num_comp_vectors = dd->comp_vect_possible_cpus;
diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h
index 8ef9ae6..28a7610 100644
--- a/drivers/infiniband/hw/hfi1/verbs.h
+++ b/drivers/infiniband/hw/hfi1/verbs.h
@@ -433,7 +433,7 @@ int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
 void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
 		    int attr_mask, struct ib_udata *udata);
 void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait);
-int hfi1_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
+int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
 
 extern const u32 rc_only_opcode;
 extern const u32 uc_only_opcode;
diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c
index a679940..26ab78e 100644
--- a/drivers/infiniband/hw/qib/qib_verbs.c
+++ b/drivers/infiniband/hw/qib/qib_verbs.c
@@ -1588,7 +1588,7 @@ int qib_register_ib_device(struct qib_devdata *dd)
 	dd->verbs_dev.rdi.driver_f.port_callback = qib_create_port_files;
 	dd->verbs_dev.rdi.driver_f.get_pci_dev = qib_get_pci_dev;
 	dd->verbs_dev.rdi.driver_f.check_ah = qib_check_ah;
-	dd->verbs_dev.rdi.driver_f.check_send_wqe = qib_check_send_wqe;
+	dd->verbs_dev.rdi.driver_f.setup_wqe = qib_check_send_wqe;
 	dd->verbs_dev.rdi.driver_f.notify_new_ah = qib_notify_new_ah;
 	dd->verbs_dev.rdi.driver_f.alloc_qpn = qib_alloc_qpn;
 	dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qib_qp_priv_alloc;
diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index 67fbe72..f0fe058 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -1831,15 +1831,11 @@ static int rvt_post_one_wr(struct rvt_qp *qp,
 		wqe->wr.num_sge = j;
 	}
 
-	/* general part of wqe valid - allow for driver checks */
-	if (rdi->driver_f.check_send_wqe) {
-		ret = rdi->driver_f.check_send_wqe(qp, wqe);
-		if (ret < 0)
-			goto bail_inval_free;
-		if (ret)
-			*call_send = ret;
-	}
-
+	/*
+	 * Calculate and set SWQE PSN values prior to handing it off
+	 * to the driver's check routine. This gives the driver the
+	 * opportunity to adjust PSN values based on internal checks.
+	 */
 	log_pmtu = qp->log_pmtu;
 	if (qp->ibqp.qp_type != IB_QPT_UC &&
 	    qp->ibqp.qp_type != IB_QPT_RC) {
@@ -1864,8 +1860,20 @@ static int rvt_post_one_wr(struct rvt_qp *qp,
 				(wqe->length ?
 					((wqe->length - 1) >> log_pmtu) :
 					0);
-		qp->s_next_psn = wqe->lpsn + 1;
 	}
+
+	/* general part of wqe valid - allow for driver checks */
+	if (rdi->driver_f.setup_wqe) {
+		ret = rdi->driver_f.setup_wqe(qp, wqe);
+		if (ret < 0)
+			goto bail_inval_free_ref;
+		if (ret)
+			*call_send = ret;
+	}
+
+	if (!(rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL))
+		qp->s_next_psn = wqe->lpsn + 1;
+
 	if (unlikely(reserved_op)) {
 		wqe->wr.send_flags |= RVT_SEND_RESERVE_USED;
 		rvt_qp_wqe_reserve(qp, wqe);
@@ -1879,6 +1887,10 @@ static int rvt_post_one_wr(struct rvt_qp *qp,
 
 	return 0;
 
+bail_inval_free_ref:
+	if (qp->ibqp.qp_type != IB_QPT_UC &&
+	    qp->ibqp.qp_type != IB_QPT_RC)
+		atomic_dec(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount);
 bail_inval_free:
 	/* release mr holds */
 	while (j) {
diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h
index b1c3448..762bf52 100644
--- a/include/rdma/rdma_vt.h
+++ b/include/rdma/rdma_vt.h
@@ -203,7 +203,11 @@ struct rvt_driver_provided {
 	 * check_support() for details.
 	 */
 
-	/* hot path calldowns in a single cacheline */
+	/*
+	 * hot path calldowns in a single cacheline
+	 *
+	 * Add data path calls below
+	 */
 
 	/*
 	 * Give the driver a notice that there is send work to do. It is up to
@@ -215,8 +219,14 @@ struct rvt_driver_provided {
 	bool (*schedule_send)(struct rvt_qp *qp);
 	bool (*schedule_send_no_lock)(struct rvt_qp *qp);
 
-	/* Driver specific work request checking */
-	int (*check_send_wqe)(struct rvt_qp *qp, struct rvt_swqe *wqe);
+	/*
+	 * Driver specific work request setup and checking.
+	 * This function is allowed to perform any setup, checks, or
+	 * adjustments required to the SWQE in order to be usable by
+	 * underlying protocols. This includes private data structure
+	 * allocations.
+	 */
+	int (*setup_wqe)(struct rvt_qp *qp, struct rvt_swqe *wqe);
 
 	/*
 	 * Sometimes rdmavt needs to kick the driver's send progress. That is
@@ -224,6 +234,8 @@ struct rvt_driver_provided {
 	 */
 	void (*do_send)(struct rvt_qp *qp);
 
+	/* end of hot path calldowns */
+
 	/* Passed to ib core registration. Callback to create syfs files */
 	int (*port_callback)(struct ib_device *, u8, struct kobject *);
 
