From: Kaike Wan <kaike.wan@xxxxxxxxx> The hfi1 hardware flow is a hardware flow-control mechanism for a KDETH data packet that is received on a hfi1 port. It validates the packet by checking both the generation and sequence. Each QP that uses the TID RDMA mechanism will allocate a hardware flow from its receiving context for any incoming KDETH data packets. This patch implements: (1) functions to allocate hardware flow (2) functions to free hardware flow (3) a function to initialize hardware flow generation for a receiving context (4) functions to allocate software flows for requests when qpriv is initialized. Software flows are used to track hardware flow information for each data segment of a TID RDMA request (5) functions to free software flows for requests when qpriv is freed Signed-off-by: Mitko Haralanov <mitko.haralanov@xxxxxxxxx> Signed-off-by: Ashutosh Dixit <ashutosh.dixit@xxxxxxxxx> Signed-off-by: Mike Marciniszyn <mike.marciniszyn@xxxxxxxxx> Signed-off-by: Kaike Wan <kaike.wan@xxxxxxxxx> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@xxxxxxxxx> --- drivers/infiniband/hw/hfi1/init.c | 11 +- drivers/infiniband/hw/hfi1/tid_rdma.c | 238 +++++++++++++++++++++++++++++++++ 2 files changed, 247 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 758d273..bdd2ad6 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -67,7 +67,7 @@ #include "aspm.h" #include "affinity.h" #include "vnic.h" -#include "exp_rcv.h" +#include "user_exp_rcv.h" #undef pr_fmt #define pr_fmt(fmt) DRIVER_NAME ": " fmt @@ -369,6 +369,9 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa, rcd->rhf_rcv_function_map = normal_rhf_rcv_functions; mutex_init(&rcd->exp_mutex); + spin_lock_init(&rcd->exp_lock); + INIT_LIST_HEAD(&rcd->flow_queue.queue_head); + INIT_LIST_HEAD(&rcd->rarr_queue.queue_head); hfi1_cdbg(PROC, "setting up context %u\n", rcd->ctxt); @@ -471,6 +474,9 @@ 
int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa, GFP_KERNEL, numa); if (!rcd->opstats) goto bail; + + /* Initialize TID flow generations for the context */ + hfi1_kern_init_ctxt_generations(rcd); } *context = rcd; @@ -773,6 +779,8 @@ static void enable_chip(struct hfi1_devdata *dd) rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB; if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_EGR_FULL)) rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB; + if (HFI1_CAP_IS_KSET(TID_RDMA)) + rcvmask |= HFI1_RCVCTRL_TIDFLOW_ENB; hfi1_rcvctrl(dd, rcvmask, rcd); sc_enable(rcd->sc); hfi1_rcd_put(rcd); @@ -1474,6 +1482,7 @@ static int __init hfi1_mod_init(void) /* sanitize link CRC options */ link_crc_mask &= SUPPORTED_CRCS; + hfi1_compute_tid_rdma_flow_wt(); /* * These must be called before the driver is registered with * the PCI subsystem. diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index aefa23c..c8e2c40 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -53,6 +53,7 @@ #include "tid_rdma.h" #include "user_exp_rcv.h" #include "trace.h" +#include <rdma/ib_umem.h> /** * DOC: TID RDMA READ protocol */ @@ -149,6 +150,8 @@ static u32 mask_generation(u32 a) * C - Capcode */ +static u32 tid_rdma_flow_wt; + static struct tid_rdma_flow *find_flow_ib(struct tid_rdma_request *req, u32 psn, u16 *fidx) { @@ -319,14 +322,161 @@ static void tid_rdma_trigger_resume(struct work_struct *work) { } +void hfi1_compute_tid_rdma_flow_wt(void) +{ + /* + * Heuristic for computing the RNR timeout when waiting on the flow + * queue. Rather than a computationally expensive exact estimate of when + * a flow will be available, we assume that if a QP is at position N in + * the flow queue it has to wait approximately (N + 1) * (number of + * segments between two sync points), assuming PMTU of 4K. The rationale + * for this is that flows are released and recycled at each sync point. 
+ */ + tid_rdma_flow_wt = MAX_TID_FLOW_PSN * enum_to_mtu(OPA_MTU_4096) / + TID_RDMA_MAX_SEGMENT_SIZE; +} + +/** + * kern_reserve_flow - allocate a hardware flow + * @rcd - the context to use for allocation + * @last - the index of the preferred flow. Use RXE_NUM_TID_FLOWS to + * signify "don't care". + * + * Use a bit mask based allocation to reserve a hardware + * flow for use in receiving KDETH data packets. If a preferred flow is + * specified the function will attempt to reserve that flow again, if + * available. + * + * The exp_lock must be held. + * + * Return: + * On success: a positive value between 0 and RXE_NUM_TID_FLOWS - 1 + * On failure: -EAGAIN + */ +static int kern_reserve_flow(struct hfi1_ctxtdata *rcd, int last) + __must_hold(&rcd->exp_lock) +{ + int nr; + + /* Attempt to reserve the preferred flow index */ + if (last >= 0 && last < RXE_NUM_TID_FLOWS && + !test_and_set_bit(last, &rcd->flow_mask)) + return last; + + nr = ffz(rcd->flow_mask); + BUILD_BUG_ON(RXE_NUM_TID_FLOWS >= + (sizeof(rcd->flow_mask) * BITS_PER_BYTE)); + if (nr > (RXE_NUM_TID_FLOWS - 1)) + return -EAGAIN; + set_bit(nr, &rcd->flow_mask); + return nr; +} + +static void kern_set_hw_flow(struct hfi1_ctxtdata *rcd, u32 generation, + u32 flow_idx) +{ + u64 reg; + + reg = ((u64)generation << HFI1_KDETH_BTH_SEQ_SHIFT) | + RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK | + RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK | + RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK | + RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK | + RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK; + + if (generation != KERN_GENERATION_RESERVED) + reg |= RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK; + + write_uctxt_csr(rcd->dd, rcd->ctxt, + RCV_TID_FLOW_TABLE + 8 * flow_idx, reg); +} + +static u32 kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx) + __must_hold(&rcd->exp_lock) +{ + u32 generation = rcd->flows[flow_idx].generation; + + kern_set_hw_flow(rcd, generation, flow_idx); + return generation; +} + +static u32 
kern_flow_generation_next(u32 gen) +{ + u32 generation = mask_generation(gen + 1); + + if (generation == KERN_GENERATION_RESERVED) + generation = mask_generation(generation + 1); + return generation; +} + +static void kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx) + __must_hold(&rcd->exp_lock) +{ + rcd->flows[flow_idx].generation = + kern_flow_generation_next(rcd->flows[flow_idx].generation); + kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, flow_idx); +} + static int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp) { + struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv; + struct tid_flow_state *fs = &qpriv->flow_state; + unsigned long flags; + int ret = 0; + + /* The QP already has an allocated flow */ + if (fs->index != RXE_NUM_TID_FLOWS) + return ret; + + spin_lock_irqsave(&rcd->exp_lock, flags); + + ret = kern_reserve_flow(rcd, fs->last_index); + if (ret < 0) + goto queue; + fs->index = ret; + fs->last_index = fs->index; + + /* Generation received in a RESYNC overrides default flow generation */ + if (fs->generation != KERN_GENERATION_RESERVED) + rcd->flows[fs->index].generation = fs->generation; + fs->generation = kern_setup_hw_flow(rcd, fs->index); + fs->psn = 0; + fs->flags = 0; + spin_unlock_irqrestore(&rcd->exp_lock, flags); + return 0; +queue: + spin_unlock_irqrestore(&rcd->exp_lock, flags); + return -EAGAIN; } void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp) { + struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv; + struct tid_flow_state *fs = &qpriv->flow_state; + unsigned long flags; + + if (fs->index >= RXE_NUM_TID_FLOWS) + return; + spin_lock_irqsave(&rcd->exp_lock, flags); + kern_clear_hw_flow(rcd, fs->index); + clear_bit(fs->index, &rcd->flow_mask); + fs->index = RXE_NUM_TID_FLOWS; + fs->psn = 0; + fs->generation = KERN_GENERATION_RESERVED; + + spin_unlock_irqrestore(&rcd->exp_lock, flags); +} + +void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd) +{ + int i; 
+ + for (i = 0; i < RXE_NUM_TID_FLOWS; i++) { + rcd->flows[i].generation = mask_generation(prandom_u32()); + kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, i); + } } static int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req, @@ -346,6 +496,71 @@ void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req) { } +static void hfi1_kern_exp_rcv_dealloc(struct tid_rdma_flow *flow) +{ + kfree(flow->fstate); + flow->fstate = NULL; +} + +static int hfi1_kern_exp_rcv_alloc(struct tid_rdma_flow *flow) +{ + flow->fstate = kzalloc(sizeof(*flow->fstate), GFP_ATOMIC); + if (!flow->fstate) + goto nomem; + + return 0; +nomem: + hfi1_kern_exp_rcv_dealloc(flow); + return -ENOMEM; +} + +/* Called at QP destroy time to free TID RDMA resources */ +static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req) +{ + int i; + + for (i = 0; req->flows && i < req->n_max_flows; i++) + hfi1_kern_exp_rcv_dealloc(&req->flows[i]); + + kfree(req->flows); + req->flows = NULL; + req->n_max_flows = 0; + req->n_flows = 0; +} + +/* + * This is called at QP create time to allocate resources for TID RDMA + * segments/flows. This is done to keep all required memory pre-allocated and + * avoid memory allocation in the data path. + */ +static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req) +{ + struct tid_rdma_flow *flows; + int i, ret; + u16 nflows; + + /* Size of the flow circular buffer is the next higher power of 2 */ + nflows = max_t(u16, TID_RDMA_MAX_READ_SEGS_PER_REQ, + TID_RDMA_MAX_WRITE_SEGS_PER_REQ); + req->n_max_flows = roundup_pow_of_two(nflows + 1); + flows = kcalloc(req->n_max_flows, sizeof(*flows), GFP_KERNEL); + if (!flows) { + ret = -ENOMEM; + goto err; + } + req->flows = flows; + + for (i = 0; i < req->n_max_flows; i++) { + ret = hfi1_kern_exp_rcv_alloc(&req->flows[i]); + if (ret) + goto err; + } + return 0; +err: + hfi1_kern_exp_rcv_free_flows(req); + return ret; +} + /* * Validate and accept the TID RDMA READ request parameters. 
* Return 0 if the request is accepted successfully; @@ -884,9 +1099,14 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, struct ib_qp_init_attr *init_attr) { struct hfi1_qp_priv *qpriv = qp->priv; - int i; + int i, ret; + BUILD_BUG_ON(TID_RDMA_MAX_SEGMENT_SIZE / PAGE_SIZE > U8_MAX); qpriv->rcd = qp_to_rcd(rdi, qp); + qpriv->flow_state.psn = 0; + qpriv->flow_state.index = RXE_NUM_TID_FLOWS; + qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS; + qpriv->flow_state.generation = KERN_GENERATION_RESERVED; spin_lock_init(&qpriv->opfn.lock); INIT_WORK(&qpriv->opfn.opfn_work, opfn_send_conn_request); @@ -912,6 +1132,12 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, if (!priv) return -ENOMEM; + ret = hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req); + if (ret) { + kfree(priv); + return ret; + } + /* * Initialize various TID RDMA request variables. * These variables are "static", which is why they @@ -934,6 +1160,12 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, if (!priv) return -ENOMEM; + ret = hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req); + if (ret) { + kfree(priv); + return ret; + } + priv->tid_req.qp = qp; priv->tid_req.rcd = qpriv->rcd; priv->tid_req.e.ack = &qp->s_ack_queue[i]; @@ -956,12 +1188,16 @@ void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp) wqe = rvt_get_swqe_ptr(qp, i); priv = wqe->priv; + if (priv) + hfi1_kern_exp_rcv_free_flows(&priv->tid_req); kfree(priv); wqe->priv = NULL; } for (i = 0; i < rvt_max_atomic(rdi); i++) { struct hfi1_ack_priv *priv = qp->s_ack_queue[i].priv; + if (priv) + hfi1_kern_exp_rcv_free_flows(&priv->tid_req); kfree(priv); qp->s_ack_queue[i].priv = NULL; } -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html