[PATCH for-next 16/24] IB/hfi1: Add the dual leg stub code

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Kaike Wan <kaike.wan@xxxxxxxxx>

The "Second Leg" of the TID RDMA WRITE protocol deals with
the transfer of data and ack packets, which are in the KDETH
PSN space, as opposed to the IB PSN space.

Therefore, the Second Leg could be considered as a separate
state machine. As such, it is handled by a different work
queue item which is scheduled along with the normal IB state
machine work item.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@xxxxxxxxx>
Signed-off-by: Mitko Haralanov <mitko.haralanov@xxxxxxxxx>
Signed-off-by: Kaike Wan <kaike.wan@xxxxxxxxx>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@xxxxxxxxx>
---
 drivers/infiniband/hw/hfi1/iowait.h   |   12 +++
 drivers/infiniband/hw/hfi1/qp.c       |   34 +++++++-
 drivers/infiniband/hw/hfi1/qp.h       |    1 
 drivers/infiniband/hw/hfi1/ruc.c      |   31 ++++++-
 drivers/infiniband/hw/hfi1/tid_rdma.c |  141 +++++++++++++++++++++++++++++++++
 drivers/infiniband/hw/hfi1/tid_rdma.h |    3 +
 drivers/infiniband/hw/hfi1/trace_tx.h |    8 ++
 drivers/infiniband/hw/hfi1/verbs.h    |    5 +
 8 files changed, 223 insertions(+), 12 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h
index c20a093..1a09d8f 100644
--- a/drivers/infiniband/hw/hfi1/iowait.h
+++ b/drivers/infiniband/hw/hfi1/iowait.h
@@ -186,6 +186,18 @@ static inline bool iowait_schedule(struct iowait *wait,
 }
 
 /**
+ * iowait_tid_schedule - schedule the tid SE
+ * @wait: the iowait structure
+ * @wq: the work queue
+ * @cpu: the cpu
+ */
+static inline bool iowait_tid_schedule(struct iowait *wait,
+				       struct workqueue_struct *wq, int cpu)
+{
+	return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_TID_SE].iowork);
+}
+
+/**
  * iowait_sdma_drain() - wait for DMAs to drain
  * @wait: iowait structure
  *
diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
index e7d74ea..0be3def 100644
--- a/drivers/infiniband/hw/hfi1/qp.c
+++ b/drivers/infiniband/hw/hfi1/qp.c
@@ -414,6 +414,11 @@ static void hfi1_qp_schedule(struct rvt_qp *qp)
 		if (ret)
 			iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
 	}
+	if (iowait_flag_set(&priv->s_iowait, IOWAIT_PENDING_TID)) {
+		ret = hfi1_schedule_tid_send(qp);
+		if (ret)
+			iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+	}
 }
 
 void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag)
@@ -433,8 +438,27 @@ void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag)
 
 void hfi1_qp_unbusy(struct rvt_qp *qp, struct iowait_work *wait)
 {
-	if (iowait_set_work_flag(wait) == IOWAIT_IB_SE)
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	if (iowait_set_work_flag(wait) == IOWAIT_IB_SE) {
 		qp->s_flags &= ~RVT_S_BUSY;
+		/*
+		 * If we are sending a first-leg packet from the second leg,
+		 * we need to clear the busy flag from priv->s_flags to
+		 * avoid a race condition when the qp wakes up before
+		 * the call to hfi1_verbs_send() returns to the second
+		 * leg. In that case, the second leg will terminate without
+		 * being re-scheduled, resulting in failure to send TID RDMA
+		 * WRITE DATA and TID RDMA ACK packets.
+		 */
+		if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
+			priv->s_flags &= ~(HFI1_S_TID_BUSY_SET |
+					   RVT_S_BUSY);
+			iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+		}
+	} else {
+		priv->s_flags &= ~RVT_S_BUSY;
+	}
 }
 
 static int iowait_sleep(
@@ -679,7 +703,7 @@ void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter)
 		&priv->s_iowait,
 		1,
 		_hfi1_do_send,
-		NULL,
+		_hfi1_do_tid_send,
 		iowait_sleep,
 		iowait_wakeup,
 		iowait_sdma_drained);
@@ -833,7 +857,8 @@ void notify_error_qp(struct rvt_qp *qp)
 	if (lock) {
 		write_seqlock(lock);
 		if (!list_empty(&priv->s_iowait.list) &&
-		    !(qp->s_flags & RVT_S_BUSY)) {
+		    !(qp->s_flags & RVT_S_BUSY) &&
+		    !(priv->s_flags & RVT_S_BUSY)) {
 			qp->s_flags &= ~RVT_S_ANY_WAIT_IO;
 			list_del_init(&priv->s_iowait.list);
 			priv->s_iowait.lock = NULL;
@@ -842,7 +867,8 @@ void notify_error_qp(struct rvt_qp *qp)
 		write_sequnlock(lock);
 	}
 
-	if (!(qp->s_flags & RVT_S_BUSY)) {
+	if (!(qp->s_flags & RVT_S_BUSY) && !(priv->s_flags & RVT_S_BUSY)) {
+		qp->s_hdrwords = 0;
 		if (qp->s_rdma_mr) {
 			rvt_put_mr(qp->s_rdma_mr);
 			qp->s_rdma_mr = NULL;
diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h
index ce25a27..3cb1041 100644
--- a/drivers/infiniband/hw/hfi1/qp.h
+++ b/drivers/infiniband/hw/hfi1/qp.h
@@ -78,6 +78,7 @@
 
 #define HFI1_S_ANY_WAIT_IO (RVT_S_ANY_WAIT_IO | HFI1_S_WAIT_PIO_DRAIN)
 #define HFI1_S_ANY_WAIT (HFI1_S_ANY_WAIT_IO | RVT_S_ANY_WAIT_SEND)
+#define HFI1_S_ANY_TID_WAIT_SEND (RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_DMA)
 
 /*
  * Send if not busy or waiting for I/O and either
diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c
index 400227c..081a7cf 100644
--- a/drivers/infiniband/hw/hfi1/ruc.c
+++ b/drivers/infiniband/hw/hfi1/ruc.c
@@ -780,8 +780,8 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
 #define SEND_RESCHED_TIMEOUT (5 * HZ)  /* 5s in jiffies */
 
 /**
- * schedule_send_yield - test for a yield required for QP send engine
- * @timeout: Final time for timeout slice for jiffies
+ * hfi1_schedule_send_yield - test for a yield required for QP
+ * send engine
  * @qp: a pointer to QP
  * @ps: a pointer to a structure with commonly lookup values for
  *      the the send engine progress
@@ -792,8 +792,8 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
  * returns true if a yield is required, otherwise, false
  * is returned.
  */
-static bool schedule_send_yield(struct rvt_qp *qp,
-				struct hfi1_pkt_state *ps)
+bool hfi1_schedule_send_yield(struct rvt_qp *qp,  struct hfi1_pkt_state *ps,
+			      bool tid)
 {
 	ps->pkts_sent = true;
 
@@ -801,8 +801,23 @@ static bool schedule_send_yield(struct rvt_qp *qp,
 		if (!ps->in_thread ||
 		    workqueue_congested(ps->cpu, ps->ppd->hfi1_wq)) {
 			spin_lock_irqsave(&qp->s_lock, ps->flags);
-			qp->s_flags &= ~RVT_S_BUSY;
-			hfi1_schedule_send(qp);
+			if (!tid) {
+				qp->s_flags &= ~RVT_S_BUSY;
+				hfi1_schedule_send(qp);
+			} else {
+				struct hfi1_qp_priv *priv = qp->priv;
+
+				if (priv->s_flags &
+				    HFI1_S_TID_BUSY_SET) {
+					qp->s_flags &= ~RVT_S_BUSY;
+					priv->s_flags &=
+						~(HFI1_S_TID_BUSY_SET |
+						  RVT_S_BUSY);
+				} else {
+					priv->s_flags &= ~RVT_S_BUSY;
+				}
+				hfi1_schedule_tid_send(qp);
+			}
 			spin_unlock_irqrestore(&qp->s_lock, ps->flags);
 			this_cpu_inc(*ps->ppd->dd->send_schedule);
 			trace_hfi1_rc_expired_time_slice(qp, true);
@@ -903,6 +918,8 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
 	do {
 		/* Check for a constructed packet to be sent. */
 		if (ps.s_txreq) {
+			if (priv->s_flags & HFI1_S_TID_BUSY_SET)
+				qp->s_flags |= RVT_S_BUSY;
 			spin_unlock_irqrestore(&qp->s_lock, ps.flags);
 			/*
 			 * If the packet cannot be sent now, return and
@@ -912,7 +929,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
 				return;
 
 			/* allow other tasks to run */
-			if (schedule_send_yield(qp, &ps))
+			if (hfi1_schedule_send_yield(qp, &ps, false))
 				return;
 
 			spin_lock_irqsave(&qp->s_lock, ps.flags);
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
index 207ad9a..f18bab0 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.c
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@@ -152,6 +152,8 @@ static u32 mask_generation(u32 a)
 
 static u32 tid_rdma_flow_wt;
 
+static void hfi1_do_tid_send(struct rvt_qp *qp);
+
 static struct tid_rdma_flow *find_flow_ib(struct tid_rdma_request *req,
 					  u32 psn, u16 *fidx)
 {
@@ -180,6 +182,17 @@ static u8 trdma_pset_order(struct tid_rdma_pageset *s)
 	return ilog2(count) + 1;
 }
 
+static int hfi1_send_tid_ok(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	return !(priv->s_flags & RVT_S_BUSY ||
+		 qp->s_flags & HFI1_S_ANY_WAIT_IO) &&
+		(verbs_txreq_queued(iowait_get_tid_work(&priv->s_iowait)) ||
+		 (priv->s_flags & RVT_S_RESP_PENDING) ||
+		 !(qp->s_flags & HFI1_S_ANY_TID_WAIT_SEND));
+}
+
 static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p)
 {
 	return
@@ -2059,6 +2072,52 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
 	return true;
 }
 
+static bool _hfi1_schedule_tid_send(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+	struct hfi1_ibport *ibp =
+		to_iport(qp->ibqp.device, qp->port_num);
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+	struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+
+	return iowait_tid_schedule(&priv->s_iowait, ppd->hfi1_wq,
+				   priv->s_sde ?
+				   priv->s_sde->cpu :
+				   cpumask_first(cpumask_of_node(dd->node)));
+}
+
+/**
+ * hfi1_schedule_tid_send - schedule progress on TID RDMA state machine
+ * @qp: the QP
+ *
+ * This schedules qp progress on the TID RDMA state machine. Caller
+ * should hold the s_lock.
+ * Unlike hfi1_schedule_send(), this cannot use hfi1_send_ok() because
+ * the two state machines can step on each other with respect to the
+ * RVT_S_BUSY flag.
+ * Therefore, a modified test is used.
+ * @return true if the second leg is scheduled;
+ *  false if the second leg is not scheduled.
+ */
+bool hfi1_schedule_tid_send(struct rvt_qp *qp)
+{
+	lockdep_assert_held(&qp->s_lock);
+	if (hfi1_send_tid_ok(qp)) {
+	/*
+	 * The following call returns true if the qp is not on the
+	 * queue and false if the qp is already on the queue before
+	 * this call. Either way, the qp will be on the queue when the
+	 * call returns.
+	 */
+		_hfi1_schedule_tid_send(qp);
+		return true;
+	}
+	if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
+		iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait,
+				IOWAIT_PENDING_TID);
+	return false;
+}
+
 /**
  * qp_to_rcd - determine the receive context used by a qp
  * @qp - the qp
@@ -2204,6 +2263,12 @@ u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry,
 	return dd->verbs_dev.n_tidwait;
 }
 
+static int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
+	__must_hold(&qp->s_lock)
+{
+	return 0;
+}
+
 /*
  *  "Rewind" the TID request information.
  *  This means that we reset the state back to ACTIVE,
@@ -2265,6 +2330,82 @@ void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
 	req->state = TID_REQUEST_ACTIVE;
 }
 
+void _hfi1_do_tid_send(struct work_struct *work)
+{
+	struct iowait_work *w = container_of(work, struct iowait_work, iowork);
+	struct rvt_qp *qp = iowait_to_qp(w->iow);
+
+	hfi1_do_tid_send(qp);
+}
+
+static void hfi1_do_tid_send(struct rvt_qp *qp)
+{
+	struct hfi1_pkt_state ps;
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	ps.dev = to_idev(qp->ibqp.device);
+	ps.ibp = to_iport(qp->ibqp.device, qp->port_num);
+	ps.ppd = ppd_from_ibp(ps.ibp);
+	ps.wait = iowait_get_tid_work(&priv->s_iowait);
+	ps.in_thread = false;
+	ps.timeout_int = qp->timeout_jiffies / 8;
+
+	trace_hfi1_rc_do_tid_send(qp, false);
+
+	spin_lock_irqsave(&qp->s_lock, ps.flags);
+
+	/* Return if we are already busy processing a work request. */
+	if (!hfi1_send_tid_ok(qp)) {
+		if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
+			iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+		spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+		return;
+	}
+
+	priv->s_flags |= RVT_S_BUSY;
+
+	ps.timeout = jiffies + ps.timeout_int;
+	ps.cpu = priv->s_sde ? priv->s_sde->cpu :
+		cpumask_first(cpumask_of_node(ps.ppd->dd->node));
+	ps.pkts_sent = false;
+
+	/* insure a pre-built packet is handled  */
+	ps.s_txreq = get_waiting_verbs_txreq(ps.wait);
+	do {
+		/* Check for a constructed packet to be sent. */
+		if (ps.s_txreq) {
+			if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
+				qp->s_flags |= RVT_S_BUSY;
+				ps.wait = iowait_get_ib_work(&priv->s_iowait);
+			}
+			spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+
+			/*
+			 * If the packet cannot be sent now, return and
+			 * the send tasklet will be woken up later.
+			 */
+			if (hfi1_verbs_send(qp, &ps))
+				return;
+
+			/* allow other tasks to run */
+			if (hfi1_schedule_send_yield(qp, &ps, true))
+				return;
+
+			spin_lock_irqsave(&qp->s_lock, ps.flags);
+			if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
+				qp->s_flags &= ~RVT_S_BUSY;
+				priv->s_flags &= ~HFI1_S_TID_BUSY_SET;
+				ps.wait = iowait_get_tid_work(&priv->s_iowait);
+				if (iowait_flag_set(&priv->s_iowait,
+						    IOWAIT_PENDING_IB))
+					hfi1_schedule_send(qp);
+			}
+		}
+	} while (hfi1_make_tid_rdma_pkt(qp, &ps));
+	iowait_starve_clear(ps.pkts_sent, &priv->s_iowait);
+	spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+}
+
 u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe,
 				    struct ib_other_headers *ohdr, u32 *bth1,
 				    u32 *bth2, u32 *len)
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h
index ad463b1..77b0ea9 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.h
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.h
@@ -285,6 +285,8 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
 			      struct hfi1_pportdata *ppd,
 			      struct hfi1_packet *packet);
 
+bool hfi1_schedule_tid_send(struct rvt_qp *qp);
+
 int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
 		      struct ib_qp_init_attr *init_attr);
 void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp);
@@ -292,6 +294,7 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
 void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
 			       u32 *bth2);
 
+void _hfi1_do_tid_send(struct work_struct *work);
 void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p);
 
 u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe,
diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h
index c57af3b..a050bc2 100644
--- a/drivers/infiniband/hw/hfi1/trace_tx.h
+++ b/drivers/infiniband/hw/hfi1/trace_tx.h
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015 - 2017 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -839,6 +839,12 @@
 );
 
 DEFINE_EVENT(
+	hfi1_do_send_template, hfi1_rc_do_tid_send,
+	TP_PROTO(struct rvt_qp *qp, bool flag),
+	TP_ARGS(qp, flag)
+);
+
+DEFINE_EVENT(/* expired time slice */
 	hfi1_do_send_template, hfi1_rc_expired_time_slice,
 	TP_PROTO(struct rvt_qp *qp, bool flag),
 	TP_ARGS(qp, flag)
diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h
index ea5d31a..d3aba66 100644
--- a/drivers/infiniband/hw/hfi1/verbs.h
+++ b/drivers/infiniband/hw/hfi1/verbs.h
@@ -172,7 +172,9 @@ struct hfi1_qp_priv {
 	u8 hdr_type; /* 9B or 16B */
 	unsigned long tid_timer_timeout_jiffies;
 	unsigned long tid_retry_timeout_jiffies;
+	/* variables for the TID RDMA SE state machine */
 	u8 s_retry;
+	u32 s_flags;
 	/* For TID RDMA READ */
 	u32 tid_r_reqs;         /* Num of tid reads requested */
 	u32 tid_r_comp;         /* Num of tid reads completed */
@@ -414,6 +416,9 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
 			  u32 bth0, u32 bth1, u32 bth2, int middle,
 			  struct hfi1_pkt_state *ps);
 
+bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+			      bool tid);
+
 void _hfi1_do_send(struct work_struct *work);
 
 void hfi1_do_send_from_rvt(struct rvt_qp *qp);

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux