On 30/01/2019 02:19, Chris Wilson wrote:
Having introduced per-context seqno, we now have a means to identify
progress across the system without fear of rollback, as befell the
global_seqno. That is, we can program a MI_SEMAPHORE_WAIT operation in
advance of submission, safe in the knowledge that our target seqno and
address are stable.
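Concretely, the wait boils down to a four-dword MI_SEMAPHORE_WAIT
polling the signaler's per-context seqno slot in its HWSP, e.g. (this
mirrors the emit_semaphore_wait() hunk below):

	*cs++ = MI_SEMAPHORE_WAIT |
		MI_SEMAPHORE_GLOBAL_GTT |
		MI_SEMAPHORE_POLL |
		MI_SEMAPHORE_SAD_GTE_SDD;
	*cs++ = from->fence.seqno;		/* semaphore data: target seqno */
	*cs++ = from->timeline->hwsp_offset;	/* GGTT address of the seqno slot */
	*cs++ = 0;				/* upper 32 bits of the address */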
However, since we are telling the GPU to busy-spin on the target address
until it matches the signaling seqno, we only want to do so when we are
sure that busy-spin will be completed quickly. To achieve this we only
submit the request to HW once the signaler is itself executing (modulo
preemption causing us to wait longer), and we only do so for default and
above priority requests (so that idle priority tasks never themselves
hog the GPU waiting for others).
The busy-spin could still be milliseconds though. I think apart from media-bench saying
this is faster, we would need to look at performance per Watt as well.
RING_SEMA_WAIT_POLL is a potential tunable as well. Not that I have an
idea how to tune it.
Eventually, do we dare add this without a runtime switch? (There, I
mentioned the taboo.)
What about signal mode and handling this via context switches?
But what about AB-BA deadlocks? If you remove B, there can be no deadlock...
The issue is that with a deep ELSP queue, we can queue up a pair of
AB-BA semaphore waits on different engines, thus forming a classic
mutual-exclusion deadlock: for example, engine 0 executing a busy-wait
on a request queued behind the busy-wait currently executing on
engine 1, and vice versa, so neither signaler can ever run and neither
spinner can ever complete. We side-step that issue by restricting the
queue depth to avoid having multiple semaphores in flight, so we only
ever take one set of locks at a time.
Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
---
drivers/gpu/drm/i915/i915_request.c | 153 +++++++++++++++++++++-
drivers/gpu/drm/i915/i915_request.h | 1 +
drivers/gpu/drm/i915/i915_scheduler.c | 1 +
drivers/gpu/drm/i915/i915_scheduler.h | 1 +
drivers/gpu/drm/i915/i915_sw_fence.c | 4 +-
drivers/gpu/drm/i915/i915_sw_fence.h | 3 +
drivers/gpu/drm/i915/intel_gpu_commands.h | 5 +
drivers/gpu/drm/i915/intel_lrc.c | 14 +-
8 files changed, 178 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 07e4c3c68ecd..6d825cd28ae6 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -22,8 +22,9 @@
*
*/
-#include <linux/prefetch.h>
#include <linux/dma-fence-array.h>
+#include <linux/irq_work.h>
+#include <linux/prefetch.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/sched/signal.h>
@@ -326,6 +327,76 @@ void i915_request_retire_upto(struct i915_request *rq)
} while (tmp != rq);
}
+struct execute_cb {
+ struct list_head link;
+ struct irq_work work;
+ struct i915_sw_fence *fence;
+};
+
+static void irq_execute_cb(struct irq_work *wrk)
+{
+ struct execute_cb *cb = container_of(wrk, typeof(*cb), work);
+
+ i915_sw_fence_complete(cb->fence);
+ kfree(cb);
+}
+
+static void __notify_execute_cb(struct i915_request *rq)
+{
+ struct execute_cb *cb;
+
+ lockdep_assert_held(&rq->lock);
+
+ if (list_empty(&rq->execute_cb))
+ return;
+
+ list_for_each_entry(cb, &rq->execute_cb, link)
+ irq_work_queue(&cb->work);
+
+ /*
+ * XXX Rollback on __i915_request_unsubmit()
+ *
+ * In the future, perhaps when we have an active time-slicing scheduler,
+ * it will be interesting to unsubmit parallel execution and remove
+ * busywaits from the GPU until their master is restarted. This is
+ * quite hairy, we have to carefully rollback the fence and do a
+ * preempt-to-idle cycle on the target engine, all the while the
+ * master execute_cb may refire.
+ */
+ INIT_LIST_HEAD(&rq->execute_cb);
+}
+
+static int
+i915_request_await_execution(struct i915_request *rq,
+ struct i915_request *signal,
+ gfp_t gfp)
+{
+ struct execute_cb *cb;
+ unsigned long flags;
+
+ if (test_bit(I915_FENCE_FLAG_ACTIVE, &signal->fence.flags))
+ return 0;
+
+ cb = kmalloc(sizeof(*cb), gfp);
+ if (!cb)
+ return -ENOMEM;
+
+ cb->fence = &rq->submit;
+ i915_sw_fence_await(cb->fence);
+ init_irq_work(&cb->work, irq_execute_cb);
+
+ spin_lock_irqsave(&signal->lock, flags);
+ if (test_bit(I915_FENCE_FLAG_ACTIVE, &signal->fence.flags)) {
+ i915_sw_fence_complete(cb->fence);
+ kfree(cb);
+ } else {
+ list_add_tail(&cb->link, &signal->execute_cb);
+ }
+ spin_unlock_irqrestore(&signal->lock, flags);
+
+ return 0;
+}
+
static void move_to_timeline(struct i915_request *request,
struct i915_timeline *timeline)
{
@@ -373,6 +444,7 @@ void __i915_request_submit(struct i915_request *request)
if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags) &&
!i915_request_enable_breadcrumb(request))
intel_engine_queue_breadcrumbs(engine);
+ __notify_execute_cb(request);
spin_unlock(&request->lock);
engine->emit_fini_breadcrumb(request,
@@ -613,6 +685,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
}
INIT_LIST_HEAD(&rq->active_list);
+ INIT_LIST_HEAD(&rq->execute_cb);
tl = ce->ring->timeline;
ret = i915_timeline_get_seqno(tl, rq, &seqno);
@@ -700,6 +773,81 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
return ERR_PTR(ret);
}
+static int
+emit_semaphore_wait(struct i915_request *to,
+ struct i915_request *from,
+ gfp_t gfp)
+{
+ u32 *cs;
+ int err;
+
+ GEM_BUG_ON(!from->timeline->has_initial_breadcrumb);
+
+ err = i915_timeline_read_lock(from->timeline, to);
+ if (err)
+ return err;
+
+ /*
+ * If we know our signaling request has started, we know that it
+ * must, at least, have passed its initial breadcrumb and that its
+ * seqno can only increase, therefore any change in its breadcrumb
+ * must indicate completion. By using a "not equal to start" compare
+ * we avoid the murky issue of how to handle seqno wraparound in an
+ * async environment (short answer, we must stop the world whenever
+ * any context wraps!) as the likelihood of missing one request then
+ * seeing the same start value for a new request is 1 in 2^31, and
+ * even then we know that the new request has started and is in
+ * progress, so we are sure it will complete soon enough (not to
+ * worry about).
+ */
+ if (i915_request_started(from)) {
+ cs = intel_ring_begin(to, 4);
+ if (IS_ERR(cs))
+ return PTR_ERR(cs);
+
+ *cs++ = MI_SEMAPHORE_WAIT |
+ MI_SEMAPHORE_GLOBAL_GTT |
+ MI_SEMAPHORE_POLL |
+ MI_SEMAPHORE_SAD_NEQ_SDD;
+ *cs++ = from->fence.seqno - 1;
+ *cs++ = from->timeline->hwsp_offset;
+ *cs++ = 0;
+
+ intel_ring_advance(to, cs);
+ } else {
+ int err;
+
+ err = i915_request_await_execution(to, from, gfp);
+ if (err)
+ return err;
+
+ cs = intel_ring_begin(to, 4);
+ if (IS_ERR(cs))
+ return PTR_ERR(cs);
+
+ /*
+ * Using greater-than-or-equal here means we have to worry
+ * about seqno wraparound. To side step that issue, we swap
+ * the timeline HWSP upon wrapping, so that anyone still listening
+ * for the old (pre-wrap) values does not see much smaller
+ * (post-wrap) values than they were expecting (and so wait
+ * forever).
+ */
+ *cs++ = MI_SEMAPHORE_WAIT |
+ MI_SEMAPHORE_GLOBAL_GTT |
+ MI_SEMAPHORE_POLL |
+ MI_SEMAPHORE_SAD_GTE_SDD;
+ *cs++ = from->fence.seqno;
+ *cs++ = from->timeline->hwsp_offset;
+ *cs++ = 0;
+
+ intel_ring_advance(to, cs);
+ }
Would it not work to have a single path which emits the wait on NEQ
from->fence.seqno - 1, with just the i915_request_await_execution call
made conditional on i915_request_started?
In the !started case, having added the await, we would know the
semaphore wait would not run until after the dependency has started,
and NEQ would become true when it completes - the same as the started
path above.
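Something along these lines perhaps (completely untested, just to
illustrate the idea):

static int
emit_semaphore_wait(struct i915_request *to,
		    struct i915_request *from,
		    gfp_t gfp)
{
	u32 *cs;
	int err;

	GEM_BUG_ON(!from->timeline->has_initial_breadcrumb);

	err = i915_timeline_read_lock(from->timeline, to);
	if (err)
		return err;

	/* Only throttle submission while the signaler has not yet started. */
	if (!i915_request_started(from)) {
		err = i915_request_await_execution(to, from, gfp);
		if (err)
			return err;
	}

	cs = intel_ring_begin(to, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/*
	 * Per the reasoning above: by the time this wait runs, the
	 * dependency has started, so any value other than seqno - 1
	 * indicates it has since completed.
	 */
	*cs++ = MI_SEMAPHORE_WAIT |
		MI_SEMAPHORE_GLOBAL_GTT |
		MI_SEMAPHORE_POLL |
		MI_SEMAPHORE_SAD_NEQ_SDD;
	*cs++ = from->fence.seqno - 1;
	*cs++ = from->timeline->hwsp_offset;
	*cs++ = 0;

	intel_ring_advance(to, cs);

	to->sched.semaphore = true;
	return 0;
}

That would also leave only the NEQ comparison in use, avoiding the GTE
wraparound handling altogether.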
+
+ to->sched.semaphore = true;
+ return 0;
+}
+
static int
i915_request_await_request(struct i915_request *to, struct i915_request *from)
{
@@ -723,6 +871,9 @@ i915_request_await_request(struct i915_request *to, struct i915_request *from)
ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
&from->submit,
I915_FENCE_GFP);
+ } else if (HAS_EXECLISTS(to->i915) &&
+ to->gem_context->sched.priority >= I915_PRIORITY_NORMAL) {
+ ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
} else {
ret = i915_sw_fence_await_dma_fence(&to->submit,
&from->fence, 0,
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index 40f3e8dcbdd5..66a374ee177a 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -127,6 +127,7 @@ struct i915_request {
*/
struct i915_sw_fence submit;
wait_queue_entry_t submitq;
+ struct list_head execute_cb;
/*
* A list of everyone we wait upon, and everyone who waits upon us.
diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
index d01683167c77..aa6c663dca09 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.c
+++ b/drivers/gpu/drm/i915/i915_scheduler.c
@@ -29,6 +29,7 @@ void i915_sched_node_init(struct i915_sched_node *node)
INIT_LIST_HEAD(&node->waiters_list);
INIT_LIST_HEAD(&node->link);
node->attr.priority = I915_PRIORITY_INVALID;
+ node->semaphore = false;
}
static struct i915_dependency *
diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
index dbe9cb7ecd82..d764cf10536f 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.h
+++ b/drivers/gpu/drm/i915/i915_scheduler.h
@@ -72,6 +72,7 @@ struct i915_sched_node {
struct list_head waiters_list; /* those after us, they depend upon us */
struct list_head link;
struct i915_sched_attr attr;
+ bool semaphore;
};
struct i915_dependency {
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
index 7c58b049ecb5..8d1400d378d7 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.c
+++ b/drivers/gpu/drm/i915/i915_sw_fence.c
@@ -192,7 +192,7 @@ static void __i915_sw_fence_complete(struct i915_sw_fence *fence,
__i915_sw_fence_notify(fence, FENCE_FREE);
}
-static void i915_sw_fence_complete(struct i915_sw_fence *fence)
+void i915_sw_fence_complete(struct i915_sw_fence *fence)
{
debug_fence_assert(fence);
@@ -202,7 +202,7 @@ static void i915_sw_fence_complete(struct i915_sw_fence *fence)
__i915_sw_fence_complete(fence, NULL);
}
-static void i915_sw_fence_await(struct i915_sw_fence *fence)
+void i915_sw_fence_await(struct i915_sw_fence *fence)
{
debug_fence_assert(fence);
WARN_ON(atomic_inc_return(&fence->pending) <= 1);
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h
index 0e055ea0179f..6dec9e1d1102 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.h
+++ b/drivers/gpu/drm/i915/i915_sw_fence.h
@@ -79,6 +79,9 @@ int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
unsigned long timeout,
gfp_t gfp);
+void i915_sw_fence_await(struct i915_sw_fence *fence);
+void i915_sw_fence_complete(struct i915_sw_fence *fence);
+
static inline bool i915_sw_fence_signaled(const struct i915_sw_fence *fence)
{
return atomic_read(&fence->pending) <= 0;
diff --git a/drivers/gpu/drm/i915/intel_gpu_commands.h b/drivers/gpu/drm/i915/intel_gpu_commands.h
index b96a31bc1080..0efaadd3bc32 100644
--- a/drivers/gpu/drm/i915/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/intel_gpu_commands.h
@@ -106,7 +106,12 @@
#define MI_SEMAPHORE_TARGET(engine) ((engine)<<15)
#define MI_SEMAPHORE_WAIT MI_INSTR(0x1c, 2) /* GEN8+ */
#define MI_SEMAPHORE_POLL (1<<15)
+#define MI_SEMAPHORE_SAD_GT_SDD (0<<12)
#define MI_SEMAPHORE_SAD_GTE_SDD (1<<12)
+#define MI_SEMAPHORE_SAD_LT_SDD (2<<12)
+#define MI_SEMAPHORE_SAD_LTE_SDD (3<<12)
+#define MI_SEMAPHORE_SAD_EQ_SDD (4<<12)
+#define MI_SEMAPHORE_SAD_NEQ_SDD (5<<12)
#define MI_STORE_DWORD_IMM MI_INSTR(0x20, 1)
#define MI_STORE_DWORD_IMM_GEN4 MI_INSTR(0x20, 2)
#define MI_MEM_VIRTUAL (1 << 22) /* 945,g33,965 */
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index e97ce54138d3..80d17b75b2e6 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -421,7 +421,8 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine)
* in the priority queue, but they will not gain immediate access to
* the GPU.
*/
- if ((prio & ACTIVE_PRIORITY) != ACTIVE_PRIORITY) {
+ if ((prio & ACTIVE_PRIORITY) != ACTIVE_PRIORITY &&
+ i915_request_started(active)) {
prio |= ACTIVE_PRIORITY;
active->sched.attr.priority = prio;
list_move_tail(&active->sched.link,
@@ -605,6 +606,17 @@ static bool can_merge_rq(const struct i915_request *prev,
{
GEM_BUG_ON(!assert_priority_queue(prev, next));
+ /*
+ * To avoid AB-BA deadlocks, we simply restrict ourselves to only
+ * submitting one semaphore (think HW spinlock) to HW at a time. This
+ * prevents the execution callback on a later semaphore from being
+ * queued on another engine, so no cycle can be formed. Preemption
+ * rules should mean that if this semaphore is preempted, its
+ * dependency chain is preserved and suitably promoted via PI.
+ */
+ if (prev->sched.semaphore && !i915_request_started(prev))
+ return false;
+
if (!can_merge_ctx(prev->hw_context, next->hw_context))
return false;
Regards,
Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx