Re: [PATCH 20/22] drm/i915: Use HW semaphores for inter-engine synchronisation on gen8+

Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxxxxxxxx> · Tue, 5 Feb 2019 09:24:24 +0000

On 04/02/2019 13:22, Chris Wilson wrote:
Having introduced per-context seqno, we now have a means to identify
progress across the system without fear of rollback as befell the
global_seqno. That is, we can program an MI_SEMAPHORE_WAIT operation in
advance of submission, safe in the knowledge that our target seqno and
address are stable.

However, since we are telling the GPU to busy-spin on the target address
until it matches the signaling seqno, we only want to do so when we are
sure that busy-spin will be completed quickly. To achieve this we only
submit the request to HW once the signaler is itself executing (modulo
preemption causing us to wait longer), and we only do so for default and
above priority requests (so that idle priority tasks never themselves
hog the GPU waiting for others).

v3: Drop the older NEQ branch, now we pin the signaler's HWSP anyway.
v4: Tell the world and include it as part of scheduler caps.

Looks okay to me.

Just a paragraph about power and performance, ideally with the latest
table of results from media-bench, would be the usual requirement for
this kind of addition.

Regards,

Tvrtko


Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
---
  drivers/gpu/drm/i915/i915_drv.c           |   2 +-
  drivers/gpu/drm/i915/i915_request.c       | 136 +++++++++++++++++++++-
  drivers/gpu/drm/i915/i915_request.h       |   1 +
  drivers/gpu/drm/i915/i915_sw_fence.c      |   4 +-
  drivers/gpu/drm/i915/i915_sw_fence.h      |   3 +
  drivers/gpu/drm/i915/intel_engine_cs.c    |   1 +
  drivers/gpu/drm/i915/intel_gpu_commands.h |   5 +
  drivers/gpu/drm/i915/intel_lrc.c          |   1 +
  drivers/gpu/drm/i915/intel_ringbuffer.h   |   7 ++
  include/uapi/drm/i915_drm.h               |   1 +
  10 files changed, 156 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index a7aaa1ac4c99..7e38e2b61a2e 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -349,7 +349,7 @@ static int i915_getparam_ioctl(struct drm_device *dev, void *data,
  		value = min_t(int, INTEL_PPGTT(dev_priv), I915_GEM_PPGTT_FULL);
  		break;
  	case I915_PARAM_HAS_SEMAPHORES:
-		value = 0;
+		value = !!(dev_priv->caps.scheduler & I915_SCHEDULER_CAP_SEMAPHORES);
  		break;
  	case I915_PARAM_HAS_SECURE_BATCHES:
  		value = capable(CAP_SYS_ADMIN);
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 057bffa56700..116bd9648db7 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -22,8 +22,9 @@
   *
   */
  
-#include <linux/prefetch.h>
  #include <linux/dma-fence-array.h>
+#include <linux/irq_work.h>
+#include <linux/prefetch.h>
  #include <linux/sched.h>
  #include <linux/sched/clock.h>
  #include <linux/sched/signal.h>
@@ -32,9 +33,16 @@
  #include "i915_active.h"
  #include "i915_reset.h"
  
+struct execute_cb {
+	struct list_head link;
+	struct irq_work work;
+	struct i915_sw_fence *fence;
+};
+
  static struct i915_global_request {
  	struct kmem_cache *slab_requests;
  	struct kmem_cache *slab_dependencies;
+	struct kmem_cache *slab_execute_cbs;
  } global;
  
  static const char *i915_fence_get_driver_name(struct dma_fence *fence)
@@ -331,6 +339,69 @@ void i915_request_retire_upto(struct i915_request *rq)
  	} while (tmp != rq);
  }
  
+static void irq_execute_cb(struct irq_work *wrk)
+{
+	struct execute_cb *cb = container_of(wrk, typeof(*cb), work);
+
+	i915_sw_fence_complete(cb->fence);
+	kmem_cache_free(global.slab_execute_cbs, cb);
+}
+
+static void __notify_execute_cb(struct i915_request *rq)
+{
+	struct execute_cb *cb;
+
+	lockdep_assert_held(&rq->lock);
+
+	if (list_empty(&rq->execute_cb))
+		return;
+
+	list_for_each_entry(cb, &rq->execute_cb, link)
+		irq_work_queue(&cb->work);
+
+	/*
+	 * XXX Rollback on __i915_request_unsubmit()
+	 *
+	 * In the future, perhaps when we have an active time-slicing scheduler,
+	 * it will be interesting to unsubmit parallel execution and remove
+	 * busywaits from the GPU until their master is restarted. This is
+	 * quite hairy, we have to carefully rollback the fence and do a
+	 * preempt-to-idle cycle on the target engine, all the while the
+	 * master execute_cb may refire.
+	 */
+	INIT_LIST_HEAD(&rq->execute_cb);
+}
+
+static int
+i915_request_await_execution(struct i915_request *rq,
+			     struct i915_request *signal,
+			     gfp_t gfp)
+{
+	struct execute_cb *cb;
+
+	if (i915_request_is_active(signal))
+		return 0;
+
+	cb = kmem_cache_alloc(global.slab_execute_cbs, gfp);
+	if (!cb)
+		return -ENOMEM;
+
+	cb->fence = &rq->submit;
+	i915_sw_fence_await(cb->fence);
+	init_irq_work(&cb->work, irq_execute_cb);
+
+	spin_lock_irq(&signal->lock);
+	if (i915_request_is_active(signal)) {
+		i915_sw_fence_complete(cb->fence);
+		kmem_cache_free(global.slab_execute_cbs, cb);
+	} else {
+		list_add_tail(&cb->link, &signal->execute_cb);
+	}
+	spin_unlock_irq(&signal->lock);
+
+	return 0;
+}
+
  static void move_to_timeline(struct i915_request *request,
  			     struct i915_timeline *timeline)
  {
@@ -389,6 +460,7 @@ void __i915_request_submit(struct i915_request *request)
  	 */
  	BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
  	request->sched.attr.priority |= __NO_PREEMPTION;
+	__notify_execute_cb(request);
  
  	spin_unlock(&request->lock);
  
@@ -630,6 +702,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
  	}
  
  	INIT_LIST_HEAD(&rq->active_list);
+	INIT_LIST_HEAD(&rq->execute_cb);
  
  	tl = ce->ring->timeline;
  	ret = i915_timeline_get_seqno(tl, rq, &seqno);
@@ -717,6 +790,51 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
  	return ERR_PTR(ret);
  }
  
+static int
+emit_semaphore_wait(struct i915_request *to,
+		    struct i915_request *from,
+		    gfp_t gfp)
+{
+	u32 *cs;
+	int err;
+
+	GEM_BUG_ON(!from->timeline->has_initial_breadcrumb);
+	GEM_BUG_ON(INTEL_GEN(to->i915) < 8);
+
+	/* We need to pin the signaler's HWSP until we are finished reading. */
+	err = i915_timeline_read_lock(from->timeline, to);
+	if (err)
+		return err;
+
+	/* Only submit our spinner after the signaler is running! */
+	err = i915_request_await_execution(to, from, gfp);
+	if (err)
+		return err;
+
+	cs = intel_ring_begin(to, 4);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	/*
+	 * Using greater-than-or-equal here means we have to worry
+	 * about seqno wraparound. To side step that issue, we swap
+	 * the timeline HWSP upon wrapping, so that everyone listening
+	 * for the old (pre-wrap) values do not see the much smaller
+	 * (post-wrap) values than they were expecting (and so wait
+	 * forever).
+	 */
+	*cs++ = MI_SEMAPHORE_WAIT |
+		MI_SEMAPHORE_GLOBAL_GTT |
+		MI_SEMAPHORE_POLL |
+		MI_SEMAPHORE_SAD_GTE_SDD;
+	*cs++ = from->fence.seqno;
+	*cs++ = from->timeline->hwsp_offset;
+	*cs++ = 0;
+
+	intel_ring_advance(to, cs);
+	return 0;
+}
+
  static int
  i915_request_await_request(struct i915_request *to, struct i915_request *from)
  {
@@ -738,6 +856,9 @@ i915_request_await_request(struct i915_request *to, struct i915_request *from)
  		ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
  						       &from->submit,
  						       I915_FENCE_GFP);
+	} else if (intel_engine_has_semaphores(to->engine) &&
+		   to->gem_context->sched.priority >= I915_PRIORITY_NORMAL) {
+		ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
  	} else {
  		ret = i915_sw_fence_await_dma_fence(&to->submit,
  						    &from->fence, 0,
@@ -1212,14 +1333,23 @@ int i915_global_request_init(void)
  	if (!global.slab_requests)
  		return -ENOMEM;
  
+	global.slab_execute_cbs = KMEM_CACHE(execute_cb,
+					     SLAB_HWCACHE_ALIGN |
+					     SLAB_RECLAIM_ACCOUNT |
+					     SLAB_TYPESAFE_BY_RCU);
+	if (!global.slab_execute_cbs)
+		goto err_requests;
+
  	global.slab_dependencies = KMEM_CACHE(i915_dependency,
  					      SLAB_HWCACHE_ALIGN |
  					      SLAB_RECLAIM_ACCOUNT);
  	if (!global.slab_dependencies)
-		goto err_requests;
+		goto err_execute_cbs;
  
  	return 0;
  
+err_execute_cbs:
+	kmem_cache_destroy(global.slab_execute_cbs);
  err_requests:
  	kmem_cache_destroy(global.slab_requests);
  	return -ENOMEM;
@@ -1228,11 +1358,13 @@ int i915_global_request_init(void)
  void i915_global_request_shrink(void)
  {
  	kmem_cache_shrink(global.slab_dependencies);
+	kmem_cache_shrink(global.slab_execute_cbs);
  	kmem_cache_shrink(global.slab_requests);
  }
  
  void i915_global_request_exit(void)
  {
  	kmem_cache_destroy(global.slab_dependencies);
+	kmem_cache_destroy(global.slab_execute_cbs);
  	kmem_cache_destroy(global.slab_requests);
  }
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index 071ff1064579..df52776b26cf 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -128,6 +128,7 @@ struct i915_request {
  	 */
  	struct i915_sw_fence submit;
  	wait_queue_entry_t submitq;
+	struct list_head execute_cb;
  
  	/*
  	 * A list of everyone we wait upon, and everyone who waits upon us.
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
index 7c58b049ecb5..8d1400d378d7 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.c
+++ b/drivers/gpu/drm/i915/i915_sw_fence.c
@@ -192,7 +192,7 @@ static void __i915_sw_fence_complete(struct i915_sw_fence *fence,
  	__i915_sw_fence_notify(fence, FENCE_FREE);
  }
  
-static void i915_sw_fence_complete(struct i915_sw_fence *fence)
+void i915_sw_fence_complete(struct i915_sw_fence *fence)
  {
  	debug_fence_assert(fence);
  
@@ -202,7 +202,7 @@ static void i915_sw_fence_complete(struct i915_sw_fence *fence)
  	__i915_sw_fence_complete(fence, NULL);
  }
  
-static void i915_sw_fence_await(struct i915_sw_fence *fence)
+void i915_sw_fence_await(struct i915_sw_fence *fence)
  {
  	debug_fence_assert(fence);
  	WARN_ON(atomic_inc_return(&fence->pending) <= 1);
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h
index 0e055ea0179f..6dec9e1d1102 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.h
+++ b/drivers/gpu/drm/i915/i915_sw_fence.h
@@ -79,6 +79,9 @@ int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
  				    unsigned long timeout,
  				    gfp_t gfp);
  
+void i915_sw_fence_await(struct i915_sw_fence *fence);
+void i915_sw_fence_complete(struct i915_sw_fence *fence);
+
  static inline bool i915_sw_fence_signaled(const struct i915_sw_fence *fence)
  {
  	return atomic_read(&fence->pending) <= 0;
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 0dbd6d7c1693..30a308ebbc89 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -621,6 +621,7 @@ void intel_engines_set_scheduler_caps(struct drm_i915_private *i915)
  		u32 sched_cap;
  	} map[] = {
  		{ I915_ENGINE_HAS_PREEMPTION, I915_SCHEDULER_CAP_PREEMPTION },
+		{ I915_ENGINE_HAS_SEMAPHORES, I915_SCHEDULER_CAP_SEMAPHORES },
  		{ I915_ENGINE_SUPPORTS_STATS, I915_SCHEDULER_CAP_PMU },
  	};
  	struct intel_engine_cs *engine;
diff --git a/drivers/gpu/drm/i915/intel_gpu_commands.h b/drivers/gpu/drm/i915/intel_gpu_commands.h
index b96a31bc1080..0efaadd3bc32 100644
--- a/drivers/gpu/drm/i915/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/intel_gpu_commands.h
@@ -106,7 +106,12 @@
  #define   MI_SEMAPHORE_TARGET(engine)	((engine)<<15)
  #define MI_SEMAPHORE_WAIT	MI_INSTR(0x1c, 2) /* GEN8+ */
  #define   MI_SEMAPHORE_POLL		(1<<15)
+#define   MI_SEMAPHORE_SAD_GT_SDD	(0<<12)
  #define   MI_SEMAPHORE_SAD_GTE_SDD	(1<<12)
+#define   MI_SEMAPHORE_SAD_LT_SDD	(2<<12)
+#define   MI_SEMAPHORE_SAD_LTE_SDD	(3<<12)
+#define   MI_SEMAPHORE_SAD_EQ_SDD	(4<<12)
+#define   MI_SEMAPHORE_SAD_NEQ_SDD	(5<<12)
  #define MI_STORE_DWORD_IMM	MI_INSTR(0x20, 1)
  #define MI_STORE_DWORD_IMM_GEN4	MI_INSTR(0x20, 2)
  #define   MI_MEM_VIRTUAL	(1 << 22) /* 945,g33,965 */
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 66d465708bc6..ea813f88fbb3 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -2307,6 +2307,7 @@ void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
  	engine->park = NULL;
  	engine->unpark = NULL;
  
+	engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
  	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
  	if (engine->i915->preempt_context)
  		engine->flags |= I915_ENGINE_HAS_PREEMPTION;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 5dffccb6740e..c9cd60444987 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -496,6 +496,7 @@ struct intel_engine_cs {
  #define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
  #define I915_ENGINE_SUPPORTS_STATS   BIT(1)
  #define I915_ENGINE_HAS_PREEMPTION   BIT(2)
+#define I915_ENGINE_HAS_SEMAPHORES   BIT(3)
  	unsigned int flags;
  
  	/*
@@ -573,6 +574,12 @@ intel_engine_has_preemption(const struct intel_engine_cs *engine)
  	return engine->flags & I915_ENGINE_HAS_PREEMPTION;
  }
  
+static inline bool
+intel_engine_has_semaphores(const struct intel_engine_cs *engine)
+{
+	return engine->flags & I915_ENGINE_HAS_SEMAPHORES;
+}
+
  void intel_engines_set_scheduler_caps(struct drm_i915_private *i915);
  
  static inline bool __execlists_need_preempt(int prio, int last)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index d8ac7f105734..4b8a07d774b4 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -477,6 +477,7 @@ typedef struct drm_i915_irq_wait {
  #define   I915_SCHEDULER_CAP_PRIORITY	(1ul << 1)
  #define   I915_SCHEDULER_CAP_PREEMPTION	(1ul << 2)
  #define   I915_SCHEDULER_CAP_PMU	(1ul << 3)
+#define   I915_SCHEDULER_CAP_SEMAPHORES	(1ul << 4)
  
  #define I915_PARAM_HUC_STATUS		 42
  

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx