Re: [PATCH] drm/i915/execlists: Enable coarse preemption boundaries for gen8

Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxxxxxxxx> · Fri, 5 Apr 2019 10:46:19 +0100

On 29/03/2019 13:40, Chris Wilson wrote:
When we introduced preemption, we chose to keep it disabled for gen8 as
supporting preemption inside GPGPU user batches required various w/a in
userspace. Since then, the desire to preempt long queues of requests
between batches (e.g. within busywaiting semaphores) has grown. So allow
arbitration within the busywaits and between requests, but disable
arbitration within user batches so that we can preempt between requests
and not risk breaking GPGPU.

However, since this preemption is much coarser and doesn't interfere
with userspace, we decline to include it amongst the scheduler
capabilities. (This is also required for us to skip over the preemption
selftests that expect to be able to preempt user batches.)

Testcase: igt/gem_exec_scheduler/semaphore-user
References: beecec901790 ("drm/i915/execlists: Preemption!")
Fixes: e88619646971 ("drm/i915: Use HW semaphores for inter-engine synchronisation on gen8+")
Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
Cc: Michal Winiarski <michal.winiarski@xxxxxxxxx>
Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
---
  drivers/gpu/drm/i915/i915_gem_context.c    |   2 +-
  drivers/gpu/drm/i915/intel_lrc.c           |  49 ++++--
  drivers/gpu/drm/i915/selftests/intel_lrc.c | 180 +++++++++++++++++++++
  3 files changed, 217 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 662da485e15f..92993db38aad 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -562,7 +562,7 @@ static void init_contexts(struct drm_i915_private *i915)
  
  static bool needs_preempt_context(struct drm_i915_private *i915)
  {
-	return HAS_LOGICAL_RING_PREEMPTION(i915);
+	return HAS_EXECLISTS(i915);
  }
  
  int i915_gem_contexts_init(struct drm_i915_private *dev_priv)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index bec232acc8d7..b380688a0c1c 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -233,7 +233,7 @@ static inline bool need_preempt(const struct intel_engine_cs *engine,
  {
  	int last_prio;
  
-	if (!intel_engine_has_preemption(engine))
+	if (!engine->preempt_context)
  		return false;
  
  	if (i915_request_completed(rq))
@@ -2035,7 +2035,7 @@ static int gen8_emit_bb_start(struct i915_request *rq,
  {
  	u32 *cs;
  
-	cs = intel_ring_begin(rq, 6);
+	cs = intel_ring_begin(rq, 4);
  	if (IS_ERR(cs))
  		return PTR_ERR(cs);
  
@@ -2046,16 +2046,35 @@ static int gen8_emit_bb_start(struct i915_request *rq,
  	 * particular all the gen that do not need the w/a at all!), if we
  	 * took care to make sure that on every switch into this context
  	 * (both ordinary and for preemption) that arbitrartion was enabled
-	 * we would be fine. However, there doesn't seem to be a downside to
-	 * being paranoid and making sure it is set before each batch and
-	 * every context-switch.
-	 *
-	 * Note that if we fail to enable arbitration before the request
-	 * is complete, then we do not see the context-switch interrupt and
-	 * the engine hangs (with RING_HEAD == RING_TAIL).
-	 *
-	 * That satisfies both the GPGPU w/a and our heavy-handed paranoia.
+	 * we would be fine.  However, for gen8 there is another w/a that
+	 * requires us to not preempt inside GPGPU execution, so we keep
+	 * arbitration disabled for gen8 batches. Arbitration will be
+	 * re-enabled before we close the request
+	 * (engine->emit_fini_breadcrumb).
  	 */
+	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
+
+	/* FIXME(BDW): Address space and security selectors. */
+	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
+		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
+	*cs++ = lower_32_bits(offset);
+	*cs++ = upper_32_bits(offset);
+
+	intel_ring_advance(rq, cs);
+
+	return 0;
+}
+
+static int gen9_emit_bb_start(struct i915_request *rq,
+			      u64 offset, u32 len,
+			      const unsigned int flags)
+{
+	u32 *cs;
+
+	cs = intel_ring_begin(rq, 6);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
  	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
  
  	/* FIXME(BDW): Address space and security selectors. */

This comment can now be removed.

@@ -2316,7 +2335,8 @@ void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
  	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
  	if (!intel_vgpu_active(engine->i915))
  		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
-	if (engine->preempt_context)
+	if (engine->preempt_context &&
+	    HAS_LOGICAL_RING_PREEMPTION(engine->i915))
  		engine->flags |= I915_ENGINE_HAS_PREEMPTION;
  }
  
@@ -2350,7 +2370,10 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
  		 * until a more refined solution exists.
  		 */
  	}
-	engine->emit_bb_start = gen8_emit_bb_start;
+	if (IS_GEN(engine->i915, 8))
+		engine->emit_bb_start = gen8_emit_bb_start;
+	else
+		engine->emit_bb_start = gen9_emit_bb_start;
  }
  
  static inline void
diff --git a/drivers/gpu/drm/i915/selftests/intel_lrc.c b/drivers/gpu/drm/i915/selftests/intel_lrc.c
index 0d3cae564db8..8bcd4e1d58ee 100644
--- a/drivers/gpu/drm/i915/selftests/intel_lrc.c
+++ b/drivers/gpu/drm/i915/selftests/intel_lrc.c
@@ -76,6 +76,185 @@ static int live_sanitycheck(void *arg)
  	return err;
  }
  
+static int live_busywait_preempt(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct i915_gem_context *ctx_hi, *ctx_lo;
+	struct intel_engine_cs *engine;
+	struct drm_i915_gem_object *obj;
+	struct i915_vma *vma;
+	enum intel_engine_id id;
+	intel_wakeref_t wakeref;
+	int err = -ENOMEM;
+	u32 *map;
+
+	/*
+	 * Verify that even without HAS_LOGICAL_RING_PREEMPTION, we can
+	 * preempt the busywaits used to synchronise between rings.
+	 */
+
+	mutex_lock(&i915->drm.struct_mutex);
+	wakeref = intel_runtime_pm_get(i915);
+
+	ctx_hi = kernel_context(i915);
+	if (!ctx_hi)
+		goto err_unlock;
+	ctx_hi->sched.priority = INT_MAX;
+
+	ctx_lo = kernel_context(i915);
+	if (!ctx_lo)
+		goto err_ctx_hi;
+	ctx_lo->sched.priority = INT_MIN;
+
+	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
+	if (IS_ERR(obj)) {
+		err = PTR_ERR(obj);
+		goto err_ctx_lo;
+	}
+
+	map = i915_gem_object_pin_map(obj, I915_MAP_WC);
+	if (IS_ERR(map)) {
+		err = PTR_ERR(map);
+		goto err_obj;
+	}
+
+	vma = i915_vma_instance(obj, &i915->ggtt.vm, 0);
+	if (IS_ERR(vma)) {
+		err = PTR_ERR(vma);
+		goto err_map;
+	}
+
+	err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL);
+	if (err)
+		goto err_map;
+
+	for_each_engine(engine, i915, id) {
+		struct i915_request *lo, *hi;
+		struct igt_live_test t;
+		u32 *cs;
+
+		if (!intel_engine_can_store_dword(engine))
+			continue;
+
+		if (igt_live_test_begin(&t, i915, __func__, engine->name)) {
+			err = -EIO;
+			goto err_vma;
+		}
+
+		/*
+		 * We create two requests. The low priority request
+		 * busywaits on a semaphore (inside the ringbuffer where
+		 * is should be preemptible) and the high priority requests
+		 * uses a MI_STORE_DWORD_IMM to update the semaphore value
+		 * allowing the first request to complete. If preemption
+		 * fails, we hang instead.
+		 */
+
+		lo = i915_request_alloc(engine, ctx_lo);
+		if (IS_ERR(lo)) {
+			err = PTR_ERR(lo);
+			goto err_vma;
+		}
+
+		cs = intel_ring_begin(lo, 8);
+		if (IS_ERR(cs)) {
+			err = PTR_ERR(cs);
+			i915_request_add(lo);
+			goto err_vma;
+		}
+
+		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+		*cs++ = i915_ggtt_offset(vma);
+		*cs++ = 0;
+		*cs++ = 1;
+
+		/* XXX Do we need a flush + invalidate here? */
+
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_EQ_SDD;
+		*cs++ = 0;
+		*cs++ = i915_ggtt_offset(vma);
+		*cs++ = 0;
+
+		intel_ring_advance(lo, cs);
+		i915_request_add(lo);
+
+		if (wait_for(READ_ONCE(*map), 10)) {
+			err = -ETIMEDOUT;
+			goto err_vma;
+		}
+
+		/* Low priority request should be busywaiting now */
+		if (i915_request_wait(lo, I915_WAIT_LOCKED, 1) != -ETIME) {
+			pr_err("%s: Busywaiting request did not!\n",
+			       engine->name);
+			err = -EIO;
+			goto err_vma;
+		}
+
+		hi = i915_request_alloc(engine, ctx_hi);
+		if (IS_ERR(hi)) {
+			err = PTR_ERR(hi);
+			goto err_vma;

Could release the spinner with a CPU write from here onwards but I guess 
it doesn't matter.

+		}
+
+		cs = intel_ring_begin(hi, 4);
+		if (IS_ERR(cs)) {
+			err = PTR_ERR(cs);
+			i915_request_add(hi);
+			goto err_vma;
+		}
+
+		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+		*cs++ = i915_ggtt_offset(vma);
+		*cs++ = 0;
+		*cs++ = 0;
+
+		intel_ring_advance(hi, cs);
+		i915_request_add(hi);
+
+		if (i915_request_wait(lo, I915_WAIT_LOCKED, HZ / 5) < 0) {
+			struct drm_printer p = drm_info_printer(i915->drm.dev);
+
+			pr_err("%s: Failed to preempt semaphore busywait!\n",
+			       engine->name);
+
+			intel_engine_dump(engine, &p, "%s\n", engine->name);
+			GEM_TRACE_DUMP();
+
+			i915_gem_set_wedged(i915);
+			err = -EIO;
+			goto err_vma;
+		}
+		GEM_BUG_ON(READ_ONCE(*map));
+
+		if (igt_live_test_end(&t)) {
+			err = -EIO;
+			goto err_vma;
+		}
+	}
+
+	err = 0;
+err_vma:
+	i915_vma_unpin(vma);
+err_map:
+	i915_gem_object_unpin_map(obj);
+err_obj:
+	i915_gem_object_put(obj);
+err_ctx_lo:
+	kernel_context_close(ctx_lo);
+err_ctx_hi:
+	kernel_context_close(ctx_hi);
+err_unlock:
+	if (igt_flush_test(i915, I915_WAIT_LOCKED))
+		err = -EIO;
+	intel_runtime_pm_put(i915, wakeref);
+	mutex_unlock(&i915->drm.struct_mutex);
+	return err;
+}
+
  static int live_preempt(void *arg)
  {
  	struct drm_i915_private *i915 = arg;
@@ -1127,6 +1306,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
  {
  	static const struct i915_subtest tests[] = {
  		SUBTEST(live_sanitycheck),
+		SUBTEST(live_busywait_preempt),
  		SUBTEST(live_preempt),
  		SUBTEST(live_late_preempt),
  		SUBTEST(live_suppress_self_preempt),


Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx