Re: [PATCH 7/7] drm/i915/execlists: Remove preempt-to-busy roundtrip delay

Daniele Ceraolo Spurio <daniele.ceraolospurio@xxxxxxxxx> · Tue, 11 Feb 2020 17:08:30 -0800

On 2/10/20 12:57 PM, Chris Wilson wrote:
To prevent the context from proceeding past the end of the request as we
unwind, we embed a semaphore into the footer of each request. (If the
context were to skip past the end of the request as we perform the
preemption, next time we reload the context it's RING_HEAD would be past
the RING_TAIL and instead of replaying the commands it would read the
read of the uninitialised ringbuffer.)

However, this requires us to keep the ring paused at the end of the
request until we have a change to process the preemption ack and remove
the semaphore. Our processing of acks is at the whim of ksoftirqd, and
so it is entirely possible that the GPU has to wait for the tasklet
before it can proceed with the next request.

It was suggested that we could also embed a MI_LOAD_REGISTER_MEM into
the footer to read the current RING_TAIL from the context, which would
allow us to not only avoid this round trip (and so release the context
as soon as we had submitted the preemption request to in ELSP), but also
skip using ELSP for lite-restores entirely. That has the nice benefit of
dramatically reducing contention and the frequency of interrupts when a
client submits two or more execbufs in rapid succession.

However, mmio access to RING_TAIL was defeatured in gen11 so we can only
employ this handy trick for gen8/gen9.

Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@xxxxxxxxx>
---
  drivers/gpu/drm/i915/gt/intel_engine_types.h | 23 +++--
  drivers/gpu/drm/i915/gt/intel_lrc.c          | 99 +++++++++++++++++++-
  2 files changed, 109 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index 24cff658e6e5..ae8724915320 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -488,14 +488,15 @@ struct intel_engine_cs {
  	/* status_notifier: list of callbacks for context-switch changes */
  	struct atomic_notifier_head context_status_notifier;
  
-#define I915_ENGINE_USING_CMD_PARSER BIT(0)
-#define I915_ENGINE_SUPPORTS_STATS   BIT(1)
-#define I915_ENGINE_HAS_PREEMPTION   BIT(2)
-#define I915_ENGINE_HAS_SEMAPHORES   BIT(3)
-#define I915_ENGINE_NEEDS_BREADCRUMB_TASKLET BIT(4)
-#define I915_ENGINE_IS_VIRTUAL       BIT(5)
-#define I915_ENGINE_HAS_RELATIVE_MMIO BIT(6)
-#define I915_ENGINE_REQUIRES_CMD_PARSER BIT(7)
+#define I915_ENGINE_REQUIRES_CMD_PARSER		BIT(0)
+#define I915_ENGINE_USING_CMD_PARSER		BIT(1)
+#define I915_ENGINE_SUPPORTS_STATS		BIT(2)
+#define I915_ENGINE_HAS_PREEMPTION		BIT(3)
+#define I915_ENGINE_HAS_SEMAPHORES		BIT(4)
+#define I915_ENGINE_HAS_TAIL_LRM		BIT(5)
+#define I915_ENGINE_NEEDS_BREADCRUMB_TASKLET	BIT(6)
+#define I915_ENGINE_IS_VIRTUAL			BIT(7)
+#define I915_ENGINE_HAS_RELATIVE_MMIO		BIT(8)
  	unsigned int flags;
  
  	/*
@@ -592,6 +593,12 @@ intel_engine_has_semaphores(const struct intel_engine_cs *engine)
  	return engine->flags & I915_ENGINE_HAS_SEMAPHORES;
  }
  
+static inline bool
+intel_engine_has_tail_lrm(const struct intel_engine_cs *engine)
+{
+	return engine->flags & I915_ENGINE_HAS_TAIL_LRM;
+}
+
  static inline bool
  intel_engine_needs_breadcrumb_tasklet(const struct intel_engine_cs *engine)
  {
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 696f0b6b223c..5939672781fb 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -1797,6 +1797,74 @@ static inline void clear_ports(struct i915_request **ports, int count)
  	memset_p((void **)ports, NULL, count);
  }
  
+static struct i915_request *
+skip_lite_restore(struct intel_engine_cs *const engine,
+		  struct i915_request *first,
+		  bool *submit)
+{
+	struct intel_engine_execlists *const execlists = &engine->execlists;
+	struct i915_request *last = first;
+	struct rb_node *rb;
+
+	if (!intel_engine_has_tail_lrm(engine))
+		return last;
+
+	GEM_BUG_ON(*submit);
+	while ((rb = rb_first_cached(&execlists->queue))) {
+		struct i915_priolist *p = to_priolist(rb);
+		struct i915_request *rq, *rn;
+		int i;
+
+		priolist_for_each_request_consume(rq, rn, p, i) {
+			if (!can_merge_rq(last, rq))
+				goto out;
+
+			if (__i915_request_submit(rq)) {
+				*submit = true;
+				last = rq;
+			}
+		}
+
+		rb_erase_cached(&p->node, &execlists->queue);
+		i915_priolist_free(p);
+	}
+out:
+	if (*submit) {
+		ring_set_paused(engine, 1);
+
+		/*
+		 * If we are quick and the current context hasn't yet completed
+		 * its request, we can just tell it to extend the RING_TAIL
+		 * onto the next without having to submit a new ELSP.
+		 */
+		if (!i915_request_completed(first)) {
+			struct i915_request **port;
+
+			ENGINE_TRACE(engine,
+				     "eliding lite-restore last=%llx:%lld->%lld, current %d\n",
+				     first->fence.context,
+				     first->fence.seqno,
+				     last->fence.seqno,
+				     hwsp_seqno(last));
+			GEM_BUG_ON(first->context != last->context);
+
+			execlists_update_context(last);
+			for (port = (struct i915_request **)execlists->active;
+			     *port != first;
+			     port++)
+				;
+			WRITE_ONCE(*port, i915_request_get(last));
+			i915_request_put(first);
+
+			*submit = false;
+		}
+
+		ring_set_paused(engine, 0);
+	}
+
+	return last;
+}
+
  static void execlists_dequeue(struct intel_engine_cs *engine)
  {
  	struct intel_engine_execlists * const execlists = &engine->execlists;
@@ -1934,6 +2002,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
  
  				return;
  			}
+
+			last = skip_lite_restore(engine, last, &submit);
  		}
  	}
  
@@ -2155,10 +2225,11 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
  		WRITE_ONCE(execlists->yield, -1);
  		execlists_submit_ports(engine);
  		set_preempt_timeout(engine);
-	} else {
+	}
+
+	if (intel_engine_has_tail_lrm(engine) || !submit)

Why do we skip here if intel_engine_has_tail_lrm is true? I see that if 
we have work pending we either take the skip_lite_restore() or the 
submit path above, but I can't see why we need to explicitly skip 
re-starting the ring.

  skip_submit:
  		ring_set_paused(engine, 0);
-	}
  }
  
  static void
@@ -2325,7 +2396,8 @@ static void process_csb(struct intel_engine_cs *engine)
  
  			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
  
-			ring_set_paused(engine, 0);
+			if (!intel_engine_has_tail_lrm(engine))
+				ring_set_paused(engine, 0);
  

here as well, although I'm assuming it has the same explanation as the 
one above.

Daniele

  			/* Point active to the new ELSP; prevent overwriting */
  			WRITE_ONCE(execlists->active, execlists->pending);
@@ -4123,15 +4195,28 @@ static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
  	return cs;
  }
  
+static u32 *emit_lrm_tail(struct i915_request *request, u32 *cs)
+{
+	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+	*cs++ = i915_mmio_reg_offset(RING_TAIL(request->engine->mmio_base));
+	*cs++ = i915_ggtt_offset(request->context->state) +
+		LRC_STATE_PN * PAGE_SIZE +
+		CTX_RING_TAIL * sizeof(u32);
+	*cs++ = 0;
+
+	return cs;
+}
+
  static __always_inline u32*
-gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
-				 u32 *cs)
+gen8_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
  {
  	*cs++ = MI_USER_INTERRUPT;
  
  	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
  	if (intel_engine_has_semaphores(request->engine))
  		cs = emit_preempt_busywait(request, cs);
+	if (intel_engine_has_tail_lrm(request->engine))
+		cs = emit_lrm_tail(request, cs);
  
  	request->tail = intel_ring_offset(request, cs);
  	assert_ring_tail_valid(request->ring, request->tail);
@@ -4220,6 +4305,8 @@ static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
  static __always_inline u32*
  gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
  {
+	GEM_BUG_ON(intel_engine_has_tail_lrm(request->engine));
+
  	*cs++ = MI_USER_INTERRUPT;
  
  	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
@@ -4286,6 +4373,8 @@ void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
  		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
  		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
  			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
+		if (INTEL_GEN(engine->i915) < 11)
+			engine->flags |= I915_ENGINE_HAS_TAIL_LRM;
  	}
  
  	if (INTEL_GEN(engine->i915) >= 12)

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx