Re: [PATCH] drm/i915/execlists: Don't merely skip submission if maybe timeslicing

Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxxxxxxxx> · Fri, 18 Oct 2019 10:50:55 +0100

On 17/10/2019 10:52, Chris Wilson wrote:
Normally, we try and skip submission if ELSP[1] is filled. However, we
may desire to enable timeslicing due to the queue priority, even if
ELSP[1] itself does not require timeslicing. That is the queue is equal
priority to ELSP[0] and higher priority then ELSP[1]. Previously, we
would wait until the context switch to preempt the current ELSP[1], but
with timeslicing, we want to preempt ELSP[0] and replace it with the
queue.

Why we want to preempt ELSP[0] if the statement is queue is not higher 
priority from it?


In writing the test case, it become quickly apparent that we were also
suppressing the tasklet during promotion and so failing to notice when
the queue started requiring timeslicing.

Fixes: 2229adc81380 ("drm/i915/execlist: Trim immediate timeslice expiry")
Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
---
Kick execlists_submission_tasklet harder.
---
  drivers/gpu/drm/i915/gt/intel_lrc.c    |  22 +++-
  drivers/gpu/drm/i915/gt/selftest_lrc.c | 164 ++++++++++++++++++++++++-
  drivers/gpu/drm/i915/i915_scheduler.c  |  17 ++-
  drivers/gpu/drm/i915/i915_scheduler.h  |  18 ---
  4 files changed, 192 insertions(+), 29 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index e9fe9f79cedd..d0088d020220 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -352,10 +352,15 @@ static inline bool need_preempt(const struct intel_engine_cs *engine,
  	 * However, the priority hint is a mere hint that we may need to
  	 * preempt. If that hint is stale or we may be trying to preempt
  	 * ourselves, ignore the request.
+	 *
+	 * More naturally we would write
+	 *      prio >= max(0, last);
+	 * except that we wish to prevent triggering preemption at the same
+	 * priority level: the task that is running should remain running
+	 * to preserve FIFO ordering of dependencies.
  	 */
-	last_prio = effective_prio(rq);
-	if (!i915_scheduler_need_preempt(engine->execlists.queue_priority_hint,
-					 last_prio))
+	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
+	if (engine->execlists.queue_priority_hint <= last_prio)

Okay this eventually resolves to the exact same condition as before so 
it seems to only be churn as consequence of removing 
i915_scheduler_need_preempt. Then lets see..

  		return false;
  
  	/*
@@ -1509,8 +1514,17 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
  			 * submission.
  			 */
  			if (!list_is_last(&last->sched.link,
-					  &engine->active.requests))
+					  &engine->active.requests)) {
+				/*
+				 * Even if ELSP[1] is occupied and not worthy
+				 * of timeslices, our queue might be.
+				 */
+				if (!execlists->timer.expires &&
+				    need_timeslice(engine, last))
+					mod_timer(&execlists->timer,
+						  jiffies + 1);
  				return;
+			}
  
  			/*
  			 * WaIdleLiteRestore:bdw,skl
diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index 2868371c609e..5bcfe4a2466f 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -325,7 +325,13 @@ semaphore_queue(struct intel_engine_cs *engine, struct i915_vma *vma, int idx)
  	if (IS_ERR(rq))
  		goto out_ctx;
  
-	err = emit_semaphore_chain(rq, vma, idx);
+	err = 0;
+	if (rq->engine->emit_init_breadcrumb)
+		err = rq->engine->emit_init_breadcrumb(rq);
+	if (err == 0)
+		err = emit_semaphore_chain(rq, vma, idx);
+	if (err == 0)
+		i915_request_get(rq);
  	i915_request_add(rq);
  	if (err)
  		rq = ERR_PTR(err);
@@ -338,10 +344,10 @@ semaphore_queue(struct intel_engine_cs *engine, struct i915_vma *vma, int idx)
  static int
  release_queue(struct intel_engine_cs *engine,
  	      struct i915_vma *vma,
-	      int idx)
+	      int idx, int prio)
  {
  	struct i915_sched_attr attr = {
-		.priority = I915_USER_PRIORITY(I915_PRIORITY_MAX),
+		.priority = prio,
  	};
  	struct i915_request *rq;
  	u32 *cs;
@@ -362,9 +368,15 @@ release_queue(struct intel_engine_cs *engine,
  	*cs++ = 1;
  
  	intel_ring_advance(rq, cs);
+
+	i915_request_get(rq);
  	i915_request_add(rq);
  
+	local_bh_disable();
  	engine->schedule(rq, &attr);
+	local_bh_enable(); /* kick tasklet */
+
+	i915_request_put(rq);
  
  	return 0;
  }
@@ -383,7 +395,6 @@ slice_semaphore_queue(struct intel_engine_cs *outer,
  	if (IS_ERR(head))
  		return PTR_ERR(head);
  
-	i915_request_get(head);
  	for_each_engine(engine, outer->i915, id) {
  		for (i = 0; i < count; i++) {
  			struct i915_request *rq;
@@ -393,10 +404,12 @@ slice_semaphore_queue(struct intel_engine_cs *outer,
  				err = PTR_ERR(rq);
  				goto out;
  			}
+
+			i915_request_put(rq);
  		}
  	}
  
-	err = release_queue(outer, vma, n);
+	err = release_queue(outer, vma, n, INT_MAX);
  	if (err)
  		goto out;
  
@@ -482,6 +495,146 @@ static int live_timeslice_preempt(void *arg)
  	return err;
  }
  
+static int nop_request(struct intel_engine_cs *engine)
+{
+	struct i915_request *rq;
+
+	rq = i915_request_create(engine->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	i915_request_add(rq);
+	return 0;
+}
+
+static void wait_for_submit(struct intel_engine_cs *engine)
+{
+	do {
+		cond_resched();
+		intel_engine_flush_submission(engine);
+	} while (READ_ONCE(engine->execlists.pending[0]));
+}
+
+static int live_timeslice_queue(void *arg)
+{
+	struct intel_gt *gt = arg;
+	struct drm_i915_gem_object *obj;
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	struct i915_vma *vma;
+	void *vaddr;
+	int err = 0;
+
+	/*
+	 * Make sure that even if ELSP[0] and ELSP[1] are filled with
+	 * timeslicing between them disabled, we *do* enable timeslicing
+	 * if the queue demands it. (Normally, we do not submit if
+	 * ELSP[1] is already occupied, so must rely on timeslicing to
+	 * eject ELSP[0] in favour of the queue.)
+	 */
+
+	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
+	if (IS_ERR(obj))
+		return PTR_ERR(obj);
+
+	vma = i915_vma_instance(obj, &gt->ggtt->vm, NULL);
+	if (IS_ERR(vma)) {
+		err = PTR_ERR(vma);
+		goto err_obj;
+	}
+
+	vaddr = i915_gem_object_pin_map(obj, I915_MAP_WC);
+	if (IS_ERR(vaddr)) {
+		err = PTR_ERR(vaddr);
+		goto err_obj;
+	}
+
+	err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL);
+	if (err)
+		goto err_map;
+
+	for_each_engine(engine, gt->i915, id) {

Make it gt now.

+		struct i915_sched_attr attr = {
+			.priority = I915_USER_PRIORITY(I915_PRIORITY_MAX),
+		};
+		struct i915_request *rq;
+
+		if (!intel_engine_has_preemption(engine))
+			continue;
+
+		memset(vaddr, 0, PAGE_SIZE);
+
+		/* ELSP[0]: semaphore wait */
+		rq = semaphore_queue(engine, vma, 0);
+		if (IS_ERR(rq)) {
+			err = PTR_ERR(rq);
+			goto err_pin;
+		}
+		engine->schedule(rq, &attr);
+		wait_for_submit(engine);
+
+		/* ELSP[1]: nop request */
+		err = nop_request(engine);
+		if (err) {
+			i915_request_put(rq);
+			goto err_pin;
+		}
+		wait_for_submit(engine);
+
+		GEM_BUG_ON(i915_request_completed(rq));
+		GEM_BUG_ON(execlists_active(&engine->execlists) != rq);
+
+		/* Queue: semaphore signal, matching priority as semaphore */
+		err = release_queue(engine, vma, 1, effective_prio(rq));
+		if (err) {
+			i915_request_put(rq);
+			goto err_pin;
+		}
+
+		intel_engine_flush_submission(engine);
+		if (!READ_ONCE(engine->execlists.timer.expires) &&
+		    !i915_request_completed(rq)) {
+			struct drm_printer p =
+				drm_info_printer(gt->i915->drm.dev);
+
+			GEM_TRACE_ERR("%s: Failed to enable timeslicing!\n",
+				      engine->name);
+			intel_engine_dump(engine, &p,
+					  "%s\n", engine->name);
+			GEM_TRACE_DUMP();
+
+			memset(vaddr, 0xff, PAGE_SIZE);
+			err = -EINVAL;
+		}
+
+		/* Timeslice every jiffie, so within 2 we should signal */
+		if (i915_request_wait(rq, 0, 3) < 0) {
+			struct drm_printer p =
+				drm_info_printer(gt->i915->drm.dev);
+
+			pr_err("%s: Failed to timeslice into queue\n",
+			       engine->name);
+			intel_engine_dump(engine, &p,
+					  "%s\n", engine->name);
+
+			memset(vaddr, 0xff, PAGE_SIZE);
+			err = -EIO;
+		}
+		i915_request_put(rq);
+		if (err)
+			break;
+	}
+
+
+err_pin:
+	i915_vma_unpin(vma);
+err_map:
+	i915_gem_object_unpin_map(obj);
+err_obj:
+	i915_gem_object_put(obj);
+	return err;
+}
+
  static int live_busywait_preempt(void *arg)
  {
  	struct intel_gt *gt = arg;
@@ -2437,6 +2590,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
  		SUBTEST(live_unlite_switch),
  		SUBTEST(live_unlite_preempt),
  		SUBTEST(live_timeslice_preempt),
+		SUBTEST(live_timeslice_queue),
  		SUBTEST(live_busywait_preempt),
  		SUBTEST(live_preempt),
  		SUBTEST(live_late_preempt),
diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
index 7b84ebca2901..0ca40f6bf08c 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.c
+++ b/drivers/gpu/drm/i915/i915_scheduler.c
@@ -177,9 +177,22 @@ static inline int rq_prio(const struct i915_request *rq)
  	return rq->sched.attr.priority | __NO_PREEMPTION;
  }
  
+static inline bool need_preempt(int prio, int active)
+{
+	/*
+	 * Allow preemption of low -> normal -> high, but we do
+	 * not allow low priority tasks to preempt other low priority
+	 * tasks under the impression that latency for low priority
+	 * tasks does not matter (as much as background throughput),
+	 * so kiss.
+	 */
+	return prio >= max(I915_PRIORITY_NORMAL, active);

Is this the same as the current:

  return prio > max(I915_PRIORITY_NORMAL - 1, active);

Hm no, it now allows for high prio tasks to preempt equal high prio.

So it will kick the tasklet and dequeue will then decide no to 
need_preempt after all. Where it is the catch? Okay catch is in the 
other execlist_dequeue hunk - that it will now enable the timeslice timer.

I have to say it is getting very difficult to tie everything together. I 
wish for a simpler design with more centralized "magic" on hadling 
priorities etc.

+}
+
  static void kick_submission(struct intel_engine_cs *engine, int prio)
  {
-	const struct i915_request *inflight = *engine->execlists.active;
+	const struct i915_request *inflight =
+		execlists_active(&engine->execlists);
  
  	/*
  	 * If we are already the currently executing context, don't
@@ -188,7 +201,7 @@ static void kick_submission(struct intel_engine_cs *engine, int prio)
  	 * tasklet, i.e. we have not change the priority queue
  	 * sufficiently to oust the running context.
  	 */
-	if (!inflight || !i915_scheduler_need_preempt(prio, rq_prio(inflight)))
+	if (!inflight || !need_preempt(prio, rq_prio(inflight)))
  		return;
  
  	tasklet_hi_schedule(&engine->execlists.tasklet);
diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
index 7eefccff39bf..07d243acf553 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.h
+++ b/drivers/gpu/drm/i915/i915_scheduler.h
@@ -52,22 +52,4 @@ static inline void i915_priolist_free(struct i915_priolist *p)
  		__i915_priolist_free(p);
  }
  
-static inline bool i915_scheduler_need_preempt(int prio, int active)
-{
-	/*
-	 * Allow preemption of low -> normal -> high, but we do
-	 * not allow low priority tasks to preempt other low priority
-	 * tasks under the impression that latency for low priority
-	 * tasks does not matter (as much as background throughput),
-	 * so kiss.
-	 *
-	 * More naturally we would write
-	 *	prio >= max(0, last);
-	 * except that we wish to prevent triggering preemption at the same
-	 * priority level: the task that is running should remain running
-	 * to preserve FIFO ordering of dependencies.
-	 */
-	return prio > max(I915_PRIORITY_NORMAL - 1, active);
-}
-
  #endif /* _I915_SCHEDULER_H_ */


Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx