Re: [PATCH 3/5] drm/i915/execlists: Direct submit onto idle engines

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




On 09/05/2018 15:27, Chris Wilson wrote:
Bypass using the tasklet to submit the first request to HW, as the
tasklet may be deferred unto ksoftirqd and at a minimum will add in
excess of 10us (and maybe tens of milliseconds) to our execution
latency. This latency reduction is most notable when execution flows
between engines.

v2: Beware handling preemption completion from the direct submit path as
well.
v3: Make the abuse clear and track our extra state inside i915_tasklet.

Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
---
  drivers/gpu/drm/i915/i915_tasklet.h         | 24 +++++++
  drivers/gpu/drm/i915/intel_guc_submission.c | 10 ++-
  drivers/gpu/drm/i915/intel_lrc.c            | 71 +++++++++++++++++----
  3 files changed, 89 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_tasklet.h b/drivers/gpu/drm/i915/i915_tasklet.h
index 42b002b88edb..99e2fa2241ba 100644
--- a/drivers/gpu/drm/i915/i915_tasklet.h
+++ b/drivers/gpu/drm/i915/i915_tasklet.h
@@ -8,8 +8,11 @@
  #define _I915_TASKLET_H_
#include <linux/atomic.h>
+#include <linux/bitops.h>
  #include <linux/interrupt.h>
+#include "i915_gem.h"
+
  /**
   * struct i915_tasklet - wrapper around tasklet_struct
   *
@@ -19,6 +22,8 @@
   */
  struct i915_tasklet {
  	struct tasklet_struct base;
+	unsigned long flags;
+#define I915_TASKLET_DIRECT_SUBMIT BIT(0)

I would suggest a more generic name for the bit since i915_tasklet is generic-ish. For instance simply I915_TASKLET_DIRECT would signify the callback has been invoked directly and not (necessarily) from softirq context. Then it is for each user to know what that means for them specifically.

  };
static inline void i915_tasklet_init(struct i915_tasklet *t,
@@ -43,6 +48,14 @@ static inline bool i915_tasklet_is_enabled(const struct i915_tasklet *t)
  	return likely(!atomic_read(&t->base.count));
  }
+static inline bool i915_tasklet_is_direct_submit(const struct i915_tasklet *t)
+{
+	/* Only legal to be checked from inside the tasklet. */
+	GEM_BUG_ON(!i915_tasklet_is_running(t));
+
+	return t->flags & I915_TASKLET_DIRECT_SUBMIT;
+}

Or maybe i915_tasklet_direct_invocation?

+
  static inline void i915_tasklet_schedule(struct i915_tasklet *t)
  {
  	tasklet_hi_schedule(&t->base);
@@ -75,4 +88,15 @@ static inline void i915_tasklet_set_func(struct i915_tasklet *t,
  	i915_tasklet_unlock(t);
  }
+static inline void __i915_tasklet_run(const struct i915_tasklet *t)
+{
+	t->base.func(t->base.data);
+}
+
+static inline void i915_tasklet_run(const struct i915_tasklet *t)
+{
+	GEM_BUG_ON(!i915_tasklet_is_running(t));
+	__i915_tasklet_run(t);
+}
+
  #endif /* _I915_TASKLET_H_ */
diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
index a7afc976c3b9..f2ded1796523 100644
--- a/drivers/gpu/drm/i915/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/intel_guc_submission.c
@@ -754,14 +754,18 @@ static bool __guc_dequeue(struct intel_engine_cs *engine)
static void guc_dequeue(struct intel_engine_cs *engine)
  {
-	unsigned long flags;
+	unsigned long uninitialized_var(flags);
  	bool submit;
local_irq_save(flags); - spin_lock(&engine->timeline.lock);
+	if (!i915_tasklet_is_direct_submit(&engine->execlists.tasklet))
+		spin_lock(&engine->timeline.lock);
+
  	submit = __guc_dequeue(engine);
-	spin_unlock(&engine->timeline.lock);
+
+	if (!i915_tasklet_is_direct_submit(&engine->execlists.tasklet))
+		spin_unlock(&engine->timeline.lock);
if (submit)
  		guc_submit(engine);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 539fa03d7600..09fded9d409f 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -356,13 +356,15 @@ execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
  {
  	struct intel_engine_cs *engine =
  		container_of(execlists, typeof(*engine), execlists);
-	unsigned long flags;
+	unsigned long uninitialized_var(flags);
- spin_lock_irqsave(&engine->timeline.lock, flags);
+	if (!i915_tasklet_is_direct_submit(&execlists->tasklet))
+		spin_lock_irqsave(&engine->timeline.lock, flags);
__unwind_incomplete_requests(engine); - spin_unlock_irqrestore(&engine->timeline.lock, flags);
+	if (!i915_tasklet_is_direct_submit(&execlists->tasklet))
+		spin_unlock_irqrestore(&engine->timeline.lock, flags);
  }
static inline void
@@ -601,6 +603,8 @@ static bool __execlists_dequeue(struct intel_engine_cs *engine)
  		 */
  		GEM_BUG_ON(!execlists_is_active(execlists,
  						EXECLISTS_ACTIVE_USER));
+		GEM_BUG_ON(execlists_is_active(execlists,
+					       EXECLISTS_ACTIVE_PREEMPT));
  		GEM_BUG_ON(!port_count(&port[0]));
  		if (port_count(&port[0]) > 1)
  			return false;
@@ -757,12 +761,16 @@ static bool __execlists_dequeue(struct intel_engine_cs *engine)
  static void execlists_dequeue(struct intel_engine_cs *engine)
  {
  	struct intel_engine_execlists * const execlists = &engine->execlists;
-	unsigned long flags;
+	unsigned long uninitialized_var(flags);
  	bool submit;
- spin_lock_irqsave(&engine->timeline.lock, flags);
+	if (!i915_tasklet_is_direct_submit(&execlists->tasklet))
+		spin_lock_irqsave(&engine->timeline.lock, flags);
+
  	submit = __execlists_dequeue(engine);
-	spin_unlock_irqrestore(&engine->timeline.lock, flags);
+
+	if (!i915_tasklet_is_direct_submit(&execlists->tasklet))
+		spin_unlock_irqrestore(&engine->timeline.lock, flags);
if (submit)
  		execlists_submit_ports(engine);
@@ -1162,16 +1170,52 @@ static void queue_request(struct intel_engine_cs *engine,
  		      &lookup_priolist(engine, prio)->requests);
  }
-static void __submit_queue(struct intel_engine_cs *engine, int prio)
+static void __wakeup_queue(struct intel_engine_cs *engine, int prio)
  {
  	engine->execlists.queue_priority = prio;
+}

Why is this called wakeup? Plans to add something in it later?

+
+static void __schedule_queue(struct intel_engine_cs *engine)
+{
  	i915_tasklet_schedule(&engine->execlists.tasklet);
  }
+static bool __direct_submit(struct intel_engine_execlists *const execlists)
+{
+	struct i915_tasklet * const t = &execlists->tasklet;
+
+	if (!tasklet_trylock(&t->base))
+		return false;
+
+	t->flags |= I915_TASKLET_DIRECT_SUBMIT;
+	i915_tasklet_run(t);
+	t->flags &= ~I915_TASKLET_DIRECT_SUBMIT;
+
+	tasklet_unlock(&t->base);

Feels like this whole sequence belongs to i915_tasklet since it touches the internals. Maybe i915_tasklet_try_run, or i915_tasklet_run_or_schedule?

+	return true;
+}
+
+static void __submit_queue(struct intel_engine_cs *engine)
+{
+	struct intel_engine_execlists * const execlists = &engine->execlists;
+
+	GEM_BUG_ON(!engine->i915->gt.awake);
+
+	/* If inside GPU reset, the tasklet will be queued later. */
+	if (!i915_tasklet_is_enabled(&execlists->tasklet))
+		return;
+
+	/* Directly submit the first request to reduce the initial latency */
+	if (port_isset(execlists->port) || !__direct_submit(execlists))
+		__schedule_queue(engine);

Hmm a bit evil to maybe invoke in the condition. Would it be acceptable to:

if (!port_isset(...))
	i915_tasklet_run_or_schedule(...);
else
	i915_tasklet_schedule(...);

It's not ideal but maybe a bit better.

+}
+
  static void submit_queue(struct intel_engine_cs *engine, int prio)
  {
-	if (prio > engine->execlists.queue_priority)
-		__submit_queue(engine, prio);
+	if (prio > engine->execlists.queue_priority) {
+		__wakeup_queue(engine, prio);
+		__submit_queue(engine);
+	}
  }
static void execlists_submit_request(struct i915_request *request)
@@ -1183,10 +1227,9 @@ static void execlists_submit_request(struct i915_request *request)
  	spin_lock_irqsave(&engine->timeline.lock, flags);
queue_request(engine, &request->sched, rq_prio(request));
-	submit_queue(engine, rq_prio(request));
-
  	GEM_BUG_ON(!engine->execlists.first);
  	GEM_BUG_ON(list_empty(&request->sched.link));
+	submit_queue(engine, rq_prio(request));
spin_unlock_irqrestore(&engine->timeline.lock, flags);
  }
@@ -1314,8 +1357,10 @@ static void execlists_schedule(struct i915_request *request,
  		}
if (prio > engine->execlists.queue_priority &&
-		    i915_sw_fence_done(&sched_to_request(node)->submit))
-			__submit_queue(engine, prio);
+		    i915_sw_fence_done(&sched_to_request(node)->submit)) {
+			__wakeup_queue(engine, prio);
+			__schedule_queue(engine);
+		}
  	}
spin_unlock_irq(&engine->timeline.lock);


Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux