Re: [PATCH 06/12] drm/i915/scheduler: Execute requests in order of priorities

Hi,

1st pass comments only for now.

On 02/11/2016 17:50, Chris Wilson wrote:
Track the priority of each request and use it to determine the order in
which we submit requests to the hardware via execlists.

The priority of the request is determined by the user (eventually via
the context) but may be overridden at any time by the driver. When we set
the priority of the request, we bump the priority of all of its
dependencies to match - so that a high priority drawing operation is not
stuck behind a background task.

When the request is ready to execute (i.e. we have signaled the submit
fence following completion of all its dependencies, including third
party fences), we put the request into a priority sorted rbtree to be
submitted to the hardware. If the request is higher priority than all
pending requests, it will be submitted on the next context-switch
interrupt as soon as the hardware has completed the current request. We
do not currently preempt an already executing request to immediately run a
very high priority request, at least not yet.

One more limitation is that this first implementation is for execlists only,
so it is currently limited to gen8/gen9.

Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
---
 drivers/gpu/drm/i915/i915_debugfs.c        |   7 +-
 drivers/gpu/drm/i915/i915_gem.c            |   3 +-
 drivers/gpu/drm/i915/i915_gem_request.c    |   4 ++
 drivers/gpu/drm/i915/i915_gem_request.h    |   6 +-
 drivers/gpu/drm/i915/i915_guc_submission.c |   4 ++
 drivers/gpu/drm/i915/intel_engine_cs.c     |   3 +-
 drivers/gpu/drm/i915/intel_lrc.c           | 104 ++++++++++++++++++++++++-----
 drivers/gpu/drm/i915/intel_ringbuffer.h    |   3 +-
 8 files changed, 109 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 3cb96d260dfb..dac435680e98 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -631,8 +631,9 @@ static void print_request(struct seq_file *m,
 			  struct drm_i915_gem_request *rq,
 			  const char *prefix)
 {
-	seq_printf(m, "%s%x [%x:%x] @ %d: %s\n", prefix,
+	seq_printf(m, "%s%x [%x:%x] prio=%d @ %dms: %s\n", prefix,

Noticed this is quite unreadable due to the range of INT_MIN to INT_MAX. Do we need such granularity, or could we use something smaller so it looks nicer here? I know, the argument is quite poor. :)
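
For example (the bounds and names below are made up, not from this patch), one option would be to clamp whatever the user and driver may ask for to a small symmetric range, keeping INT_MIN/INT_MAX purely as internal markers ("unset" in i915_priotree_init() and "submitted" in execlists_dequeue()):

        /* Hypothetical clamp applied wherever a priority is chosen, so that
         * debugfs only ever prints small numbers.
         */
        #define I915_USER_PRIORITY_MAX  1023
        #define I915_USER_PRIORITY_MIN -1023

        static int i915_clamp_user_priority(int prio)
        {
                return clamp(prio, I915_USER_PRIORITY_MIN, I915_USER_PRIORITY_MAX);
        }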

 		   rq->global_seqno, rq->ctx->hw_id, rq->fence.seqno,
+		   rq->priotree.priority,
 		   jiffies_to_msecs(jiffies - rq->emitted_jiffies),
 		   rq->timeline->common->name);
 }
@@ -3218,6 +3219,7 @@ static int i915_engine_info(struct seq_file *m, void *unused)

 		if (i915.enable_execlists) {
 			u32 ptr, read, write;
+			struct rb_node *rb;

 			seq_printf(m, "\tExeclist status: 0x%08x %08x\n",
 				   I915_READ(RING_EXECLIST_STATUS_LO(engine)),
@@ -3257,7 +3259,8 @@ static int i915_engine_info(struct seq_file *m, void *unused)
 			rcu_read_unlock();

 			spin_lock_irq(&engine->timeline->lock);
-			list_for_each_entry(rq, &engine->execlist_queue, execlist_link) {
+			for (rb = engine->execlist_first; rb; rb = rb_next(rb)) {
+				rq = rb_entry(rb, typeof(*rq), priotree.node);
 				print_request(m, rq, "\t\tQ ");
 			}
 			spin_unlock_irq(&engine->timeline->lock);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index cf28666021f8..4697848ecfd9 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2687,10 +2687,11 @@ static void i915_gem_cleanup_engine(struct intel_engine_cs *engine)

 	if (i915.enable_execlists) {
 		spin_lock_irq(&engine->timeline->lock);
-		INIT_LIST_HEAD(&engine->execlist_queue);
 		i915_gem_request_put(engine->execlist_port[0].request);
 		i915_gem_request_put(engine->execlist_port[1].request);
 		memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
+		engine->execlist_queue = RB_ROOT;
+		engine->execlist_first = NULL;
 		spin_unlock_irq(&engine->timeline->lock);
 	}
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 13090f226203..5f0068fac3e9 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -141,6 +141,8 @@ i915_priotree_fini(struct i915_priotree *pt)
 {
 	struct i915_dependency *dep, *next;

+	GEM_BUG_ON(!RB_EMPTY_NODE(&pt->node));
+
 	/* Everyone we depended upon should retire before us and remove
 	 * themselves from our list. However, retirement is run independently
 	 * on each timeline and so we may be called out-of-order.
@@ -164,6 +166,8 @@ i915_priotree_init(struct i915_priotree *pt)
 {
 	INIT_LIST_HEAD(&pt->pre_list);
 	INIT_LIST_HEAD(&pt->post_list);
+	RB_CLEAR_NODE(&pt->node);
+	pt->priority = INT_MIN;
 }

 void i915_gem_retire_noop(struct i915_gem_active *active,
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 0cb2ba546062..4288f512ac51 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -51,6 +51,9 @@ struct i915_dependency {
 struct i915_priotree {
 	struct list_head pre_list; /* who is before us, we depend upon */
 	struct list_head post_list; /* who is after us, they depend upon us */
+	struct rb_node node;
+	int priority;
+#define I915_PRIORITY_MAX 1024

I915_PRIORITY_MAX is unused in this patch.

 };

 /**
@@ -168,9 +171,6 @@ struct drm_i915_gem_request {
 	struct drm_i915_file_private *file_priv;
 	/** file_priv list entry for this request */
 	struct list_head client_list;
-
-	/** Link in the execlist submission queue, guarded by execlist_lock. */
-	struct list_head execlist_link;
 };

 extern const struct dma_fence_ops i915_fence_ops;
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index ddb574d5ceda..b31573a825fa 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -640,6 +640,9 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
 	struct i915_guc_client *client = guc->execbuf_client;
 	int b_ret;

+	rq->previous_context = rq->engine->last_context;
+	rq->engine->last_context = rq->ctx;
+
 	__i915_gem_request_submit(rq);

 	spin_lock(&client->wq_lock);
@@ -1522,6 +1525,7 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
 	/* Take over from manual control of ELSP (execlists) */
 	for_each_engine(engine, dev_priv, id) {
 		engine->submit_request = i915_guc_submit;
+		engine->schedule = NULL;

 		/* Replay the current set of previously submitted requests */
 		list_for_each_entry(request,
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index c9171a058478..3da4d466e332 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -239,7 +239,8 @@ static void intel_engine_init_timeline(struct intel_engine_cs *engine)
  */
 void intel_engine_setup_common(struct intel_engine_cs *engine)
 {
-	INIT_LIST_HEAD(&engine->execlist_queue);
+	engine->execlist_queue = RB_ROOT;
+	engine->execlist_first = NULL;

 	intel_engine_init_timeline(engine);
 	intel_engine_init_hangcheck(engine);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 186c3ccb2c34..70ac74c959bd 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -432,9 +432,10 @@ static bool can_merge_ctx(const struct i915_gem_context *prev,

 static void execlists_dequeue(struct intel_engine_cs *engine)
 {
-	struct drm_i915_gem_request *cursor, *last;
+	struct drm_i915_gem_request *last;
 	struct execlist_port *port = engine->execlist_port;
 	unsigned long flags;
+	struct rb_node *rb;
 	bool submit = false;

 	last = port->request;
@@ -471,7 +472,11 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 	 */

 	spin_lock_irqsave(&engine->timeline->lock, flags);
-	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link) {
+	rb = engine->execlist_first;
+	while (rb) {
+		struct drm_i915_gem_request *cursor =
+			rb_entry(rb, typeof(*cursor), priotree.node);
+
 		/* Can we combine this request with the current port? It has to
 		 * be the same context/ringbuffer and not have any exceptions
 		 * (e.g. GVT saying never to combine contexts).
@@ -503,16 +508,27 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 			port++;
 		}

+		rb = rb_next(rb);
+		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
+		RB_CLEAR_NODE(&cursor->priotree.node);
+		cursor->priotree.priority = INT_MAX;
+
+		/* We keep the previous context alive until we retire the
+		 * following request. This ensures that any the context object
+		 * is still pinned for any residual writes the HW makes into
+		 * it on the context switch into the next object following the
+		 * breadcrumb. Otherwise, we may retire the context too early.
+		 */
+		cursor->previous_context = engine->last_context;
+		engine->last_context = cursor->ctx;
+

Will we need a knob to control the amount of context merging? Until preemption arrives, that is - similar to the GuC queue depth "nerfing" from the other patch.
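
To make the question concrete, a rough sketch of such a knob (i915.exec_merge_max is an invented module parameter, not part of this series) - the dequeue loop would call this instead of can_merge_ctx() directly, counting how many requests it has already coalesced into the port:

        /* Hypothetical helper: refuse to merge once the (made up) cap is
         * reached, 0 meaning unlimited as with other i915 module parameters.
         */
        static bool can_merge_more(const struct drm_i915_gem_request *prev,
                                   const struct drm_i915_gem_request *next,
                                   unsigned int already_merged)
        {
                if (!can_merge_ctx(prev->ctx, next->ctx))
                        return false;

                if (i915.exec_merge_max &&
                    already_merged >= i915.exec_merge_max)
                        return false;

                return true;
        }

execlists_dequeue() would then keep a small per-port counter and stop merging once can_merge_more() says no.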

 		__i915_gem_request_submit(cursor);
 		last = cursor;
 		submit = true;
 	}
 	if (submit) {
-		/* Decouple all the requests submitted from the queue */
-		engine->execlist_queue.next = &cursor->execlist_link;
-		cursor->execlist_link.prev = &engine->execlist_queue;
-
 		i915_gem_request_assign(&port->request, last);
+		engine->execlist_first = rb;
 	}
 	spin_unlock_irqrestore(&engine->timeline->lock, flags);

@@ -595,26 +611,77 @@ static void intel_lrc_irq_handler(unsigned long data)
 	intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
 }

+static bool insert_request(struct i915_priotree *pt, struct rb_root *root)
+{
+	struct rb_node **p, *rb;
+	bool first = true;
+
+	/* most positive priority is scheduled first, equal priorities fifo */
+	rb = NULL;
+	p = &root->rb_node;
+	while (*p) {
+		struct i915_priotree *pos;
+
+		rb = *p;
+		pos = rb_entry(rb, typeof(*pos), node);
+		if (pt->priority > pos->priority) {
+			p = &rb->rb_left;
+		} else {
+			p = &rb->rb_right;
+			first = false;
+		}
+	}
+	rb_link_node(&pt->node, rb, p);
+	rb_insert_color(&pt->node, root);
+
+	return first;
+}
+
 static void execlists_submit_request(struct drm_i915_gem_request *request)
 {
 	struct intel_engine_cs *engine = request->engine;

 	assert_spin_locked(&engine->timeline->lock);

-	/* We keep the previous context alive until we retire the following
-	 * request. This ensures that any the context object is still pinned
-	 * for any residual writes the HW makes into it on the context switch
-	 * into the next object following the breadcrumb. Otherwise, we may
-	 * retire the context too early.
-	 */
-	request->previous_context = engine->last_context;
-	engine->last_context = request->ctx;
-
-	list_add_tail(&request->execlist_link, &engine->execlist_queue);
+	if (insert_request(&request->priotree, &engine->execlist_queue))
+		engine->execlist_first = &request->priotree.node;
 	if (execlists_elsp_idle(engine))
 		tasklet_hi_schedule(&engine->irq_tasklet);
 }

+static void update_priorities(struct i915_priotree *pt, int prio)
+{
+	struct drm_i915_gem_request *request =
+		container_of(pt, struct drm_i915_gem_request, priotree);
+	struct intel_engine_cs *engine = request->engine;
+	struct i915_dependency *dep;
+
+	if (prio <= READ_ONCE(pt->priority))
+		return;
+
+	/* Recursively bump all dependent priorities to match the new request */
+	list_for_each_entry(dep, &pt->pre_list, pre_link)
+		update_priorities(dep->signal, prio);

John got into trouble with recursion in his scheduler, used for the same thing AFAIR. Or was it the priority bumping? Either way, it may well be necessary to avoid it here.
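
Just to sketch the alternative I have in mind - an iterative walk, assuming struct i915_dependency grows a dfs_link list head that is INIT_LIST_HEAD()ed wherever the dependency is created (all new names below are made up):

        static void execlists_schedule_iterative(struct drm_i915_gem_request *request,
                                                 int prio)
        {
                struct i915_dependency stack, *dep, *p, *tmp;
                LIST_HEAD(dfs);

                if (prio <= READ_ONCE(request->priotree.priority))
                        return;

                /* Seed the walk with the request itself via an on-stack dep */
                stack.signal = &request->priotree;
                list_add(&stack.dfs_link, &dfs);

                /* Flatten the dependency graph onto a local list: every
                 * dependency still below the new priority is appended and
                 * visited later by the same loop, so no recursion and no
                 * stack depth to worry about. Requires dep->dfs_link to be
                 * initialised at creation (the assumption above).
                 */
                list_for_each_entry(dep, &dfs, dfs_link) {
                        struct i915_priotree *pt = dep->signal;

                        list_for_each_entry(p, &pt->pre_list, pre_link)
                                if (prio > READ_ONCE(p->signal->priority))
                                        list_move_tail(&p->dfs_link, &dfs);
                }

                /* Walk back to front so the deepest dependencies are
                 * reinserted first; fifo ordering at equal priority then
                 * keeps them ahead of the requests that wait on them.
                 */
                list_for_each_entry_safe_reverse(dep, tmp, &dfs, dfs_link) {
                        struct i915_priotree *pt = dep->signal;
                        struct intel_engine_cs *engine =
                                container_of(pt, struct drm_i915_gem_request,
                                             priotree)->engine;

                        INIT_LIST_HEAD(&dep->dfs_link);

                        spin_lock_irq(&engine->timeline->lock);
                        if (prio > pt->priority) {
                                pt->priority = prio;
                                if (!RB_EMPTY_NODE(&pt->node)) {
                                        rb_erase(&pt->node, &engine->execlist_queue);
                                        if (insert_request(pt, &engine->execlist_queue))
                                                engine->execlist_first = &pt->node;
                                }
                        }
                        spin_unlock_irq(&engine->timeline->lock);
                }
        }

The per-node locking just mirrors what the recursion does today; if all the dependencies end up on the same engine the lock could obviously be hoisted out of the loop.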

+
+	/* Fifo and depth-first replacement ensure our deps execute before us */
+	spin_lock_irq(&engine->timeline->lock);
+	pt->priority = prio;
+	if (!RB_EMPTY_NODE(&pt->node)) {
+		rb_erase(&pt->node, &engine->execlist_queue);
+		if (insert_request(pt, &engine->execlist_queue))
+			engine->execlist_first = &pt->node;
+	}
+	spin_unlock_irq(&engine->timeline->lock);
+}
+
+static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
+{
+	/* Make sure that our entire dependency chain shares our prio */
+	update_priorities(&request->priotree, prio);
+
+	/* XXX Do we need to preempt to make room for us and our deps? */
+}

Hm, why isn't the scheduling logic backend agnostic? Is it just much simpler to do the first implementation like this?
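
Roughly what I am picturing (all names invented, just to show the split): the dependency walk and priority bump live in common request code, and each backend only provides the requeue step, e.g.:

        /* Backend agnostic part, e.g. in i915_gem_request.c: bump one node
         * and ask the backend to requeue it via a hypothetical
         * engine->requeue_priotree() hook.
         */
        static void i915_priotree_bump(struct i915_priotree *pt, int prio)
        {
                struct drm_i915_gem_request *rq =
                        container_of(pt, struct drm_i915_gem_request, priotree);
                struct intel_engine_cs *engine = rq->engine;

                spin_lock_irq(&engine->timeline->lock);
                pt->priority = prio;
                if (engine->requeue_priotree)
                        engine->requeue_priotree(engine, pt);
                spin_unlock_irq(&engine->timeline->lock);
        }

        /* The only execlists specific piece, called under the timeline lock */
        static void execlists_requeue_priotree(struct intel_engine_cs *engine,
                                               struct i915_priotree *pt)
        {
                if (!RB_EMPTY_NODE(&pt->node)) {
                        rb_erase(&pt->node, &engine->execlist_queue);
                        if (insert_request(pt, &engine->execlist_queue))
                                engine->execlist_first = &pt->node;
                }
        }

The GuC backend could then either leave requeue_priotree NULL (as it leaves schedule NULL in this patch) or plug in its own reordering later.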

+
 int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request)
 {
 	struct intel_engine_cs *engine = request->engine;
@@ -1651,8 +1718,10 @@ void intel_execlists_enable_submission(struct drm_i915_private *dev_priv)
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;

-	for_each_engine(engine, dev_priv, id)
+	for_each_engine(engine, dev_priv, id) {
 		engine->submit_request = execlists_submit_request;
+		engine->schedule = execlists_schedule;
+	}
 }

 static void
@@ -1665,6 +1734,7 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
 	engine->emit_breadcrumb = gen8_emit_breadcrumb;
 	engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;
 	engine->submit_request = execlists_submit_request;
+	engine->schedule = execlists_schedule;

 	engine->irq_enable = gen8_logical_ring_enable_irq;
 	engine->irq_disable = gen8_logical_ring_disable_irq;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 75991a3c694b..cbc148863a03 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -348,7 +348,8 @@ struct intel_engine_cs {
 		struct drm_i915_gem_request *request;
 		unsigned int count;
 	} execlist_port[2];
-	struct list_head execlist_queue;
+	struct rb_root execlist_queue;
+	struct rb_node *execlist_first;
 	unsigned int fw_domains;
 	bool disable_lite_restore_wa;
 	bool preempt_wa;


Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx



