Re: [PATCH v2 06/11] drm/i915/scheduler: Record all dependencies upon request construction

Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxxxxxxxx> · Thu, 10 Nov 2016 15:36:17 +0000

On 10/11/2016 15:01, Chris Wilson wrote:
On Thu, Nov 10, 2016 at 02:45:39PM +0000, Tvrtko Ursulin wrote:

On 07/11/2016 13:59, Chris Wilson wrote:
The scheduler needs to know the dependencies of each request for the
lifetime of the request, as it may choose to reschedule the requests at
any time and must ensure the dependency tree is not broken. This is in
additional to using the fence to only allow execution after all
dependencies have been completed.

One option was to extend the fence to support the bidirectional
dependency tracking required by the scheduler. However the mismatch in
lifetimes between the submit fence and the request essentially meant
that we had to build a completely separate struct (and we could not
simply reuse the existing waitqueue in the fence for one half of the
dependency tracking). The extra dependency tracking simply did not mesh
well with the fence, and keeping it separate both keeps the fence
implementation simpler and allows us to extend the dependency tracking
into a priority tree (whilst maintaining support for reordering the
tree).

To avoid the additional allocations and list manipulations, the use of
the priotree is disabled when there are no schedulers to use it.

v2: Create a dedicated slab for i915_dependency.
   Rename the lists.

Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
---
drivers/gpu/drm/i915/i915_debugfs.c        |   7 +-
drivers/gpu/drm/i915/i915_drv.h            |   1 +
drivers/gpu/drm/i915/i915_gem.c            |  14 ++-
drivers/gpu/drm/i915/i915_gem_request.c    |  96 +++++++++++++++++++-
drivers/gpu/drm/i915/i915_gem_request.h    |  40 ++++++++-
drivers/gpu/drm/i915/i915_guc_submission.c |   1 +
drivers/gpu/drm/i915/intel_engine_cs.c     |   3 +-
drivers/gpu/drm/i915/intel_lrc.c           | 135 +++++++++++++++++++++++++++--
drivers/gpu/drm/i915/intel_ringbuffer.h    |   3 +-
9 files changed, 282 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 3cb96d260dfb..dac435680e98 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -631,8 +631,9 @@ static void print_request(struct seq_file *m,
			  struct drm_i915_gem_request *rq,
			  const char *prefix)
{
-	seq_printf(m, "%s%x [%x:%x] @ %d: %s\n", prefix,
+	seq_printf(m, "%s%x [%x:%x] prio=%d @ %dms: %s\n", prefix,
		   rq->global_seqno, rq->ctx->hw_id, rq->fence.seqno,
+		   rq->priotree.priority,
		   jiffies_to_msecs(jiffies - rq->emitted_jiffies),
		   rq->timeline->common->name);
}
@@ -3218,6 +3219,7 @@ static int i915_engine_info(struct seq_file *m, void *unused)

		if (i915.enable_execlists) {
			u32 ptr, read, write;
+			struct rb_node *rb;

			seq_printf(m, "\tExeclist status: 0x%08x %08x\n",
				   I915_READ(RING_EXECLIST_STATUS_LO(engine)),
@@ -3257,7 +3259,8 @@ static int i915_engine_info(struct seq_file *m, void *unused)
			rcu_read_unlock();

			spin_lock_irq(&engine->timeline->lock);
-			list_for_each_entry(rq, &engine->execlist_queue, execlist_link) {
+			for (rb = engine->execlist_first; rb; rb = rb_next(rb)) {
+				rq = rb_entry(rb, typeof(*rq), priotree.node);
				print_request(m, rq, "\t\tQ ");
			}
			spin_unlock_irq(&engine->timeline->lock);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 4735b4177100..e790147209f3 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1791,6 +1791,7 @@ struct drm_i915_private {
	struct kmem_cache *objects;
	struct kmem_cache *vmas;
	struct kmem_cache *requests;
+	struct kmem_cache *dependencies;

	const struct intel_device_info info;

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index df803e82eb07..a4dc2da2323a 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2693,10 +2693,11 @@ static void i915_gem_cleanup_engine(struct intel_engine_cs *engine)

		spin_lock_irqsave(&engine->timeline->lock, flags);

-		INIT_LIST_HEAD(&engine->execlist_queue);
		i915_gem_request_put(engine->execlist_port[0].request);
		i915_gem_request_put(engine->execlist_port[1].request);
		memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
+		engine->execlist_queue = RB_ROOT;
+		engine->execlist_first = NULL;

		spin_unlock_irqrestore(&engine->timeline->lock, flags);
	}
@@ -4754,12 +4755,18 @@ i915_gem_load_init(struct drm_device *dev)
	if (!dev_priv->requests)
		goto err_vmas;

+	dev_priv->dependencies = KMEM_CACHE(i915_dependency,
+					    SLAB_HWCACHE_ALIGN |
+					    SLAB_RECLAIM_ACCOUNT);
+	if (!dev_priv->dependencies)
+		goto err_requests;
+
	mutex_lock(&dev_priv->drm.struct_mutex);
	INIT_LIST_HEAD(&dev_priv->gt.timelines);
	err = i915_gem_timeline_init__global(dev_priv);
	mutex_unlock(&dev_priv->drm.struct_mutex);
	if (err)
-		goto err_requests;
+		goto err_dependencies;

	INIT_LIST_HEAD(&dev_priv->context_list);
	INIT_WORK(&dev_priv->mm.free_work, __i915_gem_free_work);
@@ -4787,6 +4794,8 @@ i915_gem_load_init(struct drm_device *dev)

	return 0;

+err_dependencies:
+	kmem_cache_destroy(dev_priv->dependencies);
err_requests:
	kmem_cache_destroy(dev_priv->requests);
err_vmas:
@@ -4803,6 +4812,7 @@ void i915_gem_load_cleanup(struct drm_device *dev)

	WARN_ON(!llist_empty(&dev_priv->mm.free_list));

+	kmem_cache_destroy(dev_priv->dependencies);
	kmem_cache_destroy(dev_priv->requests);
	kmem_cache_destroy(dev_priv->vmas);
	kmem_cache_destroy(dev_priv->objects);
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 93f77df9bc51..278b103a4e95 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -113,6 +113,82 @@ i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
	spin_unlock(&file_priv->mm.lock);
}

+static struct i915_dependency *
+i915_dependency_alloc(struct drm_i915_private *i915)
+{
+	return kmem_cache_alloc(i915->dependencies, GFP_KERNEL);
+}
+
+static void
+i915_dependency_free(struct drm_i915_private *i915,
+		     struct i915_dependency *dep)
+{
+	kmem_cache_free(i915->dependencies, dep);
+}
+
+static void
+__i915_priotree_add_dependency(struct i915_priotree *pt,
+			       struct i915_priotree *signal,
+			       struct i915_dependency *dep,
+			       unsigned long flags)
+{
+	INIT_LIST_HEAD(&dep->dfs_link);
+	list_add(&dep->wait_link, &signal->waiters_list);
+	list_add(&dep->signal_link, &pt->signalers_list);
+	dep->signaler = signal;
+	dep->flags = flags;
+}
+
+static int
+i915_priotree_add_dependency(struct drm_i915_private *i915,
+			     struct i915_priotree *pt,
+			     struct i915_priotree *signal)
+{
+	struct i915_dependency *dep;
+
+	dep = i915_dependency_alloc(i915);
+	if (!dep)
+		return -ENOMEM;
+
+	__i915_priotree_add_dependency(pt, signal, dep, I915_DEPENDENCY_ALLOC);
+	return 0;
+}
+
+static void
+i915_priotree_fini(struct drm_i915_private *i915, struct i915_priotree *pt)
+{
+	struct i915_dependency *dep, *next;
+
+	GEM_BUG_ON(!RB_EMPTY_NODE(&pt->node));
+
+	/* Everyone we depended upon (the fences we wait to be signaled)
+	 * should retire before us and remove themselves from our list.
+	 * However, retirement is run independently on each timeline and
+	 * so we may be called out-of-order.
+	 */
+	list_for_each_entry_safe(dep, next, &pt->signalers_list, signal_link) {
+		list_del(&dep->wait_link);
+		if (dep->flags & I915_DEPENDENCY_ALLOC)
+			i915_dependency_free(i915, dep);
+	}
+
+	/* Remove ourselves from everyone who depends upon us */
+	list_for_each_entry_safe(dep, next, &pt->waiters_list, wait_link) {
+		list_del(&dep->signal_link);
+		if (dep->flags & I915_DEPENDENCY_ALLOC)
+			i915_dependency_free(i915, dep);
+	}
+}
+
+static void
+i915_priotree_init(struct i915_priotree *pt)
+{
+	INIT_LIST_HEAD(&pt->signalers_list);
+	INIT_LIST_HEAD(&pt->waiters_list);
+	RB_CLEAR_NODE(&pt->node);
+	pt->priority = INT_MIN;

Not I915_PRIORITY_MIN? Or it has to be smaller? In which case
BUILD_BUG_ON(INT_MIN >= I915_PRIORITY_MIN)?

I wanted it to be smaller than min so that the unset value was clear in
any debug trace.

+		rb = rb_next(rb);
+		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
+		RB_CLEAR_NODE(&cursor->priotree.node);
+		cursor->priotree.priority = INT_MAX;

What does setting the priority to INT_MAX here do?

It is used as a signal that the request has been sent to hardware
both as a shortcircuit for the dfs and so that it is clear in the
debugfs.

+static struct intel_engine_cs *
+pt_lock_engine(struct i915_priotree *pt, struct intel_engine_cs *locked)
+{
+	struct intel_engine_cs *engine;
+
+	engine = container_of(pt,
+			      struct drm_i915_gem_request,
+			      priotree)->engine;
+	if (engine != locked) {
+		if (locked)
+			spin_unlock_irq(&locked->timeline->lock);
+		spin_lock_irq(&engine->timeline->lock);
+	}
+
+	return engine;
+}

Ha, cute. :)

+
+static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
+{
+	struct intel_engine_cs *engine = NULL;
+	struct i915_dependency *dep, *p;
+	struct i915_dependency stack;
+	LIST_HEAD(dfs);
+
+	if (prio <= READ_ONCE(request->priotree.priority))
+		return;
+
+	/* Need BKL in order to use the temporary link inside i915_dependency */
+	lockdep_assert_held(&request->i915->drm.struct_mutex);

Could use a new i915 submission lock, but I suppose this is OK to
start with.

+
+	stack.signaler = &request->priotree;
+	list_add(&stack.dfs_link, &dfs);
+

Tada, onto the DFS which I am not familiar with - but there's always
Wikipedia. :)

+	/* Recursively bump all dependent priorities to match the new request */
+	list_for_each_entry_safe(dep, p, &dfs, dfs_link) {
+		struct i915_priotree *pt = dep->signaler;
+
+		list_for_each_entry(p, &pt->signalers_list, signal_link)
+			if (prio > READ_ONCE(p->signaler->priority))
+				list_move_tail(&p->dfs_link, &dfs);
+
+		p = list_first_entry(&dep->dfs_link, typeof(*p), dfs_link);

This was fun. :)

Maybe clearer as:

p = list_next_entry(dep, typeof(*p), dfs_link) ?


+		if (!RB_EMPTY_NODE(&pt->node))
+			continue;
+
+		engine = pt_lock_engine(pt, engine);
+
+		/* If it is not already in the rbtree, we can update the
+		 * priority inplace and skip over it (and its dependencies)
+		 * if it is referenced again as we descend the dfs.
+		 */

Are you sure it is OK to ignore the dependencies for new requests? I
don't see why it would be.

We don't ignore the dependencies for new requests, they have already
been added to the list to be processed. What we are considering here is
what happens if this request is a dependency of a subsequent request in
the list. My statement is that since we are not in the rbtree, we are
not ready to be run (and in turn neither will be the following request
that depended upon this request). As we are not in the rbtree, we do not
need to reorder the rbtree to ensure the fifo ordering with our
dependencies.

Yes I've lost context while looking at it, you are right.

+		if (prio > pt->priority && RB_EMPTY_NODE(&pt->node)) {

Isn't the node guaranteed to be empty from the earlier test and continue?

We checked before we were certain we had the spinlock.

I missed that as well, makes sense.


+			pt->priority = prio;
+			list_del_init(&dep->dfs_link);
+		}
+	}
+
+	/* Fifo and depth-first replacement ensure our deps execute before us */
+	list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) {
+		struct i915_priotree *pt = dep->signaler;
+
+		INIT_LIST_HEAD(&dep->dfs_link);
+
+		engine = pt_lock_engine(pt, engine);
+
+		if (prio <= pt->priority)
+			continue;

How would these priorities end up in the list? The first loop skips
over them.

We keep on dropping the lock.

Yeah, looks OK to me.

Regards,

Tvrtko

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx