This patch adds support for 'native' dependencies to DRM scheduler. In drivers that use a firmware based scheduler there are performance gains to be had by allowing waits to happen in the firmware, as this reduces the latency between signalling and job submission. Dependencies that can be awaited by the firmware scheduler are termed 'native dependencies'. In the new PowerVR driver we delegate the waits to the firmware, but it is still necessary to expose these fences within DRM scheduler. This is because when a job is cancelled drm_sched_entity_kill() registers callback to all the dependencies in order to ensure the job finished fence is not signalled before all the job dependencies are met, and if they were not exposed the core wouldn't be able to guarantee that anymore, and it might signal the fence too early leading to potential invalid accesses if other things depend on the job finished fence. All dependencies are handled in the same way up to the point that dependencies for a job are being checked. At this stage, DRM scheduler will now allow job submission to proceed once it encounters the first native dependency in the list - dependencies having been sorted beforehand in drm_sched_job_arm() so that native ones appear last. The list is sorted during drm_sched_job_arm() because the scheduler isn't known until this point, and determining whether a dependency is native is via a new drm_gpu_scheduler backend operation. Native fences are just simple counters that get incremented every time some specific execution point is reached, like when a GPU job is done. The firmware is in charge of waiting but also updating fences, so it can easily unblock any waiters it has internally. The CPU also has access to these counters, so it can also check for progress. 
TODO: When operating normally the CPU is not supposed to update the counters itself, but there is one specific situation where this is needed - when a GPU hang happened and some context were declared faulty because they had unfinished or blocked jobs pending. In that situation, when we reset the GPU we will evict faulty contexts so they can't submit jobs anymore and we will cancel the jobs that were in-flight at the time of reset, but that's not enough because some jobs on other non-faulty contexts might have native dependencies on jobs that never completed on this faulty context. If we were to ask the firmware to wait on those native fences, it would block indefinitely, because no one would ever update the counter. So, in that case, and that case only, we want the CPU to force-update the counter and set it to the last issued sequence number. We do not currently have a helper for this and we welcome any suggestions for how best to implement it. Signed-off-by: Donald Robson <donald.robson@xxxxxxxxxx> Cc: Luben Tuikov <luben.tuikov@xxxxxxx> Cc: David Airlie <airlied@xxxxxxxxx> Cc: Daniel Vetter <daniel@xxxxxxxx> Cc: Sumit Semwal <sumit.semwal@xxxxxxxxxx> Cc: "Christian König" <christian.koenig@xxxxxxx> Cc: Boris Brezillon <boris.brezillon@xxxxxxxxxxxxx> Cc: Frank Binns <frank.binns@xxxxxxxxxx> Cc: Sarah Walker <sarah.walker@xxxxxxxxxx> --- drivers/gpu/drm/scheduler/sched_entity.c | 60 +++++++++++++-- drivers/gpu/drm/scheduler/sched_main.c | 96 ++++++++++++++++++++++++ include/drm/gpu_scheduler.h | 11 +++ 3 files changed, 161 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c index 15d04a0ec623..2685805a5e05 100644 --- a/drivers/gpu/drm/scheduler/sched_entity.c +++ b/drivers/gpu/drm/scheduler/sched_entity.c @@ -155,13 +155,14 @@ static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f, { struct drm_sched_job *job = container_of(cb, struct drm_sched_job, finish_cb); + unsigned long idx; 
int r; dma_fence_put(f); /* Wait for all dependencies to avoid data corruptions */ - while (!xa_empty(&job->dependencies)) { - f = xa_erase(&job->dependencies, job->last_dependency++); + xa_for_each(&job->dependencies, idx, f) { + xa_erase(&job->dependencies, idx); r = dma_fence_add_callback(f, &job->finish_cb, drm_sched_entity_kill_jobs_cb); if (!r) @@ -390,12 +391,59 @@ static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity) return false; } +/** + * dep_is_native - indicates that native dependencies are supported and that the + * dependency at @index is marked. + * @job: Scheduler job. + * @index: Index into the @job->dependencies xarray. + * + * Must only be used after calling drm_sched_job_arm(). + * + * Returns true if both these conditions are met. + */ +static bool dep_is_native(struct drm_sched_job *job, unsigned long index) +{ + return job->sched->ops->dependency_is_native && + xa_get_mark(&job->dependencies, job->last_dependency, XA_MARK_0); +} + static struct dma_fence * -drm_sched_job_dependency(struct drm_sched_job *job, - struct drm_sched_entity *entity) +drm_sched_job_dependency(struct drm_sched_job *job, struct drm_sched_entity *entity) { - if (!xa_empty(&job->dependencies)) - return xa_erase(&job->dependencies, job->last_dependency++); + struct dma_fence *fence; + unsigned long dep_index; + + if (!dep_is_native(job, job->last_dependency)) { + fence = xa_erase(&job->dependencies, job->last_dependency++); + if (fence) + return fence; + } + + xa_for_each_start(&job->dependencies, dep_index, fence, + job->last_dependency) { + /* + * Encountered first native dependency. Since these were + * previously sorted to the end of the array in + * drm_sched_sort_native_deps(), all remaining entries + * will be native too, so we can just iterate through + * them. + * + * Native entries cannot be erased, as they need to be + * accessed by the driver's native scheduler. 
+ * + * If the native fence is a drm_sched_fence object, we + * ensure the job has been submitted so drm_sched_fence + * ::parent points to a valid dma_fence object. + */ + struct drm_sched_fence *s_fence = to_drm_sched_fence(fence); + struct dma_fence *scheduled_fence = + s_fence ? dma_fence_get_rcu(&s_fence->scheduled) : NULL; + + job->last_dependency = dep_index + 1; + + if (scheduled_fence) + return scheduled_fence; + } if (job->sched->ops->prepare_job) return job->sched->ops->prepare_job(job, entity); diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 214364fccb71..08dcc33ec690 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -643,6 +643,92 @@ int drm_sched_job_init(struct drm_sched_job *job, } EXPORT_SYMBOL(drm_sched_job_init); +/** + * drm_sched_sort_native_deps - relocates all native dependencies to the + * tail end of @job->dependencies. + * @job: target scheduler job. + * + * Starts by marking all of the native dependencies, then, in a quick-sort + * like manner it swaps entries using a head and tail index counter. Only + * a single partition is required, as there are only two values. + */ +static void drm_sched_sort_native_deps(struct drm_sched_job *job) +{ + struct dma_fence *entry, *head = NULL, *tail = NULL; + unsigned long h = 0, t = 0, native_dep_count = 0; + XA_STATE(xas_head, &job->dependencies, 0); + XA_STATE(xas_tail, &job->dependencies, 0); + bool already_sorted = true; + + if (!job->sched->ops->dependency_is_native) + /* Driver doesn't support native deps. */ + return; + + /* Mark all the native dependencies as we walk xas_tail to the end. */ + xa_lock(&job->dependencies); + xas_for_each(&xas_tail, entry, ULONG_MAX) { + /* Keep track of the index. 
*/
+		t++;
+
+		if (job->sched->ops->dependency_is_native(entry)) {
+			xas_set_mark(&xas_tail, XA_MARK_0);
+			native_dep_count++;
+		} else if (native_dep_count) {
+			/*
+			 * As a native dep has been encountered before, we can
+			 * infer the array is not already sorted.
+			 */
+			already_sorted = false;
+		}
+	}
+	xa_unlock(&job->dependencies);
+
+	if (already_sorted)
+		return;
+
+	/* xas_tail and t are now at the end of the array. */
+	xa_lock(&job->dependencies);
+	while (h < t) {
+		if (!head) {
+			/* Find a marked entry. */
+			if (xas_get_mark(&xas_head, XA_MARK_0)) {
+				head = xas_load(&xas_head);
+			} else {
+				xas_next(&xas_head);
+				h++;
+			}
+		}
+		if (!tail) {
+			/* Find an unmarked entry. */
+			if (xas_get_mark(&xas_tail, XA_MARK_0)) {
+				xas_prev(&xas_tail);
+				t--;
+			} else {
+				tail = xas_load(&xas_tail);
+			}
+		}
+		if (head && tail) {
+			/*
+			 * Swap! These stores should never allocate, since
+			 * they both already exist, hence they never fail.
+			 */
+			xas_store(&xas_head, tail);
+			xas_store(&xas_tail, head);
+
+			/* Also swap the mark. */
+			xas_clear_mark(&xas_head, XA_MARK_0);
+			xas_set_mark(&xas_tail, XA_MARK_0);
+
+			head = NULL;
+			tail = NULL;
+			/* Step the cursors past the swapped pair so they
+			 * stay in sync with the h/t counters; otherwise the
+			 * next pass re-examines the swapped slots, double
+			 * increments h/t and terminates the partition early.
+			 */
+			h++, xas_next(&xas_head);
+			t--, xas_prev(&xas_tail);
+		}
+	}
+	xa_unlock(&job->dependencies);
+}
+
 /**
  * drm_sched_job_arm - arm a scheduler job for execution
  * @job: scheduler job to arm
@@ -669,6 +755,7 @@ void drm_sched_job_arm(struct drm_sched_job *job)
 	job->s_priority = entity->rq - sched->sched_rq;
 	job->id = atomic64_inc_return(&sched->job_id_count);
 
+	drm_sched_sort_native_deps(job);
 	drm_sched_fence_init(job->s_fence, job->entity);
 }
 EXPORT_SYMBOL(drm_sched_job_arm);
@@ -1045,6 +1132,15 @@ static int drm_sched_main(void *param)
 		trace_drm_run_job(sched_job, entity);
 		fence = sched->ops->run_job(sched_job);
 		complete_all(&entity->entity_idle);
+
+		/* We need to set the parent before signaling the scheduled
+		 * fence if we want native dependency to work properly. If we
+		 * don't, the driver might try to access the parent before
+		 * it's set.
+		 */
+		if (!IS_ERR_OR_NULL(fence))
+			drm_sched_fence_set_parent(s_fence, fence);
+
 		drm_sched_fence_scheduled(s_fence);
 
 		if (!IS_ERR_OR_NULL(fence)) {
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 898608f87b96..dca6be35e517 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -455,6 +455,17 @@ struct drm_sched_backend_ops {
 	 * and it's time to clean it up.
 	 */
 	void (*free_job)(struct drm_sched_job *sched_job);
+
+	/**
+	 * @dependency_is_native: When arming a job for this scheduler, this
+	 * function will be called to determine whether to treat it as a
+	 * native dependency. A native dependency is awaited and cleaned up
+	 * when the job is cancelled, but responsibility is otherwise delegated
+	 * to a native scheduler in the calling driver code.
+	 *
+	 * Optional - implies support for native dependencies.
+	 */
+	bool (*dependency_is_native)(struct dma_fence *fence);
 };
 
 /**
-- 
2.25.1