For context, the native dependency support is used in these as-yet-unsubmitted files:

https://gitlab.freedesktop.org/sarah-walker-imgtec/powervr/-/blob/dev/v3/drivers/gpu/drm/imagination/pvr_job.c
https://gitlab.freedesktop.org/sarah-walker-imgtec/powervr/-/blob/dev/v3/drivers/gpu/drm/imagination/pvr_queue.c

Thanks,
Donald

On Thu, 2023-06-08 at 14:23 +0100, Donald Robson wrote:
> This patch adds support for 'native' dependencies to the DRM scheduler.
> In drivers that use a firmware-based scheduler there are performance
> gains to be had by allowing waits to happen in the firmware, as this
> reduces the latency between signalling and job submission. Dependencies
> that can be awaited by the firmware scheduler are termed 'native
> dependencies'. In the new PowerVR driver we delegate the waits to the
> firmware, but it is still necessary to expose these fences within the
> DRM scheduler. This is because when a job is cancelled,
> drm_sched_entity_kill() registers a callback with all the dependencies
> in order to ensure the job finished fence is not signalled before all
> the job dependencies are met. If they were not exposed, the core could
> no longer guarantee that and might signal the fence too early, leading
> to potential invalid accesses if other things depend on the job
> finished fence.
>
> All dependencies are handled in the same way up to the point that
> dependencies for a job are being checked. At this stage, the DRM
> scheduler will now allow job submission to proceed once it encounters
> the first native dependency in the list - dependencies having been
> sorted beforehand in drm_sched_job_arm() so that native ones appear
> last. The list is sorted during drm_sched_job_arm() because the
> scheduler isn't known until this point, and determining whether a
> dependency is native is done via a new drm_gpu_scheduler backend
> operation.
>
> Native fences are just simple counters that get incremented every time
> some specific execution point is reached, like when a GPU job is done.
> The firmware is in charge of both waiting on and updating fences, so it
> can easily unblock any waiters it has internally. The CPU also has
> access to these counters, so it can check for progress as well.
>
> TODO:
>
> When operating normally the CPU is not supposed to update the counters
> itself, but there is one specific situation where this is needed - when
> a GPU hang has happened and some contexts were declared faulty because
> they had unfinished or blocked jobs pending. In that situation, when we
> reset the GPU we will evict faulty contexts so they can't submit jobs
> anymore, and we will cancel the jobs that were in flight at the time of
> reset, but that's not enough, because some jobs on other non-faulty
> contexts might have native dependencies on jobs that never completed on
> this faulty context.
>
> If we were to ask the firmware to wait on those native fences, it would
> block indefinitely, because no one would ever update the counter. So,
> in that case, and that case only, we want the CPU to force-update the
> counter and set it to the last issued sequence number. We do not
> currently have a helper for this and we welcome any suggestions for how
> best to implement it.
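
To give a feel for the driver side of the new backend operation mentioned above, here is a minimal sketch. The pvr_* names below are placeholders for illustration only - the actual usage lives in the pvr_job.c/pvr_queue.c files linked at the top of this mail - and it assumes the driver can recognise its firmware-visible (native) fences by their dma_fence_ops:

  /*
   * Minimal sketch, hypothetical names only. A fence is treated as
   * "native" if it is backed by a firmware-visible counter, which the
   * driver can tell from the fence's ops (pvr_native_fence_ops is
   * assumed to be defined elsewhere in the driver, as are the other
   * pvr_* callbacks).
   */
  static bool pvr_dependency_is_native(struct dma_fence *fence)
  {
  	return fence->ops == &pvr_native_fence_ops;
  }

  static const struct drm_sched_backend_ops pvr_sched_ops = {
  	.run_job               = pvr_run_job,
  	.timedout_job          = pvr_timedout_job,
  	.free_job              = pvr_free_job,
  	.dependency_is_native  = pvr_dependency_is_native,
  };
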
>
> Signed-off-by: Donald Robson <donald.robson@xxxxxxxxxx>
> Cc: Luben Tuikov <luben.tuikov@xxxxxxx>
> Cc: David Airlie <airlied@xxxxxxxxx>
> Cc: Daniel Vetter <daniel@xxxxxxxx>
> Cc: Sumit Semwal <sumit.semwal@xxxxxxxxxx>
> Cc: "Christian König" <christian.koenig@xxxxxxx>
> Cc: Boris Brezillon <boris.brezillon@xxxxxxxxxxxxx>
> Cc: Frank Binns <frank.binns@xxxxxxxxxx>
> Cc: Sarah Walker <sarah.walker@xxxxxxxxxx>
> ---
>  drivers/gpu/drm/scheduler/sched_entity.c | 60 +++++++++++++--
>  drivers/gpu/drm/scheduler/sched_main.c   | 96 ++++++++++++++++++++++++
>  include/drm/gpu_scheduler.h              | 11 +++
>  3 files changed, 161 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
> index 15d04a0ec623..2685805a5e05 100644
> --- a/drivers/gpu/drm/scheduler/sched_entity.c
> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
> @@ -155,13 +155,14 @@ static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
>  {
>  	struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
>  						 finish_cb);
> +	unsigned long idx;
>  	int r;
>
>  	dma_fence_put(f);
>
>  	/* Wait for all dependencies to avoid data corruptions */
> -	while (!xa_empty(&job->dependencies)) {
> -		f = xa_erase(&job->dependencies, job->last_dependency++);
> +	xa_for_each(&job->dependencies, idx, f) {
> +		xa_erase(&job->dependencies, idx);
>  		r = dma_fence_add_callback(f, &job->finish_cb,
>  					   drm_sched_entity_kill_jobs_cb);
>  		if (!r)
> @@ -390,12 +391,59 @@ static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity)
>  	return false;
>  }
>
> +/**
> + * dep_is_native - indicates that native dependencies are supported and that
> + * the dependency at @index is marked as native.
> + * @job: Scheduler job.
> + * @index: Index into the @job->dependencies xarray.
> + *
> + * Must only be used after calling drm_sched_job_arm().
> + *
> + * Returns true if both these conditions are met.
> + */
> +static bool dep_is_native(struct drm_sched_job *job, unsigned long index)
> +{
> +	return job->sched->ops->dependency_is_native &&
> +	       xa_get_mark(&job->dependencies, index, XA_MARK_0);
> +}
> +
>  static struct dma_fence *
> -drm_sched_job_dependency(struct drm_sched_job *job,
> -			 struct drm_sched_entity *entity)
> +drm_sched_job_dependency(struct drm_sched_job *job, struct drm_sched_entity *entity)
>  {
> -	if (!xa_empty(&job->dependencies))
> -		return xa_erase(&job->dependencies, job->last_dependency++);
> +	struct dma_fence *fence;
> +	unsigned long dep_index;
> +
> +	if (!dep_is_native(job, job->last_dependency)) {
> +		fence = xa_erase(&job->dependencies, job->last_dependency++);
> +		if (fence)
> +			return fence;
> +	}
> +
> +	xa_for_each_start(&job->dependencies, dep_index, fence,
> +			  job->last_dependency) {
> +		/*
> +		 * Encountered first native dependency. Since these were
> +		 * previously sorted to the end of the array in
> +		 * drm_sched_sort_native_deps(), all remaining entries
> +		 * will be native too, so we can just iterate through
> +		 * them.
> +		 *
> +		 * Native entries cannot be erased, as they need to be
> +		 * accessed by the driver's native scheduler.
> +		 *
> +		 * If the native fence is a drm_sched_fence object, we
> +		 * ensure the job has been submitted so drm_sched_fence
> +		 * ::parent points to a valid dma_fence object.
> +		 */
> +		struct drm_sched_fence *s_fence = to_drm_sched_fence(fence);
> +		struct dma_fence *scheduled_fence =
> +			s_fence ? dma_fence_get_rcu(&s_fence->scheduled) : NULL;
> +
> +		job->last_dependency = dep_index + 1;
> +
> +		if (scheduled_fence)
> +			return scheduled_fence;
> +	}
>
>  	if (job->sched->ops->prepare_job)
>  		return job->sched->ops->prepare_job(job, entity);
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index 214364fccb71..08dcc33ec690 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -643,6 +643,92 @@ int drm_sched_job_init(struct drm_sched_job *job,
>  }
>  EXPORT_SYMBOL(drm_sched_job_init);
>
> +/**
> + * drm_sched_sort_native_deps - relocates all native dependencies to the
> + * tail end of @job->dependencies.
> + * @job: target scheduler job.
> + *
> + * Starts by marking all of the native dependencies, then, in a quicksort-like
> + * manner, swaps entries using a head and a tail index counter. Only a single
> + * partition is required, as there are only two values.
> + */
> +static void drm_sched_sort_native_deps(struct drm_sched_job *job)
> +{
> +	struct dma_fence *entry, *head = NULL, *tail = NULL;
> +	unsigned long h = 0, t = 0, native_dep_count = 0;
> +	XA_STATE(xas_head, &job->dependencies, 0);
> +	XA_STATE(xas_tail, &job->dependencies, 0);
> +	bool already_sorted = true;
> +
> +	if (!job->sched->ops->dependency_is_native)
> +		/* Driver doesn't support native deps. */
> +		return;
> +
> +	/* Mark all the native dependencies as we walk xas_tail to the end. */
> +	xa_lock(&job->dependencies);
> +	xas_for_each(&xas_tail, entry, ULONG_MAX) {
> +		/* Keep track of the index. */
> +		t++;
> +
> +		if (job->sched->ops->dependency_is_native(entry)) {
> +			xas_set_mark(&xas_tail, XA_MARK_0);
> +			native_dep_count++;
> +		} else if (native_dep_count) {
> +			/*
> +			 * As a native dep has been encountered before, we can
> +			 * infer the array is not already sorted.
> +			 */
> +			already_sorted = false;
> +		}
> +	}
> +	xa_unlock(&job->dependencies);
> +
> +	if (already_sorted)
> +		return;
> +
> +	/* xas_tail and t are now at the end of the array. */
> +	xa_lock(&job->dependencies);
> +	while (h < t) {
> +		if (!head) {
> +			/* Find a marked entry. */
> +			if (xas_get_mark(&xas_head, XA_MARK_0)) {
> +				head = xas_load(&xas_head);
> +			} else {
> +				xas_next(&xas_head);
> +				h++;
> +			}
> +		}
> +		if (!tail) {
> +			/* Find an unmarked entry. */
> +			if (xas_get_mark(&xas_tail, XA_MARK_0)) {
> +				xas_prev(&xas_tail);
> +				t--;
> +			} else {
> +				tail = xas_load(&xas_tail);
> +			}
> +		}
> +		if (head && tail) {
> +			/*
> +			 * Swap!
> +			 * These stores should never allocate, since they both
> +			 * already exist, hence they never fail.
> +			 */
> +			xas_store(&xas_head, tail);
> +			xas_store(&xas_tail, head);
> +
> +			/* Also swap the mark. */
> +			xas_clear_mark(&xas_head, XA_MARK_0);
> +			xas_set_mark(&xas_tail, XA_MARK_0);
> +
> +			head = NULL;
> +			tail = NULL;
> +			h++;
> +			t--;
> +		}
> +	}
> +	xa_unlock(&job->dependencies);
> +}
> +
>  /**
>   * drm_sched_job_arm - arm a scheduler job for execution
>   * @job: scheduler job to arm
> @@ -669,6 +755,7 @@ void drm_sched_job_arm(struct drm_sched_job *job)
>  	job->s_priority = entity->rq - sched->sched_rq;
>  	job->id = atomic64_inc_return(&sched->job_id_count);
>
> +	drm_sched_sort_native_deps(job);
>  	drm_sched_fence_init(job->s_fence, job->entity);
>  }
>  EXPORT_SYMBOL(drm_sched_job_arm);
> @@ -1045,6 +1132,15 @@ static int drm_sched_main(void *param)
>  		trace_drm_run_job(sched_job, entity);
>  		fence = sched->ops->run_job(sched_job);
>  		complete_all(&entity->entity_idle);
> +
> +		/* We need to set the parent before signaling the scheduled
> +		 * fence if we want native dependencies to work properly. If
> +		 * we don't, the driver might try to access the parent before
> +		 * it's set.
> +		 */
> +		if (!IS_ERR_OR_NULL(fence))
> +			drm_sched_fence_set_parent(s_fence, fence);
> +
>  		drm_sched_fence_scheduled(s_fence);
>
>  		if (!IS_ERR_OR_NULL(fence)) {
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 898608f87b96..dca6be35e517 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -455,6 +455,17 @@ struct drm_sched_backend_ops {
>  	 * and it's time to clean it up.
>  	 */
>  	void (*free_job)(struct drm_sched_job *sched_job);
> +
> +	/**
> +	 * @dependency_is_native: When arming a job for this scheduler, this
> +	 * function will be called to determine whether to treat a fence as a
> +	 * native dependency. A native dependency is awaited and cleaned up
> +	 * when the job is cancelled, but responsibility is otherwise delegated
> +	 * to a native scheduler in the calling driver code.
> +	 *
> +	 * Optional - implies support for native dependencies.
> +	 */
> +	bool (*dependency_is_native)(struct dma_fence *fence);
>  };
>
>  /**
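
For readers wondering how a driver then consumes the native dependencies that are deliberately left in job->dependencies rather than erased, a sketch along these lines could run when the driver builds its firmware command stream. Every pvr_* name here is hypothetical and not part of this series; the only assumptions taken from the patch are that native entries remain in the xarray and carry the XA_MARK_0 mark set by drm_sched_sort_native_deps():

  /*
   * Illustrative sketch only - pvr_job, pvr_fence, to_pvr_fence() and
   * pvr_cmd_emit_wait() are hypothetical names. Non-native dependencies
   * have already been erased and waited on by the scheduler by the time
   * run_job() is reached, so only the marked (native) entries remain.
   */
  static void pvr_job_emit_native_waits(struct pvr_job *job)
  {
  	struct dma_fence *fence;
  	unsigned long index;

  	xa_for_each_marked(&job->base.dependencies, index, fence, XA_MARK_0) {
  		struct pvr_fence *pfence = to_pvr_fence(fence);

  		/* Tell the firmware to wait until the backing counter
  		 * reaches the fence's sequence number before running
  		 * this job.
  		 */
  		pvr_cmd_emit_wait(job, pfence->fw_counter, fence->seqno);
  	}
  }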