Re: [PATCH 5/5] drm/sched: Make use of a "done" list (v2)

Christian König <christian.koenig@xxxxxxx> · Fri, 4 Dec 2020 09:16:50 +0100

Am 04.12.20 um 04:17 schrieb Luben Tuikov:
The drm_sched_job_done() callback now moves done
jobs from the pending list to a "done" list.

In drm_sched_job_timeout, make use of the status
returned by a GPU driver job timeout handler to
decide whether to leave the oldest job in the
pending list, or to send it off to the done list.
If a driver's job timeout callback returns a
status that that job is done, it is added to the
done list and the done thread woken up. If that
job needs more time, it is left on the pending
list and the timeout timer restarted.

The idea is that a GPU driver can check the IP to
which the passed-in job belongs to and determine
whether the IP is alive and well, or if it needs
more time to complete this job and perhaps others
also executing on it.

In drm_sched_job_timeout(), the main scheduler
thread is now parked, before calling a driver's
timeout_job callback, so as to not compete pushing
jobs down to the GPU while the recovery method is
taking place.

Eliminate the polling mechanism of picking out done
jobs from the pending list, i.e. eliminate
drm_sched_get_cleanup_job().

This also eliminates the eldest job disappearing
from the pending list, while the driver timeout
handler is called.

Various other optimizations to the GPU scheduler
and job recovery are possible with this format.

Signed-off-by: Luben Tuikov <luben.tuikov@xxxxxxx>

Cc: Alexander Deucher <Alexander.Deucher@xxxxxxx>
Cc: Andrey Grodzovsky <Andrey.Grodzovsky@xxxxxxx>
Cc: Christian König <christian.koenig@xxxxxxx>
Cc: Daniel Vetter <daniel.vetter@xxxxxxxx>
Cc: Lucas Stach <l.stach@xxxxxxxxxxxxxx>
Cc: Russell King <linux+etnaviv@xxxxxxxxxxxxxxx>
Cc: Christian Gmeiner <christian.gmeiner@xxxxxxxxx>
Cc: Qiang Yu <yuq825@xxxxxxxxx>
Cc: Rob Herring <robh@xxxxxxxxxx>
Cc: Tomeu Vizoso <tomeu.vizoso@xxxxxxxxxxxxx>
Cc: Steven Price <steven.price@xxxxxxx>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@xxxxxxxxxxxxx>
Cc: Eric Anholt <eric@xxxxxxxxxx>

v2: Dispell using a done thread, so as to keep
     the cache hot on the same processor.
---
  drivers/gpu/drm/scheduler/sched_main.c | 247 +++++++++++++------------
  include/drm/gpu_scheduler.h            |   4 +
  2 files changed, 134 insertions(+), 117 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index b9876cad94f2..d77180b44998 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -164,7 +164,9 @@ drm_sched_rq_select_entity(struct drm_sched_rq *rq)
   * drm_sched_job_done - complete a job
   * @s_job: pointer to the job which is done
   *
- * Finish the job's fence and wake up the worker thread.
+ * Move the completed task to the done list,
+ * signal the its fence to mark it finished,
+ * and wake up the worker thread.
   */
  static void drm_sched_job_done(struct drm_sched_job *s_job)
  {
@@ -176,9 +178,14 @@ static void drm_sched_job_done(struct drm_sched_job *s_job)
  
  	trace_drm_sched_process_job(s_fence);
  
+	spin_lock(&sched->job_list_lock);
+	list_move(&s_job->list, &sched->done_list);
+	spin_unlock(&sched->job_list_lock);
+

That is racy, as soon as the spinlock is dropped the job and with it the 
s_fence might haven been destroyed.

  	dma_fence_get(&s_fence->finished);
  	drm_sched_fence_finished(s_fence);
  	dma_fence_put(&s_fence->finished);

In other words this here needs to come first.

Regards,
Christian.

+
  	wake_up_interruptible(&sched->wake_up_worker);
  }
  
@@ -309,6 +316,37 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
  	spin_unlock(&sched->job_list_lock);
  }
  
+/** drm_sched_job_timeout -- a timer timeout occurred
+ * @work: pointer to work_struct
+ *
+ * First, park the scheduler thread whose IP timed out,
+ * so that we don't race with the scheduler thread pushing
+ * jobs down the IP as we try to investigate what
+ * happened and give drivers a chance to recover.
+ *
+ * Second, take the fist job in the pending list
+ * (oldest), leave it in the pending list and call the
+ * driver's timer timeout callback to find out what
+ * happened, passing this job as the suspect one.
+ *
+ * The driver may return DRM_TASK_STATUS COMPLETE,
+ * which means the task is not in the IP(*) and we move
+ * it to the done list to free it.
+ *
+ * (*) A reason for this would be, say, that the job
+ * completed in due time, or the driver has aborted
+ * this job using driver specific methods in the
+ * timedout_job callback and has now removed it from
+ * the hardware.
+ *
+ * Or, the driver may return DRM_TASK_STATUS_ALIVE, to
+ * indicate that it had inquired about this job, and it
+ * has verified that this job is alive and well, and
+ * that the DRM layer should give this task more time
+ * to complete. In this case, we restart the timeout timer.
+ *
+ * Lastly, we unpark the scheduler thread.
+ */
  static void drm_sched_job_timedout(struct work_struct *work)
  {
  	struct drm_gpu_scheduler *sched;
@@ -316,37 +354,32 @@ static void drm_sched_job_timedout(struct work_struct *work)
  
  	sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
  
-	/* Protects against concurrent deletion in drm_sched_get_cleanup_job */
+	kthread_park(sched->thread);
+
  	spin_lock(&sched->job_list_lock);
  	job = list_first_entry_or_null(&sched->pending_list,
  				       struct drm_sched_job, list);
+	spin_unlock(&sched->job_list_lock);
  
  	if (job) {
-		/*
-		 * Remove the bad job so it cannot be freed by concurrent
-		 * drm_sched_cleanup_jobs. It will be reinserted back after sched->thread
-		 * is parked at which point it's safe.
-		 */
-		list_del_init(&job->list);
-		spin_unlock(&sched->job_list_lock);
-
-		job->sched->ops->timedout_job(job);
+		int res;
  
-		/*
-		 * Guilty job did complete and hence needs to be manually removed
-		 * See drm_sched_stop doc.
-		 */
-		if (sched->free_guilty) {
-			job->sched->ops->free_job(job);
-			sched->free_guilty = false;
+		res = job->sched->ops->timedout_job(job);
+		if (res == DRM_TASK_STATUS_COMPLETE) {
+			/* The job is out of the device.
+			 */
+			spin_lock(&sched->job_list_lock);
+			list_move(&job->list, &sched->done_list);
+			spin_unlock(&sched->job_list_lock);
+			wake_up_interruptible(&sched->wake_up_worker);
+		} else {
+			/* The job needs more time.
+			 */
+			drm_sched_start_timeout(sched);
  		}
-	} else {
-		spin_unlock(&sched->job_list_lock);
  	}
  
-	spin_lock(&sched->job_list_lock);
-	drm_sched_start_timeout(sched);
-	spin_unlock(&sched->job_list_lock);
+	kthread_unpark(sched->thread);
  }
  
   /**
@@ -413,24 +446,13 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
  	kthread_park(sched->thread);
  
  	/*
-	 * Reinsert back the bad job here - now it's safe as
-	 * drm_sched_get_cleanup_job cannot race against us and release the
-	 * bad job at this point - we parked (waited for) any in progress
-	 * (earlier) cleanups and drm_sched_get_cleanup_job will not be called
-	 * now until the scheduler thread is unparked.
-	 */
-	if (bad && bad->sched == sched)
-		/*
-		 * Add at the head of the queue to reflect it was the earliest
-		 * job extracted.
-		 */
-		list_add(&bad->list, &sched->pending_list);
-
-	/*
-	 * Iterate the job list from later to  earlier one and either deactive
-	 * their HW callbacks or remove them from pending list if they already
-	 * signaled.
-	 * This iteration is thread safe as sched thread is stopped.
+	 * Iterate the pending list in reverse order,
+	 * from most recently submitted to oldest
+	 * tasks. Tasks which haven't completed, leave
+	 * them in the pending list, but decrement
+	 * their hardware run queue count.
+	 * Else, the fence must've signalled, and the job
+	 * is in the done list.
  	 */
  	list_for_each_entry_safe_reverse(s_job, tmp, &sched->pending_list,
  					 list) {
@@ -439,36 +461,52 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
  					      &s_job->cb)) {
  			atomic_dec(&sched->hw_rq_count);
  		} else {
-			/*
-			 * remove job from pending_list.
-			 * Locking here is for concurrent resume timeout
-			 */
-			spin_lock(&sched->job_list_lock);
-			list_del_init(&s_job->list);
-			spin_unlock(&sched->job_list_lock);
-
-			/*
-			 * Wait for job's HW fence callback to finish using s_job
-			 * before releasing it.
-			 *
-			 * Job is still alive so fence refcount at least 1
-			 */
-			dma_fence_wait(&s_job->s_fence->finished, false);
-
-			/*
-			 * We must keep bad job alive for later use during
-			 * recovery by some of the drivers but leave a hint
-			 * that the guilty job must be released.
-			 */
-			if (bad != s_job)
-				sched->ops->free_job(s_job);
-			else
-				sched->free_guilty = true;
+			if (bad == s_job) {
+				/* This is the oldest job on the pending list
+				 * whose IP timed out. The
+				 * drm_sched_job_timeout() function calls the
+				 * driver's timedout_job callback passing @bad,
+				 * who then calls this function here--as such
+				 * we shouldn't move @bad or free it. This will
+				 * be decided by drm_sched_job_timeout() when
+				 * this function here returns back to the caller
+				 * (the driver) and the driver's timedout_job
+				 * callback returns a result to
+				 * drm_sched_job_timeout().
+				 */
+				;
+			} else {
+				int res;
+
+				/* This job is not the @bad job passed above.
+				 * Note that perhaps it was *this* job which
+				 * timed out. The wait below is suspect. Since,
+				 * it waits with maximum timeout and "intr" set
+				 * to false, it will either return 0 indicating
+				 * that the fence has signalled, or negative on
+				 * error. What if, the whole IP is stuck and
+				 * this ends up waiting forever?
+				 *
+				 * Wait for job's HW fence callback to finish
+				 * using s_job before releasing it.
+				 *
+				 * Job is still alive so fence
+				 * refcount at least 1
+				 */
+				res = dma_fence_wait(&s_job->s_fence->finished,
+						     false);
+
+				if (res == 0)
+					sched->ops->free_job(s_job);
+				else
+					pr_err_once("%s: dma_fence_wait: %d\n",
+						    sched->name, res);
+			}
  		}
  	}
  
  	/*
-	 * Stop pending timer in flight as we rearm it in  drm_sched_start. This
+	 * Stop pending timer in flight as we rearm it in drm_sched_start. This
  	 * avoids the pending timeout work in progress to fire right away after
  	 * this TDR finished and before the newly restarted jobs had a
  	 * chance to complete.
@@ -511,8 +549,9 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
  			else if (r)
  				DRM_ERROR("fence add callback failed (%d)\n",
  					  r);
-		} else
+		} else {
  			drm_sched_job_done(s_job);
+		}
  	}
  
  	if (full_recovery) {
@@ -665,47 +704,6 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
  	return entity;
  }
  
-/**
- * drm_sched_get_cleanup_job - fetch the next finished job to be destroyed
- *
- * @sched: scheduler instance
- *
- * Returns the next finished job from the pending list (if there is one)
- * ready for it to be destroyed.
- */
-static struct drm_sched_job *
-drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
-{
-	struct drm_sched_job *job;
-
-	/*
-	 * Don't destroy jobs while the timeout worker is running  OR thread
-	 * is being parked and hence assumed to not touch pending_list
-	 */
-	if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
-	    !cancel_delayed_work(&sched->work_tdr)) ||
-	    kthread_should_park())
-		return NULL;
-
-	spin_lock(&sched->job_list_lock);
-
-	job = list_first_entry_or_null(&sched->pending_list,
-				       struct drm_sched_job, list);
-
-	if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
-		/* remove job from pending_list */
-		list_del_init(&job->list);
-	} else {
-		job = NULL;
-		/* queue timeout for next job */
-		drm_sched_start_timeout(sched);
-	}
-
-	spin_unlock(&sched->job_list_lock);
-
-	return job;
-}
-
  /**
   * drm_sched_pick_best - Get a drm sched from a sched_list with the least load
   * @sched_list: list of drm_gpu_schedulers
@@ -759,6 +757,25 @@ static bool drm_sched_blocked(struct drm_gpu_scheduler *sched)
  	return false;
  }
  
+static void drm_sched_free_done(struct drm_gpu_scheduler *sched)
+{
+	LIST_HEAD(done_q);
+
+	spin_lock(&sched->job_list_lock);
+	list_splice_init(&sched->done_list, &done_q);
+	spin_unlock(&sched->job_list_lock);
+
+	while (!list_empty(&done_q)) {
+		struct drm_sched_job *job;
+
+		job = list_first_entry(&done_q,
+				       struct drm_sched_job,
+				       list);
+		list_del_init(&job->list);
+		sched->ops->free_job(job);
+	}
+}
+
  /**
   * drm_sched_main - main scheduler thread
   *
@@ -768,7 +785,7 @@ static bool drm_sched_blocked(struct drm_gpu_scheduler *sched)
   */
  static int drm_sched_main(void *param)
  {
-	struct drm_gpu_scheduler *sched = (struct drm_gpu_scheduler *)param;
+	struct drm_gpu_scheduler *sched = param;
  	int r;
  
  	sched_set_fifo_low(current);
@@ -778,19 +795,14 @@ static int drm_sched_main(void *param)
  		struct drm_sched_fence *s_fence;
  		struct drm_sched_job *sched_job;
  		struct dma_fence *fence;
-		struct drm_sched_job *cleanup_job = NULL;
  
  		wait_event_interruptible(sched->wake_up_worker,
-					 (cleanup_job = drm_sched_get_cleanup_job(sched)) ||
+					 (!list_empty(&sched->done_list)) ||
  					 (!drm_sched_blocked(sched) &&
  					  (entity = drm_sched_select_entity(sched))) ||
  					 kthread_should_stop());
  
-		if (cleanup_job) {
-			sched->ops->free_job(cleanup_job);
-			/* queue timeout for next job */
-			drm_sched_start_timeout(sched);
-		}
+		drm_sched_free_done(sched);
  
  		if (!entity)
  			continue;
@@ -864,6 +876,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
  	init_waitqueue_head(&sched->wake_up_worker);
  	init_waitqueue_head(&sched->job_scheduled);
  	INIT_LIST_HEAD(&sched->pending_list);
+	INIT_LIST_HEAD(&sched->done_list);
  	spin_lock_init(&sched->job_list_lock);
  	atomic_set(&sched->hw_rq_count, 0);
  	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index cedfc5394e52..11278695fed0 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -289,6 +289,7 @@ struct drm_gpu_scheduler {
  	uint32_t			hw_submission_limit;
  	long				timeout;
  	const char			*name;
+
  	struct drm_sched_rq		sched_rq[DRM_SCHED_PRIORITY_COUNT];
  	wait_queue_head_t		wake_up_worker;
  	wait_queue_head_t		job_scheduled;
@@ -296,8 +297,11 @@ struct drm_gpu_scheduler {
  	atomic64_t			job_id_count;
  	struct delayed_work		work_tdr;
  	struct task_struct		*thread;
+
  	struct list_head		pending_list;
+	struct list_head                done_list;
  	spinlock_t			job_list_lock;
+
  	int				hang_limit;
  	atomic_t                        score;
  	bool				ready;

_______________________________________________
dri-devel mailing list
dri-devel@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/dri-devel