Am 18.09.2018 um 18:17 schrieb Nayan Deshmukh:
> having a delayed work item per job is redundant as we only need one
> per scheduler to track the timeout of the currently executing job.
Well that looks simpler than I thought it would be.
But it exposes the next problem: the timeout and the completion can now
race, i.e. the timeout handler may end up running against a job that has
in fact just completed.
As far as I can see that can be fixed by moving the
dma_fence_remove_callback()/dma_fence_add_callback() dance from
drm_sched_hw_job_reset() to drm_sched_job_timedout().
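E.g. a rough sketch of what that could look like (totally untested, and
assuming the sched->curr_job field from this patch plus the
s_fence->parent/s_fence->cb pair that drm_sched_hw_job_reset() detaches
today; the matching dma_fence_add_callback() on recovery would move the
same way):

static void drm_sched_job_timedout(struct work_struct *work)
{
        struct drm_gpu_scheduler *sched = container_of(work,
                                                struct drm_gpu_scheduler,
                                                work_tdr.work);
        struct drm_sched_job *job = sched->curr_job;

        /*
         * Detach the completion callback first. If that fails the
         * hardware fence has already signaled, i.e. the job completed
         * and merely lost the race against this timeout, so there is
         * nothing to handle.
         */
        if (job->s_fence->parent &&
            !dma_fence_remove_callback(job->s_fence->parent,
                                       &job->s_fence->cb))
                return;

        job->sched->ops->timedout_job(job);
}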
Anyway, I would say drop patch #1, fix the one comment below, and we can
use this.
> Signed-off-by: Nayan Deshmukh <nayan26deshmukh@xxxxxxxxx>
> Suggested-by: Christian König <christian.koenig@xxxxxxx>
> ---
>  drivers/gpu/drm/scheduler/sched_main.c | 16 +++++++++-------
>  include/drm/gpu_scheduler.h            |  6 +++---
>  2 files changed, 12 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index 0e6ccc8243db..f213b5c7f718 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -198,7 +198,7 @@ static void drm_sched_job_finish(struct work_struct *work)
>          * manages to find this job as the next job in the list, the fence
>          * signaled check below will prevent the timeout to be restarted.
>          */
> -       cancel_delayed_work_sync(&s_job->work_tdr);
> +       cancel_delayed_work_sync(&sched->work_tdr);
>
>        spin_lock(&sched->job_list_lock);
>        /* queue TDR for next job */
> @@ -207,7 +207,7 @@ static void drm_sched_job_finish(struct work_struct *work)
>        if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>            !list_is_last(&s_job->node, &sched->ring_mirror_list)) {
>                if (!dma_fence_is_signaled(&next->s_fence->finished))
Since we now have only one delayed work item, we can just drop the test
for whether next is already signaled.
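I.e. the rearm path could then collapse to something like this (just a
sketch on top of this patch, not what the diff below does):

        spin_lock(&sched->job_list_lock);
        /* queue TDR for next job */
        if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
            !list_is_last(&s_job->node, &sched->ring_mirror_list))
                /* one timer per scheduler, so just rearm it unconditionally */
                schedule_delayed_work(&sched->work_tdr, sched->timeout);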
Regards,
Christian.
> -                      schedule_delayed_work(&next->work_tdr, sched->timeout);
> +                      schedule_delayed_work(&sched->work_tdr, sched->timeout);
>        }
>        /* remove job from ring_mirror_list */
>        list_del(&s_job->node);
>
> @@ -237,7 +237,7 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
>        if (list_first_entry_or_null(&sched->ring_mirror_list,
>                                     struct drm_sched_job, node) == s_job) {
>                if (sched->timeout != MAX_SCHEDULE_TIMEOUT)
> -                      schedule_delayed_work(&s_job->work_tdr, sched->timeout);
> +                      schedule_delayed_work(&sched->work_tdr, sched->timeout);
>                sched->curr_job = s_job;
>        }
>        spin_unlock(&sched->job_list_lock);
> @@ -245,8 +245,10 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
>
>  static void drm_sched_job_timedout(struct work_struct *work)
>  {
> -       struct drm_sched_job *job = container_of(work, struct drm_sched_job,
> -                                                work_tdr.work);
> +       struct drm_gpu_scheduler *sched = container_of(work,
> +                                               struct drm_gpu_scheduler,
> +                                               work_tdr.work);
> +       struct drm_sched_job *job = sched->curr_job;
>
>        job->sched->ops->timedout_job(job);
>  }
> @@ -318,7 +320,7 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
>        s_job = list_first_entry_or_null(&sched->ring_mirror_list,
>                                         struct drm_sched_job, node);
>        if (s_job && sched->timeout != MAX_SCHEDULE_TIMEOUT)
> -              schedule_delayed_work(&s_job->work_tdr, sched->timeout);
> +              schedule_delayed_work(&sched->work_tdr, sched->timeout);
>        if (s_job)
>                sched->curr_job = s_job;
> @@ -389,7 +391,6 @@ int drm_sched_job_init(struct drm_sched_job *job,
>        INIT_WORK(&job->finish_work, drm_sched_job_finish);
>        INIT_LIST_HEAD(&job->node);
> -      INIT_DELAYED_WORK(&job->work_tdr, drm_sched_job_timedout);
>
>        return 0;
>  }
> @@ -580,6 +581,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>        INIT_LIST_HEAD(&sched->ring_mirror_list);
>        spin_lock_init(&sched->job_list_lock);
>        atomic_set(&sched->hw_rq_count, 0);
> +      INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
>        atomic_set(&sched->num_jobs, 0);
>        atomic64_set(&sched->job_id_count, 0);
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 07e776b1ca42..9d50d7f3eaa4 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -175,8 +175,6 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
>   *               finished to remove the job from the
>   *               @drm_gpu_scheduler.ring_mirror_list.
>   * @node: used to append this struct to the @drm_gpu_scheduler.ring_mirror_list.
> - * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the timeout
> - *            interval is over.
>   * @id: a unique id assigned to each job scheduled on the scheduler.
>   * @karma: increment on every hang caused by this job. If this exceeds the hang
>   *         limit of the scheduler then the job is marked guilty and will not
> @@ -195,7 +193,6 @@ struct drm_sched_job {
>        struct dma_fence_cb             finish_cb;
>        struct work_struct              finish_work;
>        struct list_head                node;
> -      struct delayed_work             work_tdr;
>        uint64_t                        id;
>        atomic_t                        karma;
>        enum drm_sched_priority         s_priority;
> @@ -260,6 +257,8 @@ struct drm_sched_backend_ops {
>   *                 finished.
>   * @hw_rq_count: the number of jobs currently in the hardware queue.
>   * @job_id_count: used to assign unique id to the each job.
> + * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
> + *            timeout interval is over.
>   * @thread: the kthread on which the scheduler which run.
>   * @ring_mirror_list: the list of jobs which are currently in the job queue.
>   * @job_list_lock: lock to protect the ring_mirror_list.
> @@ -280,6 +279,7 @@ struct drm_gpu_scheduler {
>        wait_queue_head_t               job_scheduled;
>        atomic_t                        hw_rq_count;
>        atomic64_t                      job_id_count;
> +      struct delayed_work             work_tdr;
>        struct task_struct              *thread;
>        struct list_head                ring_mirror_list;
>        spinlock_t                      job_list_lock;