[Why]
For a bailing job, this commit deletes it from the pending list, so the
bailing job never gets a chance to be resubmitted, even in advance TDR
mode.

[How]
After embedding the hw_fence into amdgpu_job, the race condition that
this commit tried to work around is completely solved, so revert this
commit.
This reverts commit 135517d3565b48f4def3b1b82008bc17eb5d1c90.

v2: add dma_fence_get/put() around timedout_job to avoid a concurrent
    delete while processing timedout_job
v3: park sched->thread during timedout_job instead

Signed-off-by: Jingwen Chen <Jingwen.Chen2@xxxxxxx>
---
 drivers/gpu/drm/scheduler/sched_main.c | 22 ++--------------------
 1 file changed, 2 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index a2a953693b45..c187fd3a6bb6 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -319,17 +319,12 @@ static void drm_sched_job_timedout(struct work_struct *work)
 	sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
 
 	/* Protects against concurrent deletion in drm_sched_get_cleanup_job */
+	kthread_park(sched->thread);
 	spin_lock(&sched->job_list_lock);
 	job = list_first_entry_or_null(&sched->pending_list,
 				       struct drm_sched_job, list);
 
 	if (job) {
-		/*
-		 * Remove the bad job so it cannot be freed by concurrent
-		 * drm_sched_cleanup_jobs. It will be reinserted back after sched->thread
-		 * is parked at which point it's safe.
-		 */
-		list_del_init(&job->list);
 		spin_unlock(&sched->job_list_lock);
 
 		status = job->sched->ops->timedout_job(job);
@@ -345,6 +340,7 @@ static void drm_sched_job_timedout(struct work_struct *work)
 	} else {
 		spin_unlock(&sched->job_list_lock);
 	}
+	kthread_unpark(sched->thread);
 
 	if (status != DRM_GPU_SCHED_STAT_ENODEV) {
 		spin_lock(&sched->job_list_lock);
@@ -392,20 +388,6 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
 
 	kthread_park(sched->thread);
 
-	/*
-	 * Reinsert back the bad job here - now it's safe as
-	 * drm_sched_get_cleanup_job cannot race against us and release the
-	 * bad job at this point - we parked (waited for) any in progress
-	 * (earlier) cleanups and drm_sched_get_cleanup_job will not be called
-	 * now until the scheduler thread is unparked.
-	 */
-	if (bad && bad->sched == sched)
-		/*
-		 * Add at the head of the queue to reflect it was the earliest
-		 * job extracted.
-		 */
-		list_add(&bad->list, &sched->pending_list);
-
 	/*
 	 * Iterate the job list from later to earlier one and either deactive
 	 * their HW callbacks or remove them from pending list if they already
-- 
2.25.1
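
Note (not part of the patch to apply): below is a minimal user-space
sketch in plain C with pthreads of the idea behind the v3 change. It only
models the mechanism: instead of unlinking the timed-out job from the
pending list, the thread that retires and frees jobs is parked while the
timeout handler looks at the list head, so that job cannot be freed under
the handler even though it stays on the list. All names here (fake_sched,
fake_job, cleanup_worker, handle_timeout) are made up for illustration
and do not exist in the kernel.

/*
 * Illustration only -- not kernel code.  Models the v3 approach: the
 * worker that frees finished jobs is "parked" while the timeout handler
 * inspects the head of the pending list, so that job cannot be freed
 * concurrently even though it is never taken off the list.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

struct fake_job {
	int id;
	bool finished;             /* models a signalled hw fence           */
	struct fake_job *next;
};

struct fake_sched {
	pthread_mutex_t lock;      /* models sched->job_list_lock           */
	pthread_cond_t cond;
	bool parked;               /* models kthread_park()/unpark() state  */
	bool stop;
	struct fake_job *pending;  /* models sched->pending_list            */
};

/* Models the scheduler thread's cleanup path: retire the head job. */
static void *cleanup_worker(void *arg)
{
	struct fake_sched *s = arg;

	for (;;) {
		pthread_mutex_lock(&s->lock);
		while (s->parked && !s->stop)      /* stay idle while parked */
			pthread_cond_wait(&s->cond, &s->lock);
		if (s->stop) {
			pthread_mutex_unlock(&s->lock);
			return NULL;
		}
		if (s->pending && s->pending->finished) {
			struct fake_job *done = s->pending;
			s->pending = done->next;
			free(done);                /* retire finished job    */
		}
		pthread_mutex_unlock(&s->lock);
		usleep(1000);
	}
}

/* Models drm_sched_job_timedout() after the revert plus the v3 change. */
static void handle_timeout(struct fake_sched *s)
{
	struct fake_job *job;

	pthread_mutex_lock(&s->lock);
	s->parked = true;                  /* kthread_park(sched->thread)   */
	job = s->pending;                  /* list_first_entry_or_null()    */
	pthread_mutex_unlock(&s->lock);

	if (job) {
		/*
		 * The driver callback may sleep, so it runs without the list
		 * lock held.  The parked worker cannot free the job, even
		 * though it was left on the pending list.
		 */
		printf("handling timed-out job %d\n", job->id);
	}

	pthread_mutex_lock(&s->lock);
	s->parked = false;                 /* kthread_unpark(sched->thread) */
	pthread_cond_broadcast(&s->cond);
	pthread_mutex_unlock(&s->lock);
}

int main(void)
{
	struct fake_sched s = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.cond = PTHREAD_COND_INITIALIZER,
	};
	struct fake_job *job = calloc(1, sizeof(*job));
	pthread_t worker;

	job->id = 42;                      /* hung job: fence never signals */
	s.pending = job;

	pthread_create(&worker, NULL, cleanup_worker, &s);
	handle_timeout(&s);                /* simulate a TDR event          */

	pthread_mutex_lock(&s.lock);
	s.stop = true;
	pthread_cond_broadcast(&s.cond);
	pthread_mutex_unlock(&s.lock);
	pthread_join(worker, NULL);
	free(s.pending);
	return 0;
}

Build with e.g. "cc -pthread sketch.c". One simplification: in the real
scheduler kthread_park() also waits until the thread has actually reached
its parked state, which this sketch only approximates by having the
worker touch the list exclusively under the lock.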