commit f7fe64ad0f22 ("drm/sched: Split free_job into own work item")
causes graphics hangs at GDM or right after logging in on a Framework 13
AMD laptop (containing a Phoenix APU).

This reverts commit f7fe64ad0f22ff034f8ebcfbd7299ee9cc9b57d7.

Fixes: f7fe64ad0f22 ("drm/sched: Split free_job into own work item")
Signed-off-by: Mario Limonciello <mario.limonciello@xxxxxxx>
---
This is a regression introduced in 6.8-rc1, found by bisecting from 6.7.
This revert, done on top of 6.8-rc1, fixes the issue.

I'm happy to gather any data to help debug this properly, if that is
preferable to a revert.
---
 drivers/gpu/drm/scheduler/sched_main.c | 133 +++++++++----------------
 include/drm/gpu_scheduler.h            |   4 +-
 2 files changed, 48 insertions(+), 89 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 550492a7a031..91c96ab53fb5 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -369,32 +369,6 @@ static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
 {
 	if (!READ_ONCE(sched->pause_submit))
 		queue_work(sched->submit_wq, &sched->work_run_job);
 }
 
-/**
- * __drm_sched_run_free_queue - enqueue free-job work
- * @sched: scheduler instance
- */
-static void __drm_sched_run_free_queue(struct drm_gpu_scheduler *sched)
-{
-	if (!READ_ONCE(sched->pause_submit))
-		queue_work(sched->submit_wq, &sched->work_free_job);
-}
-
-/**
- * drm_sched_run_free_queue - enqueue free-job work if ready
- * @sched: scheduler instance
- */
-static void drm_sched_run_free_queue(struct drm_gpu_scheduler *sched)
-{
-	struct drm_sched_job *job;
-
-	spin_lock(&sched->job_list_lock);
-	job = list_first_entry_or_null(&sched->pending_list,
-				       struct drm_sched_job, list);
-	if (job && dma_fence_is_signaled(&job->s_fence->finished))
-		__drm_sched_run_free_queue(sched);
-	spin_unlock(&sched->job_list_lock);
-}
-
 /**
  * drm_sched_job_done - complete a job
  * @s_job: pointer to the job which is done
@@ -414,7 +388,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
 	dma_fence_get(&s_fence->finished);
 	drm_sched_fence_finished(s_fence, result);
 	dma_fence_put(&s_fence->finished);
-	__drm_sched_run_free_queue(sched);
+	drm_sched_run_job_queue(sched);
 }
 
 /**
@@ -1092,10 +1066,8 @@ drm_sched_get_finished_job(struct drm_gpu_scheduler *sched)
 						typeof(*next), list);
 
 		if (next) {
-			if (test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT,
-				     &next->s_fence->scheduled.flags))
-				next->s_fence->scheduled.timestamp =
-					dma_fence_timestamp(&job->s_fence->finished);
+			next->s_fence->scheduled.timestamp =
+				dma_fence_timestamp(&job->s_fence->finished);
 			/* start TO timer for next job */
 			drm_sched_start_timeout(sched);
 		}
@@ -1145,29 +1117,7 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
 EXPORT_SYMBOL(drm_sched_pick_best);
 
 /**
- * drm_sched_free_job_work - worker to call free_job
- *
- * @w: free job work
- */
-static void drm_sched_free_job_work(struct work_struct *w)
-{
-	struct drm_gpu_scheduler *sched =
-		container_of(w, struct drm_gpu_scheduler, work_free_job);
-	struct drm_sched_job *job;
-
-	if (READ_ONCE(sched->pause_submit))
-		return;
-
-	job = drm_sched_get_finished_job(sched);
-	if (job)
-		sched->ops->free_job(job);
-
-	drm_sched_run_free_queue(sched);
-	drm_sched_run_job_queue(sched);
-}
-
-/**
- * drm_sched_run_job_work - worker to call run_job
+ * drm_sched_run_job_work - main scheduler thread
  *
  * @w: run job work
  */
@@ -1176,50 +1126,64 @@ static void drm_sched_run_job_work(struct work_struct *w)
 	struct drm_gpu_scheduler *sched =
 		container_of(w, struct drm_gpu_scheduler, work_run_job);
 	struct drm_sched_entity *entity;
-	struct dma_fence *fence;
-	struct drm_sched_fence *s_fence;
-	struct drm_sched_job *sched_job;
+	struct drm_sched_job *cleanup_job;
 	int r;
 
 	if (READ_ONCE(sched->pause_submit))
 		return;
 
+	cleanup_job = drm_sched_get_finished_job(sched);
 	entity = drm_sched_select_entity(sched);
-	if (!entity)
-		return;
 
-	sched_job = drm_sched_entity_pop_job(entity);
-	if (!sched_job) {
-		complete_all(&entity->entity_idle);
+	if (!entity && !cleanup_job)
 		return;	/* No more work */
-	}
 
-	s_fence = sched_job->s_fence;
+	if (cleanup_job)
+		sched->ops->free_job(cleanup_job);
 
-	atomic_add(sched_job->credits, &sched->credit_count);
-	drm_sched_job_begin(sched_job);
+	if (entity) {
+		struct dma_fence *fence;
+		struct drm_sched_fence *s_fence;
+		struct drm_sched_job *sched_job;
+
+		sched_job = drm_sched_entity_pop_job(entity);
+		if (!sched_job) {
+			complete_all(&entity->entity_idle);
+			if (!cleanup_job)
+				return;	/* No more work */
+			goto again;
+		}
 
-	trace_drm_run_job(sched_job, entity);
-	fence = sched->ops->run_job(sched_job);
-	complete_all(&entity->entity_idle);
-	drm_sched_fence_scheduled(s_fence, fence);
+		s_fence = sched_job->s_fence;
 
-	if (!IS_ERR_OR_NULL(fence)) {
-		/* Drop for original kref_init of the fence */
-		dma_fence_put(fence);
+		atomic_inc(&sched->credit_count);
+		drm_sched_job_begin(sched_job);
 
-		r = dma_fence_add_callback(fence, &sched_job->cb,
-					   drm_sched_job_done_cb);
-		if (r == -ENOENT)
-			drm_sched_job_done(sched_job, fence->error);
-		else if (r)
-			DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n", r);
-	} else {
-		drm_sched_job_done(sched_job, IS_ERR(fence) ?
-				   PTR_ERR(fence) : 0);
+		trace_drm_run_job(sched_job, entity);
+		fence = sched->ops->run_job(sched_job);
+		complete_all(&entity->entity_idle);
+		drm_sched_fence_scheduled(s_fence, fence);
+
+		if (!IS_ERR_OR_NULL(fence)) {
+			/* Drop for original kref_init of the fence */
+			dma_fence_put(fence);
+
+			r = dma_fence_add_callback(fence, &sched_job->cb,
+						   drm_sched_job_done_cb);
+			if (r == -ENOENT)
+				drm_sched_job_done(sched_job, fence->error);
+			else if (r)
+				DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
+					      r);
+		} else {
+			drm_sched_job_done(sched_job, IS_ERR(fence) ?
+					   PTR_ERR(fence) : 0);
+		}
+
+		wake_up(&sched->job_scheduled);
 	}
 
-	wake_up(&sched->job_scheduled);
+again:
 	drm_sched_run_job_queue(sched);
 }
 
@@ -1304,7 +1268,6 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
 	atomic_set(&sched->credit_count, 0);
 	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
 	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
-	INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
 	atomic_set(&sched->_score, 0);
 	atomic64_set(&sched->job_id_count, 0);
 	sched->pause_submit = false;
@@ -1432,7 +1395,6 @@ void drm_sched_wqueue_stop(struct drm_gpu_scheduler *sched)
 {
 	WRITE_ONCE(sched->pause_submit, true);
 	cancel_work_sync(&sched->work_run_job);
-	cancel_work_sync(&sched->work_free_job);
 }
 EXPORT_SYMBOL(drm_sched_wqueue_stop);
 
@@ -1445,6 +1407,5 @@ void drm_sched_wqueue_start(struct drm_gpu_scheduler *sched)
 {
 	WRITE_ONCE(sched->pause_submit, false);
 	queue_work(sched->submit_wq, &sched->work_run_job);
-	queue_work(sched->submit_wq, &sched->work_free_job);
 }
 EXPORT_SYMBOL(drm_sched_wqueue_start);
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 5acc64954a88..72f64bd0a6eb 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -495,10 +495,9 @@ struct drm_sched_backend_ops {
  *              waits on this wait queue until all the scheduled jobs are
  *              finished.
  * @job_id_count: used to assign unique id to the each job.
- * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
+ * @submit_wq: workqueue used to queue @work_run_job
  * @timeout_wq: workqueue used to queue @work_tdr
  * @work_run_job: work which calls run_job op of each scheduler.
- * @work_free_job: work which calls free_job op of each scheduler.
  * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
  *            timeout interval is over.
  * @pending_list: the list of jobs which are currently in the job queue.
@@ -528,7 +527,6 @@ struct drm_gpu_scheduler {
 	struct workqueue_struct *submit_wq;
 	struct workqueue_struct *timeout_wq;
 	struct work_struct work_run_job;
-	struct work_struct work_free_job;
 	struct delayed_work work_tdr;
 	struct list_head pending_list;
 	spinlock_t job_list_lock;
-- 
2.34.1
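
P.S. For reviewers reading the diff without the scheduler sources handy:
before f7fe64ad0f22, one work item serviced both halves of the scheduler
loop (free at most one finished job, submit at most one new job, then
re-queue itself); that commit split free_job into a second, independent
work item. Below is a minimal userspace model of the merged loop this
revert restores. It is only an illustration, not kernel code: the jobs
array, the free_idx/run_idx indices, and every function in it are
invented for the sketch, and the "fence" is reduced to a bool that
signals instantly.

/* build: cc -o sched_model sched_model.c && ./sched_model */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Toy stand-in for sched->pending_list. */
struct job {
	int id;
	bool finished;
};

static struct job jobs[] = {
	{ .id = 1, .finished = true },	/* submitted earlier, already signaled */
	{ .id = 2 },
	{ .id = 3 },
};
static size_t free_idx;		/* oldest job not yet freed */
static size_t run_idx = 1;	/* next job to submit (job 1 already ran) */

/* Model of drm_sched_get_finished_job(): oldest job whose fence signaled. */
static struct job *get_finished_job(void)
{
	if (free_idx < run_idx && jobs[free_idx].finished)
		return &jobs[free_idx++];
	return NULL;
}

/* Model of drm_sched_entity_pop_job() + ops->run_job(). */
static struct job *run_next_job(void)
{
	if (run_idx >= sizeof(jobs) / sizeof(jobs[0]))
		return NULL;
	jobs[run_idx].finished = true;	/* pretend the fence signals at once */
	return &jobs[run_idx++];
}

/*
 * One pass of the merged worker: free at most one finished job, submit
 * at most one new job, and report whether the work item should re-queue.
 */
static bool worker_pass(void)
{
	struct job *cleanup_job = get_finished_job();
	struct job *next = run_next_job();

	if (cleanup_job)
		printf("free_job(%d)\n", cleanup_job->id);
	if (next)
		printf("run_job(%d)\n", next->id);

	return cleanup_job || next;	/* more work -> requeue */
}

int main(void)
{
	/* The requeue loop stands in for drm_sched_run_job_queue(). */
	while (worker_pass())
		;
	return 0;
}

As I understand it, the point of the split was to let freeing and
submission stop blocking each other; the merged loop serializes them
again, which is the behavior this revert goes back to until the hang is
root-caused.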