Dying process might be blocked from receiving any more signals so avoid using it. Also retire enity->fini_status and just check the SW queue, if it's not empty do the fallback cleanup. Also handle entity->last_scheduled == NULL use case which happens when HW ring is already hangged whem a new entity tried to enqeue jobs. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com> --- drivers/gpu/drm/scheduler/gpu_scheduler.c | 47 ++++++++++++++++++++++--------- include/drm/gpu_scheduler.h | 1 - 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c index 44d4807..4d038f9 100644 --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c @@ -135,7 +135,6 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched, entity->rq = rq; entity->sched = sched; entity->guilty = guilty; - entity->fini_status = 0; entity->last_scheduled = NULL; spin_lock_init(&entity->rq_lock); @@ -173,7 +172,8 @@ static bool drm_sched_entity_is_initialized(struct drm_gpu_scheduler *sched, static bool drm_sched_entity_is_idle(struct drm_sched_entity *entity) { rmb(); - if (spsc_queue_peek(&entity->job_queue) == NULL) + + if (entity->rq == NULL || spsc_queue_peek(&entity->job_queue) == NULL) return true; return false; @@ -227,12 +227,16 @@ void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched, * The client will not queue more IBs during this fini, consume existing * queued IBs or discard them on SIGKILL */ - if ((current->flags & PF_SIGNALED) && current->exit_code == SIGKILL) - entity->fini_status = -ERESTARTSYS; + if ((current->flags & PF_EXITING)) + wait_event_timeout(sched->job_scheduled, + drm_sched_entity_is_idle(entity), msecs_to_jiffies(1000)); else - entity->fini_status = wait_event_killable(sched->job_scheduled, - drm_sched_entity_is_idle(entity)); - drm_sched_entity_set_rq(entity, NULL); + wait_event_killable(sched->job_scheduled, drm_sched_entity_is_idle(entity)); + + + /* For killed process disable any more IBs enqueue right now */ + if ((current->flags & PF_EXITING) && (current->exit_code == SIGKILL)) + drm_sched_entity_set_rq(entity, NULL); } EXPORT_SYMBOL(drm_sched_entity_do_release); @@ -247,7 +251,13 @@ EXPORT_SYMBOL(drm_sched_entity_do_release); void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched, struct drm_sched_entity *entity) { - if (entity->fini_status) { + + drm_sched_entity_set_rq(entity, NULL); + + /* Consumption of existing IBs wasn't completed. Forcefully + * remove them here. + */ + if (spsc_queue_peek(&entity->job_queue)) { struct drm_sched_job *job; int r; @@ -267,12 +277,22 @@ void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched, struct drm_sched_fence *s_fence = job->s_fence; drm_sched_fence_scheduled(s_fence); dma_fence_set_error(&s_fence->finished, -ESRCH); - r = dma_fence_add_callback(entity->last_scheduled, &job->finish_cb, - drm_sched_entity_kill_jobs_cb); - if (r == -ENOENT) + + /* + * When pipe is hanged by older entity, new entity might + * not even have chance to submit it's first job to HW + * and so entity->last_scheduled will remain NULL + */ + if (!entity->last_scheduled) drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb); - else if (r) - DRM_ERROR("fence add callback failed (%d)\n", r); + else { + r = dma_fence_add_callback(entity->last_scheduled, &job->finish_cb, + drm_sched_entity_kill_jobs_cb); + if (r == -ENOENT) + drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb); + else if (r) + DRM_ERROR("fence add callback failed (%d)\n", r); + } } } @@ -713,6 +733,7 @@ static int drm_sched_main(void *param) continue; sched_job = drm_sched_entity_pop_job(entity); + if (!sched_job) continue; diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index dec6558..d220ac9 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -64,7 +64,6 @@ struct drm_sched_entity { struct dma_fence *dependency; struct dma_fence_cb cb; atomic_t *guilty; /* points to ctx's guilty */ - int fini_status; struct dma_fence *last_scheduled; }; -- 2.7.4