On 06/01/2018 01:22 PM, Christian König wrote: > Am 01.06.2018 um 19:11 schrieb Andrey Grodzovsky: >> Dying process might be blocked from receiving any more signals >> so avoid using it. >> >> Also retire enity->fini_status and just check the SW queue, >> if it's not empty do the fallback cleanup. >> >> Also handle entity->last_scheduled == NULL use case which >> happens when HW ring is already hangged whem a new entity >> tried to enqeue jobs. >> >> v2: >> Return the remaining timeout and use that as parameter for the next >> call. >> This way when we need to cleanup multiple queues we don't wait for the >> entire TO period for each queue but rather in total. >> Styling comments. >> Rebase. >> >> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com> >> --- >>  drivers/gpu/drm/scheduler/gpu_scheduler.c | 74 >> ++++++++++++++++++++++++------- >>  include/drm/gpu_scheduler.h              | 7 +-- >>  2 files changed, 61 insertions(+), 20 deletions(-) >> >> diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c >> b/drivers/gpu/drm/scheduler/gpu_scheduler.c >> index 8c1e80c..c594d17 100644 >> --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c >> +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c >> @@ -181,7 +181,6 @@ int drm_sched_entity_init(struct >> drm_gpu_scheduler *sched, >>      entity->rq = rq; >>      entity->sched = sched; >>      entity->guilty = guilty; >> -   entity->fini_status = 0; >>      entity->last_scheduled = NULL; >>       spin_lock_init(&entity->rq_lock); >> @@ -219,7 +218,8 @@ static bool >> drm_sched_entity_is_initialized(struct drm_gpu_scheduler *sched, >>  static bool drm_sched_entity_is_idle(struct drm_sched_entity *entity) >>  { >>      rmb(); >> -   if (spsc_queue_peek(&entity->job_queue) == NULL) >> + >> +   if (!entity->rq || spsc_queue_peek(&entity->job_queue) == NULL) >>          return true; >>       return false; >> @@ -260,25 +260,48 @@ static void >> drm_sched_entity_kill_jobs_cb(struct dma_fence *f, >>   * >>   * @sched: scheduler instance >>   * @entity: scheduler entity >> + * @timeout: time to wait in ms for Q to become empty. >>   * >>   * Splitting drm_sched_entity_fini() into two functions, The first >> one does the waiting, >>   * removes the entity from the runqueue and returns an error when >> the process was killed. >> + * >> + * Returns amount of time spent in waiting for TO. >> + * 0 if wait wasn't with time out. >> + * MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS if wait timed out with condition >> false >> + * Number of MS spent in waiting before condition became true >> + * >>   */ >> -void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched, >> -              struct drm_sched_entity *entity) >> +unsigned drm_sched_entity_do_release(struct drm_gpu_scheduler *sched, >> +              struct drm_sched_entity *entity, unsigned timeout) > > Better use long for return type and timeout. > >>  { >> +   unsigned ret = 0; > > Also use a long here and initialize it with timeout. Please see bellow > >> + >>      if (!drm_sched_entity_is_initialized(sched, entity)) >>          return; >>      /** >>       * The client will not queue more IBs during this fini, consume >> existing >>       * queued IBs or discard them on SIGKILL >>      */ >> -   if ((current->flags & PF_SIGNALED) && current->exit_code == >> SIGKILL) >> -       entity->fini_status = -ERESTARTSYS; >> -   else >> -       entity->fini_status = wait_event_killable(sched->job_scheduled, >> -                   drm_sched_entity_is_idle(entity)); >> -   drm_sched_entity_set_rq(entity, NULL); >> +   if (current->flags & PF_EXITING) { >> +       if (timeout) { >> +           ret = jiffies_to_msecs( >> +                   wait_event_timeout( >> +                           sched->job_scheduled, >> +                           drm_sched_entity_is_idle(entity), >> +                           msecs_to_jiffies(timeout))); > > Oh please don't use msecs as timeout, just use jiffies and let the > caller do the conversion. > >> + >> +           if (!ret) >> +               ret = MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS; > > Why that? It is common coding style to return 0 when a timeout occurs. > > Christian. What should i return when i do wait_event_killable, it's return values are opposite to wait_event_timeout... This way returning 0 has no impact on remaining waiting time, dong it the other way will force the caller to do some cumbersome logic instead of just max_wait = max_wait >= ret ? max_wait - ret : 0; like in amdgpu_ctx_mgr_entity_fini Andrey > >> +       } >> +   } else >> +       wait_event_killable(sched->job_scheduled, >> drm_sched_entity_is_idle(entity)); >> + >> + >> +   /* For killed process disable any more IBs enqueue right now */ >> +   if ((current->flags & PF_EXITING) && (current->exit_code == >> SIGKILL)) >> +       drm_sched_entity_set_rq(entity, NULL); >> + >> +   return ret; >>  } >>  EXPORT_SYMBOL(drm_sched_entity_do_release); >>  @@ -290,11 +313,18 @@ EXPORT_SYMBOL(drm_sched_entity_do_release); >>   * >>   * This should be called after @drm_sched_entity_do_release. It >> goes over the >>   * entity and signals all jobs with an error code if the process >> was killed. >> + * >>   */ >>  void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched, >>                 struct drm_sched_entity *entity) >>  { >> -   if (entity->fini_status) { >> + >> +   drm_sched_entity_set_rq(entity, NULL); >> + >> +   /* Consumption of existing IBs wasn't completed. Forcefully >> +    * remove them here. >> +    */ >> +   if (spsc_queue_peek(&entity->job_queue)) { >>          struct drm_sched_job *job; >>          int r; >>  @@ -314,12 +344,22 @@ void drm_sched_entity_cleanup(struct >> drm_gpu_scheduler *sched, >>              struct drm_sched_fence *s_fence = job->s_fence; >>              drm_sched_fence_scheduled(s_fence); >>              dma_fence_set_error(&s_fence->finished, -ESRCH); >> -           r = dma_fence_add_callback(entity->last_scheduled, >> &job->finish_cb, >> -                           drm_sched_entity_kill_jobs_cb); >> -           if (r == -ENOENT) >> + >> +           /* >> +            * When pipe is hanged by older entity, new entity might >> +            * not even have chance to submit it's first job to HW >> +            * and so entity->last_scheduled will remain NULL >> +            */ >> +           if (!entity->last_scheduled) { >>                  drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb); >> -           else if (r) >> -               DRM_ERROR("fence add callback failed (%d)\n", r); >> +           } else { >> +               r = dma_fence_add_callback(entity->last_scheduled, >> &job->finish_cb, >> +                               drm_sched_entity_kill_jobs_cb); >> +               if (r == -ENOENT) >> +                   drm_sched_entity_kill_jobs_cb(NULL, >> &job->finish_cb); >> +               else if (r) >> +                   DRM_ERROR("fence add callback failed (%d)\n", r); >> +           } >>          } >>      } >>  @@ -339,7 +379,7 @@ EXPORT_SYMBOL(drm_sched_entity_cleanup); >>  void drm_sched_entity_fini(struct drm_gpu_scheduler *sched, >>                  struct drm_sched_entity *entity) >>  { >> -   drm_sched_entity_do_release(sched, entity); >> +   drm_sched_entity_do_release(sched, entity, >> MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS); >>      drm_sched_entity_cleanup(sched, entity); >>  } >>  EXPORT_SYMBOL(drm_sched_entity_fini); >> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h >> index 496442f..af07875 100644 >> --- a/include/drm/gpu_scheduler.h >> +++ b/include/drm/gpu_scheduler.h >> @@ -27,6 +27,8 @@ >>  #include <drm/spsc_queue.h> >>  #include <linux/dma-fence.h> >>  +#define MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS 1000 > > I suggest to use msecs_to_jiffies(1000) here and drop the _MS postfix. > > Christian. > >> + >>  struct drm_gpu_scheduler; >>  struct drm_sched_rq; >>  @@ -84,7 +86,6 @@ struct drm_sched_entity { >>      struct dma_fence       *dependency; >>      struct dma_fence_cb       cb; >>      atomic_t           *guilty; >> -   int                            fini_status; >>      struct dma_fence               *last_scheduled; >>  }; >>  @@ -283,8 +284,8 @@ int drm_sched_entity_init(struct >> drm_gpu_scheduler *sched, >>                struct drm_sched_entity *entity, >>                struct drm_sched_rq *rq, >>                atomic_t *guilty); >> -void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched, >> -              struct drm_sched_entity *entity); >> +unsigned drm_sched_entity_do_release(struct drm_gpu_scheduler *sched, >> +              struct drm_sched_entity *entity, unsigned timeout); >>  void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched, >>                 struct drm_sched_entity *entity); >>  void drm_sched_entity_fini(struct drm_gpu_scheduler *sched, >