On 06/01/2018 09:56 PM, Andrey Grodzovsky wrote:
>
>
> On 06/01/2018 01:22 PM, Christian König wrote:
>> On 06/01/2018 07:11 PM, Andrey Grodzovsky wrote:
>>> A dying process might be blocked from receiving any more signals,
>>> so avoid relying on that.
>>>
>>> Also retire entity->fini_status and just check the SW queue;
>>> if it's not empty, do the fallback cleanup.
>>>
>>> Also handle the entity->last_scheduled == NULL case, which
>>> happens when the HW ring is already hanged when a new entity
>>> tries to enqueue jobs.
>>>
>>> v2:
>>> Return the remaining timeout and use that as parameter for the next
>>> call. This way, when we need to clean up multiple queues, we don't
>>> wait for the entire TO period for each queue but rather in total.
>>> Styling comments.
>>> Rebase.
>>>
>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
>>> ---
>>>  drivers/gpu/drm/scheduler/gpu_scheduler.c | 74 ++++++++++++++++++++++++-------
>>>  include/drm/gpu_scheduler.h               |  7 +--
>>>  2 files changed, 61 insertions(+), 20 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c
>>> index 8c1e80c..c594d17 100644
>>> --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c
>>> +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c
>>> @@ -181,7 +181,6 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched,
>>>      entity->rq = rq;
>>>      entity->sched = sched;
>>>      entity->guilty = guilty;
>>> -   entity->fini_status = 0;
>>>      entity->last_scheduled = NULL;
>>>
>>>      spin_lock_init(&entity->rq_lock);
>>> @@ -219,7 +218,8 @@ static bool drm_sched_entity_is_initialized(struct drm_gpu_scheduler *sched,
>>>  static bool drm_sched_entity_is_idle(struct drm_sched_entity *entity)
>>>  {
>>>      rmb();
>>> -   if (spsc_queue_peek(&entity->job_queue) == NULL)
>>> +
>>> +   if (!entity->rq || spsc_queue_peek(&entity->job_queue) == NULL)
>>>          return true;
>>>
>>>      return false;
>>> @@ -260,25 +260,48 @@ static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
>>>   *
>>>   * @sched: scheduler instance
>>>   * @entity: scheduler entity
>>> + * @timeout: time to wait in ms for Q to become empty.
>>>   *
>>>   * Splitting drm_sched_entity_fini() into two functions, The first one does the waiting,
>>>   * removes the entity from the runqueue and returns an error when the process was killed.
>>> + *
>>> + * Returns amount of time spent in waiting for TO.
>>> + * 0 if wait wasn't with time out.
>>> + * MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS if wait timed out with condition false
>>> + * Number of MS spent in waiting before condition became true
>>> + *
>>>   */
>>> -void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
>>> -              struct drm_sched_entity *entity)
>>> +unsigned drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
>>> +              struct drm_sched_entity *entity, unsigned timeout)
>>
>> Better use long for return type and timeout.
>>
>>>  {
>>> +   unsigned ret = 0;
>>
>> Also use a long here and initialize it with timeout.
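Roughly something like this is what I have in mind, with the timeout in
jiffies. Just a sketch of the interface, not a finished implementation:

    /* Sketch only: timeout is in jiffies, the remaining time is returned. */
    long drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
                                     struct drm_sched_entity *entity,
                                     long timeout)
    {
            long ret = timeout;

            if (!drm_sched_entity_is_initialized(sched, entity))
                    return ret;

            if (current->flags & PF_EXITING) {
                    if (timeout)
                            ret = wait_event_timeout(sched->job_scheduled,
                                            drm_sched_entity_is_idle(entity),
                                            timeout);
            } else {
                    wait_event_killable(sched->job_scheduled,
                                        drm_sched_entity_is_idle(entity));
            }

            return ret;
    }

wait_event_timeout() already hands back the remaining jiffies (and 0 when
it ran into the timeout), so no ms conversion is needed in here at all.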
>
> Please see below.
>
>>
>>> +
>>>      if (!drm_sched_entity_is_initialized(sched, entity))
>>>          return;
>>>      /**
>>>       * The client will not queue more IBs during this fini, consume existing
>>>       * queued IBs or discard them on SIGKILL
>>>      */
>>> -   if ((current->flags & PF_SIGNALED) && current->exit_code == SIGKILL)
>>> -       entity->fini_status = -ERESTARTSYS;
>>> -   else
>>> -       entity->fini_status = wait_event_killable(sched->job_scheduled,
>>> -                   drm_sched_entity_is_idle(entity));
>>> -   drm_sched_entity_set_rq(entity, NULL);
>>> +   if (current->flags & PF_EXITING) {
>>> +       if (timeout) {
>>> +           ret = jiffies_to_msecs(
>>> +                   wait_event_timeout(
>>> +                           sched->job_scheduled,
>>> +                           drm_sched_entity_is_idle(entity),
>>> +                           msecs_to_jiffies(timeout)));
>>
>> Oh please don't use msecs as timeout, just use jiffies and let the
>> caller do the conversion.
>>
>>> +
>>> +           if (!ret)
>>> +               ret = MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS;
>>
>> Why that? It is common coding style to return 0 when a timeout occurs.
>>
>> Christian.
>
> What should I return when I do wait_event_killable? Its return values
> are the opposite of wait_event_timeout's...

Just the unmodified timeout. The timeout should begin only after the
process is killed.

> This way returning 0 has no impact on the remaining waiting time; doing
> it the other way will force the caller to do some cumbersome logic
> instead of just
>
> max_wait = max_wait >= ret ? max_wait - ret : 0;
>
> like in amdgpu_ctx_mgr_entity_fini.

Hui? Why do you want to fiddle with the max_wait here at all?

The usual pattern for using a timeout across multiple wait_event_timeout()
calls is the following:

timeout = MAX_TIMEOUT;
while (more_events_to_handle) {
    timeout = wait_event_timeout(... timeout);
    if (timeout == 0)
        break;
}

if (timeout == 0)
    we_timeout_out_waiting_for_all_events();
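Applied to the entity cleanup this just means the caller keeps feeding the
remaining jiffies from one call into the next. A rough sketch only (the
'entities' list and its 'list' member are made up here for illustration,
this is not code from your patch):

long timeout = msecs_to_jiffies(MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS);

/* One shared time budget for all entities instead of a full
 * timeout for each one. 'entities' is a hypothetical list. */
list_for_each_entry(entity, &entities, list) {
        timeout = drm_sched_entity_do_release(entity->sched, entity,
                                              timeout);
        if (timeout == 0)
                break;          /* budget exhausted, stop waiting */
}

/* drm_sched_entity_cleanup() is still called for every entity
 * afterwards and forcefully kills whatever is left in its queue. */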
Christian.

>
> Andrey
>>
>>> +       }
>>> +   } else
>>> +       wait_event_killable(sched->job_scheduled, drm_sched_entity_is_idle(entity));
>>> +
>>> +
>>> +   /* For killed process disable any more IBs enqueue right now */
>>> +   if ((current->flags & PF_EXITING) && (current->exit_code == SIGKILL))
>>> +       drm_sched_entity_set_rq(entity, NULL);
>>> +
>>> +   return ret;
>>>  }
>>>  EXPORT_SYMBOL(drm_sched_entity_do_release);
>>>
>>> @@ -290,11 +313,18 @@ EXPORT_SYMBOL(drm_sched_entity_do_release);
>>>   *
>>>   * This should be called after @drm_sched_entity_do_release. It goes over the
>>>   * entity and signals all jobs with an error code if the process was killed.
>>> + *
>>>   */
>>>  void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
>>>                 struct drm_sched_entity *entity)
>>>  {
>>> -   if (entity->fini_status) {
>>> +
>>> +   drm_sched_entity_set_rq(entity, NULL);
>>> +
>>> +   /* Consumption of existing IBs wasn't completed. Forcefully
>>> +    * remove them here.
>>> +    */
>>> +   if (spsc_queue_peek(&entity->job_queue)) {
>>>          struct drm_sched_job *job;
>>>          int r;
>>>
>>> @@ -314,12 +344,22 @@ void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
>>>              struct drm_sched_fence *s_fence = job->s_fence;
>>>              drm_sched_fence_scheduled(s_fence);
>>>              dma_fence_set_error(&s_fence->finished, -ESRCH);
>>> -           r = dma_fence_add_callback(entity->last_scheduled, &job->finish_cb,
>>> -                           drm_sched_entity_kill_jobs_cb);
>>> -           if (r == -ENOENT)
>>> +
>>> +           /*
>>> +            * When pipe is hanged by older entity, new entity might
>>> +            * not even have chance to submit it's first job to HW
>>> +            * and so entity->last_scheduled will remain NULL
>>> +            */
>>> +           if (!entity->last_scheduled) {
>>>                  drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
>>> -           else if (r)
>>> -               DRM_ERROR("fence add callback failed (%d)\n", r);
>>> +           } else {
>>> +               r = dma_fence_add_callback(entity->last_scheduled, &job->finish_cb,
>>> +                                          drm_sched_entity_kill_jobs_cb);
>>> +               if (r == -ENOENT)
>>> +                   drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
>>> +               else if (r)
>>> +                   DRM_ERROR("fence add callback failed (%d)\n", r);
>>> +           }
>>>          }
>>>      }
>>>
>>> @@ -339,7 +379,7 @@ EXPORT_SYMBOL(drm_sched_entity_cleanup);
>>>  void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
>>>                  struct drm_sched_entity *entity)
>>>  {
>>> -   drm_sched_entity_do_release(sched, entity);
>>> +   drm_sched_entity_do_release(sched, entity, MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS);
>>>      drm_sched_entity_cleanup(sched, entity);
>>>  }
>>>  EXPORT_SYMBOL(drm_sched_entity_fini);
>>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>>> index 496442f..af07875 100644
>>> --- a/include/drm/gpu_scheduler.h
>>> +++ b/include/drm/gpu_scheduler.h
>>> @@ -27,6 +27,8 @@
>>>  #include <drm/spsc_queue.h>
>>>  #include <linux/dma-fence.h>
>>>
>>> +#define MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS 1000
>>
>> I suggest to use msecs_to_jiffies(1000) here and drop the _MS postfix.
>>
>> Christian.
>>
>>> +
>>>  struct drm_gpu_scheduler;
>>>  struct drm_sched_rq;
>>>
>>> @@ -84,7 +86,6 @@ struct drm_sched_entity {
>>>      struct dma_fence       *dependency;
>>>      struct dma_fence_cb       cb;
>>>      atomic_t           *guilty;
>>> -   int                            fini_status;
>>>      struct dma_fence               *last_scheduled;
>>>  };
>>>
>>> @@ -283,8 +284,8 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched,
>>>                struct drm_sched_entity *entity,
>>>                struct drm_sched_rq *rq,
>>>                atomic_t *guilty);
>>> -void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
>>> -              struct drm_sched_entity *entity);
>>> +unsigned drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
>>> +              struct drm_sched_entity *entity, unsigned timeout);
>>>  void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
>>>                 struct drm_sched_entity *entity);
>>>  void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
>>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx