On 06/01/2018 09:56 PM, Andrey Grodzovsky wrote:
>
>
> On 06/01/2018 01:22 PM, Christian König wrote:
>> On 06/01/2018 07:11 PM, Andrey Grodzovsky wrote:
>>> A dying process might be blocked from receiving any more signals,
>>> so avoid relying on that.
>>>
>>> Also retire entity->fini_status and just check the SW queue;
>>> if it's not empty, do the fallback cleanup.
>>>
>>> Also handle the entity->last_scheduled == NULL case, which
>>> happens when the HW ring is already hanged when a new entity
>>> tries to enqueue jobs.
>>>
>>> v2:
>>> Return the remaining timeout and use that as parameter for the next
>>> call. This way, when we need to clean up multiple queues, we don't
>>> wait for the entire TO period for each queue but rather in total.
>>> Styling comments.
>>> Rebase.
>>>
>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
>>> ---
>>>  drivers/gpu/drm/scheduler/gpu_scheduler.c | 74 ++++++++++++++++++++++++-------
>>>  include/drm/gpu_scheduler.h               |  7 +--
>>>  2 files changed, 61 insertions(+), 20 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c
>>> index 8c1e80c..c594d17 100644
>>> --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c
>>> +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c
>>> @@ -181,7 +181,6 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched,
>>>      entity->rq = rq;
>>>      entity->sched = sched;
>>>      entity->guilty = guilty;
>>> -   entity->fini_status = 0;
>>>      entity->last_scheduled = NULL;
>>>
>>>      spin_lock_init(&entity->rq_lock);
>>> @@ -219,7 +218,8 @@ static bool drm_sched_entity_is_initialized(struct drm_gpu_scheduler *sched,
>>>  static bool drm_sched_entity_is_idle(struct drm_sched_entity *entity)
>>>  {
>>>      rmb();
>>> -   if (spsc_queue_peek(&entity->job_queue) == NULL)
>>> +
>>> +   if (!entity->rq || spsc_queue_peek(&entity->job_queue) == NULL)
>>>          return true;
>>>
>>>      return false;
>>> @@ -260,25 +260,48 @@ static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
>>>   *
>>>   * @sched: scheduler instance
>>>   * @entity: scheduler entity
>>> + * @timeout: time to wait in ms for Q to become empty.
>>>   *
>>>   * Splitting drm_sched_entity_fini() into two functions, The first one does the waiting,
>>>   * removes the entity from the runqueue and returns an error when the process was killed.
>>> + *
>>> + * Returns amount of time spent in waiting for TO.
>>> + * 0 if wait wasn't with time out.
>>> + * MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS if wait timed out with condition false
>>> + * Number of MS spent in waiting before condition became true
>>> + *
>>>   */
>>> -void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
>>> -              struct drm_sched_entity *entity)
>>> +unsigned drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
>>> +              struct drm_sched_entity *entity, unsigned timeout)
>>
>> Better use long for return type and timeout.
>>
>>>  {
>>> +   unsigned ret = 0;
>>
>> Also use a long here and initialize it with timeout.
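Roughly something like this is what I have in mind, with the timeout in
jiffies. Just a sketch of the interface, not a finished implementation:

    /* Sketch only: timeout is in jiffies, the remaining time is returned. */
    long drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
                                     struct drm_sched_entity *entity,
                                     long timeout)
    {
            long ret = timeout;

            if (!drm_sched_entity_is_initialized(sched, entity))
                    return ret;

            if (current->flags & PF_EXITING) {
                    if (timeout)
                            ret = wait_event_timeout(sched->job_scheduled,
                                            drm_sched_entity_is_idle(entity),
                                            timeout);
            } else {
                    wait_event_killable(sched->job_scheduled,
                                        drm_sched_entity_is_idle(entity));
            }

            return ret;
    }

wait_event_timeout() already hands back the remaining jiffies (and 0 when
it ran into the timeout), so no ms conversion is needed in here at all.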
>
> Please see below.
>
>>
>>> +
>>>      if (!drm_sched_entity_is_initialized(sched, entity))
>>>          return;
>>>      /**
>>>       * The client will not queue more IBs during this fini, consume existing
>>>       * queued IBs or discard them on SIGKILL
>>>      */
>>> -   if ((current->flags & PF_SIGNALED) && current->exit_code == SIGKILL)
>>> -       entity->fini_status = -ERESTARTSYS;
>>> -   else
>>> -       entity->fini_status = wait_event_killable(sched->job_scheduled,
>>> -                   drm_sched_entity_is_idle(entity));
>>> -   drm_sched_entity_set_rq(entity, NULL);
>>> +   if (current->flags & PF_EXITING) {
>>> +       if (timeout) {
>>> +           ret = jiffies_to_msecs(
>>> +                   wait_event_timeout(
>>> +                           sched->job_scheduled,
>>> +                           drm_sched_entity_is_idle(entity),
>>> +                           msecs_to_jiffies(timeout)));
>>
>> Oh please don't use msecs as timeout, just use jiffies and let the
>> caller do the conversion.
>>
>>> +
>>> +           if (!ret)
>>> +               ret = MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS;
>>
>> Why that? It is common coding style to return 0 when a timeout occurs.
>>
>> Christian.
>
> What should I return when I do wait_event_killable? Its return values
> are the opposite of wait_event_timeout's...

Just the unmodified timeout. The timeout should begin only after the
process is killed.

> This way returning 0 has no impact on the remaining waiting time; doing
> it the other way will force the caller to do some cumbersome logic
> instead of just
>
> max_wait = max_wait >= ret ? max_wait - ret : 0;
>
> like in amdgpu_ctx_mgr_entity_fini.

Hui? Why do you want to fiddle with the max_wait here at all?

The usual pattern for using a timeout across multiple wait_event_timeout()
calls is the following:

timeout = MAX_TIMEOUT;
while (more_events_to_handle) {
    timeout = wait_event_timeout(... timeout);
    if (timeout == 0)
        break;
}

if (timeout == 0)
    we_timeout_out_waiting_for_all_events();
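Applied to the entity cleanup this just means the caller keeps feeding the
remaining jiffies from one call into the next. A rough sketch only (the
'entities' list and its 'list' member are made up here for illustration,
this is not code from your patch):

long timeout = msecs_to_jiffies(MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS);

/* One shared time budget for all entities instead of a full
 * timeout for each one. 'entities' is a hypothetical list. */
list_for_each_entry(entity, &entities, list) {
        timeout = drm_sched_entity_do_release(entity->sched, entity,
                                              timeout);
        if (timeout == 0)
                break;          /* budget exhausted, stop waiting */
}

/* drm_sched_entity_cleanup() is still called for every entity
 * afterwards and forcefully kills whatever is left in its queue. */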
Christian.

>
> Andrey
>>
>>> +       }
>>> +   } else
>>> +       wait_event_killable(sched->job_scheduled, drm_sched_entity_is_idle(entity));
>>> +
>>> +
>>> +   /* For killed process disable any more IBs enqueue right now */
>>> +   if ((current->flags & PF_EXITING) && (current->exit_code == SIGKILL))
>>> +       drm_sched_entity_set_rq(entity, NULL);
>>> +
>>> +   return ret;
>>>  }
>>>  EXPORT_SYMBOL(drm_sched_entity_do_release);
>>>
>>> @@ -290,11 +313,18 @@ EXPORT_SYMBOL(drm_sched_entity_do_release);
>>>   *
>>>   * This should be called after @drm_sched_entity_do_release. It goes over the
>>>   * entity and signals all jobs with an error code if the process was killed.
>>> + *
>>>   */
>>>  void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
>>>                 struct drm_sched_entity *entity)
>>>  {
>>> -   if (entity->fini_status) {
>>> +
>>> +   drm_sched_entity_set_rq(entity, NULL);
>>> +
>>> +   /* Consumption of existing IBs wasn't completed. Forcefully
>>> +    * remove them here.
>>> +    */
>>> +   if (spsc_queue_peek(&entity->job_queue)) {
>>>          struct drm_sched_job *job;
>>>          int r;
>>>
>>> @@ -314,12 +344,22 @@ void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
>>>              struct drm_sched_fence *s_fence = job->s_fence;
>>>              drm_sched_fence_scheduled(s_fence);
>>>              dma_fence_set_error(&s_fence->finished, -ESRCH);
>>> -           r = dma_fence_add_callback(entity->last_scheduled, &job->finish_cb,
>>> -                           drm_sched_entity_kill_jobs_cb);
>>> -           if (r == -ENOENT)
>>> +
>>> +           /*
>>> +            * When pipe is hanged by older entity, new entity might
>>> +            * not even have chance to submit it's first job to HW
>>> +            * and so entity->last_scheduled will remain NULL
>>> +            */
>>> +           if (!entity->last_scheduled) {
>>>                  drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
>>> -           else if (r)
>>> -               DRM_ERROR("fence add callback failed (%d)\n", r);
>>> +           } else {
>>> +               r = dma_fence_add_callback(entity->last_scheduled, &job->finish_cb,
>>> +                                          drm_sched_entity_kill_jobs_cb);
>>> +               if (r == -ENOENT)
>>> +                   drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
>>> +               else if (r)
>>> +                   DRM_ERROR("fence add callback failed (%d)\n", r);
>>> +           }
>>>          }
>>>      }
>>>
>>> @@ -339,7 +379,7 @@ EXPORT_SYMBOL(drm_sched_entity_cleanup);
>>>  void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
>>>                  struct drm_sched_entity *entity)
>>>  {
>>> -   drm_sched_entity_do_release(sched, entity);
>>> +   drm_sched_entity_do_release(sched, entity, MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS);
>>>      drm_sched_entity_cleanup(sched, entity);
>>>  }
>>>  EXPORT_SYMBOL(drm_sched_entity_fini);
>>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>>> index 496442f..af07875 100644
>>> --- a/include/drm/gpu_scheduler.h
>>> +++ b/include/drm/gpu_scheduler.h
>>> @@ -27,6 +27,8 @@
>>>  #include <drm/spsc_queue.h>
>>>  #include <linux/dma-fence.h>
>>>
>>> +#define MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS 1000
>>
>> I suggest to use msecs_to_jiffies(1000) here and drop the _MS postfix.
>>
>> Christian.
>>
>>> +
>>>  struct drm_gpu_scheduler;
>>>  struct drm_sched_rq;
>>>
>>> @@ -84,7 +86,6 @@ struct drm_sched_entity {
>>>      struct dma_fence       *dependency;
>>>      struct dma_fence_cb       cb;
>>>      atomic_t           *guilty;
>>> -   int                            fini_status;
>>>      struct dma_fence               *last_scheduled;
>>>  };
>>>
>>> @@ -283,8 +284,8 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched,
>>>                struct drm_sched_entity *entity,
>>>                struct drm_sched_rq *rq,
>>>                atomic_t *guilty);
>>> -void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
>>> -              struct drm_sched_entity *entity);
>>> +unsigned drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
>>> +              struct drm_sched_entity *entity, unsigned timeout);
>>>  void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
>>>                 struct drm_sched_entity *entity);
>>>  void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
>>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx