On 20.09.2018 at 13:25, Nayan Deshmukh wrote:
On 18.09.2018 at 18:17, Nayan Deshmukh wrote:
> having a delayed work item per job is redundant as we only need one
> per scheduler to track the timeout of the currently executing job.
Well, that looks simpler than I thought it would be.

But it shows the next problem: the timeout and the completion could race.

As far as I can see that can be fixed by moving the
dma_fence_remove_callback()/dma_fence_add_callback() dance from
drm_sched_hw_job_reset() to drm_sched_job_timedout().
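Just to illustrate the idea, something along these lines in
drm_sched_job_timedout() (completely untested sketch; it reuses the
s_fence->parent/s_fence->cb pair that drm_sched_hw_job_reset() already
operates on, and leaves re-adding the callback for the recovery path out):

static void drm_sched_job_timedout(struct work_struct *work)
{
        struct drm_gpu_scheduler *sched = container_of(work,
                                                struct drm_gpu_scheduler,
                                                work_tdr.work);
        struct drm_sched_job *job = sched->curr_job;

        /*
         * Let removing the callback decide the race: if it can no longer
         * be removed, the hardware fence has already signaled and the job
         * completed, so the timeout is a false positive and we can bail.
         */
        if (job->s_fence->parent &&
            !dma_fence_remove_callback(job->s_fence->parent,
                                       &job->s_fence->cb))
                return;

        job->sched->ops->timedout_job(job);
}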
Anyway, I would say drop patch #1, fix the one comment below, and we can
use this.
>
> Signed-off-by: Nayan Deshmukh <nayan26deshmukh@xxxxxxxxx>
> Suggested-by: Christian König <christian.koenig@xxxxxxx>
> ---
>  drivers/gpu/drm/scheduler/sched_main.c | 16 +++++++++-------
>  include/drm/gpu_scheduler.h            |  6 +++---
>  2 files changed, 12 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index 0e6ccc8243db..f213b5c7f718 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -198,7 +198,7 @@ static void drm_sched_job_finish(struct work_struct *work)
>           * manages to find this job as the next job in the list, the fence
>           * signaled check below will prevent the timeout to be restarted.
>           */
> -        cancel_delayed_work_sync(&s_job->work_tdr);
> +        cancel_delayed_work_sync(&sched->work_tdr);
>  
>          spin_lock(&sched->job_list_lock);
>          /* queue TDR for next job */
> @@ -207,7 +207,7 @@ static void drm_sched_job_finish(struct work_struct *work)
>          if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>              !list_is_last(&s_job->node, &sched->ring_mirror_list)) {
>                  if (!dma_fence_is_signaled(&next->s_fence->finished))
Since we now have only one delayed work item we can just drop the test
whether next is already signaled.
Can you elaborate more on this? Which test are you talking about?
I was talking about the "!dma_fence_is_signaled()" test here.
Regards,
Nayan
Regards,
Christian.
> -                        schedule_delayed_work(&next->work_tdr, sched->timeout);
> +                        schedule_delayed_work(&sched->work_tdr, sched->timeout);
>          }
>          /* remove job from ring_mirror_list */
>          list_del(&s_job->node);
Basically you could do this first, and then you only need to test whether
sched->ring_mirror_list is empty.
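I.e. the locked section of drm_sched_job_finish() would then boil down to
roughly this (untested sketch, with the dma_fence_is_signaled() check
dropped as mentioned above):

        spin_lock(&sched->job_list_lock);
        /* remove job from ring_mirror_list first ... */
        list_del(&s_job->node);
        /* ... then restart the TDR only if another job is still pending */
        if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
            !list_empty(&sched->ring_mirror_list))
                schedule_delayed_work(&sched->work_tdr, sched->timeout);
        spin_unlock(&sched->job_list_lock);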
Regards,
Christian.
> @@ -237,7 +237,7 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
>          if (list_first_entry_or_null(&sched->ring_mirror_list,
>                                       struct drm_sched_job, node) == s_job) {
>                  if (sched->timeout != MAX_SCHEDULE_TIMEOUT)
> -                        schedule_delayed_work(&s_job->work_tdr, sched->timeout);
> +                        schedule_delayed_work(&sched->work_tdr, sched->timeout);
>                  sched->curr_job = s_job;
>          }
>          spin_unlock(&sched->job_list_lock);
> @@ -245,8 +245,10 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
>  
>  static void drm_sched_job_timedout(struct work_struct *work)
>  {
> -        struct drm_sched_job *job = container_of(work, struct drm_sched_job,
> -                                                 work_tdr.work);
> +        struct drm_gpu_scheduler *sched = container_of(work,
> +                                                struct drm_gpu_scheduler,
> +                                                work_tdr.work);
> +        struct drm_sched_job *job = sched->curr_job;
>  
>          job->sched->ops->timedout_job(job);
>  }
> @@ -318,7 +320,7 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
>          s_job = list_first_entry_or_null(&sched->ring_mirror_list,
>                                           struct drm_sched_job, node);
>          if (s_job && sched->timeout != MAX_SCHEDULE_TIMEOUT)
> -                schedule_delayed_work(&s_job->work_tdr, sched->timeout);
> +                schedule_delayed_work(&sched->work_tdr, sched->timeout);
>          if (s_job)
>                  sched->curr_job = s_job;
>  
> @@ -389,7 +391,6 @@ int drm_sched_job_init(struct drm_sched_job *job,
>  
>          INIT_WORK(&job->finish_work, drm_sched_job_finish);
>          INIT_LIST_HEAD(&job->node);
> -        INIT_DELAYED_WORK(&job->work_tdr, drm_sched_job_timedout);
>  
>          return 0;
>  }
> @@ -580,6 +581,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>          INIT_LIST_HEAD(&sched->ring_mirror_list);
>          spin_lock_init(&sched->job_list_lock);
>          atomic_set(&sched->hw_rq_count, 0);
> +        INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
>          atomic_set(&sched->num_jobs, 0);
>          atomic64_set(&sched->job_id_count, 0);
>  
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 07e776b1ca42..9d50d7f3eaa4 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -175,8 +175,6 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
>   *               finished to remove the job from the
>   *               @drm_gpu_scheduler.ring_mirror_list.
>   * @node: used to append this struct to the @drm_gpu_scheduler.ring_mirror_list.
> - * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the timeout
> - *            interval is over.
>   * @id: a unique id assigned to each job scheduled on the scheduler.
>   * @karma: increment on every hang caused by this job. If this exceeds the hang
>   *         limit of the scheduler then the job is marked guilty and will not
> @@ -195,7 +193,6 @@ struct drm_sched_job {
>          struct dma_fence_cb             finish_cb;
>          struct work_struct              finish_work;
>          struct list_head                node;
> -        struct delayed_work             work_tdr;
>          uint64_t                        id;
>          atomic_t                        karma;
>          enum drm_sched_priority         s_priority;
> @@ -260,6 +257,8 @@ struct drm_sched_backend_ops {
>   *                 finished.
>   * @hw_rq_count: the number of jobs currently in the hardware queue.
>   * @job_id_count: used to assign unique id to the each job.
> + * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
> + *            timeout interval is over.
>   * @thread: the kthread on which the scheduler which run.
>   * @ring_mirror_list: the list of jobs which are currently in the job queue.
>   * @job_list_lock: lock to protect the ring_mirror_list.
> @@ -280,6 +279,7 @@ struct drm_gpu_scheduler {
>          wait_queue_head_t               job_scheduled;
>          atomic_t                        hw_rq_count;
>          atomic64_t                      job_id_count;
> +        struct delayed_work             work_tdr;
>          struct task_struct              *thread;
>          struct list_head                ring_mirror_list;
>          spinlock_t                      job_list_lock;