I understand you are saying that by the time the fence callback runs the
job might have already been released, but how so, when in the normal flow
the job is released from the drm_sched_job_finish work handler - i.e.
after the HW fence (s_fence->parent) callback has executed? The other two
flows are error cases where amdgpu_job_free is called directly, and in
those cases I assume the job was never submitted to the HW. The last flow
I see is drm_sched_entity_kill_jobs_cb, and here I actually do see a
problem with the code as it is today: drm_sched_fence_finished is called,
which triggers the s_fence->finished callback to run, which today
schedules drm_sched_job_finish - the work handler that releases the job -
yet we don't even wait for that and call the free_job callback directly
right after, which seems wrong to me.
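For reference, here is roughly the flow I am describing - a simplified
sketch of drm_sched_entity_kill_jobs_cb as I read today's code
(paraphrased from memory, not a verbatim copy):

static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
                                          struct dma_fence_cb *cb)
{
        struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
                                                 finish_cb);

        /* Signals s_fence->finished; its callback schedules
         * drm_sched_job_finish(), and that work handler already calls
         * sched->ops->free_job(job)...
         */
        drm_sched_fence_finished(job->s_fence);

        /* ...but we don't wait for that work and free the job again
         * right here, so the same job can end up freed twice.
         */
        job->sched->ops->free_job(job);
}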
Andrey

On 12/10/2018 09:45 PM, Zhou, David(ChunMing) wrote:
> I don't think adding a cb to the sched job will work, since its
> lifetime is different from the fence's. Unless you make the sched job
> reference counted, we will get into trouble sooner or later.
>
> -David
>
>> -----Original Message-----
>> From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of
>> Andrey Grodzovsky
>> Sent: Tuesday, December 11, 2018 5:44 AM
>> To: dri-devel@xxxxxxxxxxxxxxxxxxxxx; amd-gfx@xxxxxxxxxxxxxxxxxxxxx;
>> ckoenig.leichtzumerken@xxxxxxxxx; eric@xxxxxxxxxx;
>> etnaviv@xxxxxxxxxxxxxxxxxxxxx
>> Cc: Zhou, David(ChunMing) <David1.Zhou@xxxxxxx>; Liu, Monk
>> <Monk.Liu@xxxxxxx>; Grodzovsky, Andrey <Andrey.Grodzovsky@xxxxxxx>
>> Subject: [PATCH v3 2/2] drm/sched: Rework HW fence processing.
>>
>> Expedite job deletion from the ring mirror list to the HW fence signal
>> callback instead of from finish_work; together with waiting for all
>> such fences to signal in drm_sched_stop, we guarantee that an already
>> signaled job will not be processed twice.
>> Remove the sched finish fence callback and just submit finish_work
>> directly from the HW fence callback.
>>
>> v2: Fix comments.
>>
>> v3: Attach hw fence cb to sched_job
>>
>> Suggested-by: Christian Koenig <Christian.Koenig@xxxxxxx>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx>
>> ---
>>  drivers/gpu/drm/scheduler/sched_main.c | 58 ++++++++++++++++----------------
>>  include/drm/gpu_scheduler.h            |  6 ++--
>>  2 files changed, 30 insertions(+), 34 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>> index cdf95e2..f0c1f32 100644
>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>> @@ -284,8 +284,6 @@ static void drm_sched_job_finish(struct work_struct *work)
>>          cancel_delayed_work_sync(&sched->work_tdr);
>>
>>          spin_lock_irqsave(&sched->job_list_lock, flags);
>> -        /* remove job from ring_mirror_list */
>> -        list_del_init(&s_job->node);
>>          /* queue TDR for next job */
>>          drm_sched_start_timeout(sched);
>>          spin_unlock_irqrestore(&sched->job_list_lock, flags);
>> @@ -293,22 +291,11 @@ static void drm_sched_job_finish(struct work_struct *work)
>>          sched->ops->free_job(s_job);
>>  }
>>
>> -static void drm_sched_job_finish_cb(struct dma_fence *f,
>> -                                    struct dma_fence_cb *cb)
>> -{
>> -        struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
>> -                                                 finish_cb);
>> -        schedule_work(&job->finish_work);
>> -}
>> -
>>  static void drm_sched_job_begin(struct drm_sched_job *s_job)
>>  {
>>          struct drm_gpu_scheduler *sched = s_job->sched;
>>          unsigned long flags;
>>
>> -        dma_fence_add_callback(&s_job->s_fence->finished, &s_job->finish_cb,
>> -                               drm_sched_job_finish_cb);
>> -
>>          spin_lock_irqsave(&sched->job_list_lock, flags);
>>          list_add_tail(&s_job->node, &sched->ring_mirror_list);
>>          drm_sched_start_timeout(sched);
>> @@ -359,12 +346,11 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad,
>>          list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, node) {
>>                  if (s_job->s_fence->parent &&
>>                      dma_fence_remove_callback(s_job->s_fence->parent,
>> -                                              &s_job->s_fence->cb)) {
>> +                                              &s_job->cb)) {
>>                          dma_fence_put(s_job->s_fence->parent);
>>                          s_job->s_fence->parent = NULL;
>>                          atomic_dec(&sched->hw_rq_count);
>> -                }
>> -                else {
>> +                } else {
>>                          /* TODO Is it get/put neccessey here ? */
>>                          dma_fence_get(&s_job->s_fence->finished);
>>                          list_add(&s_job->finish_node, &wait_list);
>> @@ -417,31 +403,34 @@ EXPORT_SYMBOL(drm_sched_stop);
>>  void drm_sched_start(struct drm_gpu_scheduler *sched, bool unpark_only)
>>  {
>>          struct drm_sched_job *s_job, *tmp;
>> -        unsigned long flags;
>>          int r;
>>
>>          if (unpark_only)
>>                  goto unpark;
>>
>> -        spin_lock_irqsave(&sched->job_list_lock, flags);
>> +        /*
>> +         * Locking the list is not required here as the sched thread is parked
>> +         * so no new jobs are being pushed in to HW and in drm_sched_stop we
>> +         * flushed all the jobs who were still in mirror list but who already
>> +         * signaled and removed them self from the list. Also concurrent
>> +         * GPU recovers can't run in parallel.
>> +         */
>>          list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>> -                struct drm_sched_fence *s_fence = s_job->s_fence;
>>                  struct dma_fence *fence = s_job->s_fence->parent;
>>
>>                  if (fence) {
>> -                        r = dma_fence_add_callback(fence, &s_fence->cb,
>> +                        r = dma_fence_add_callback(fence, &s_job->cb,
>>                                                     drm_sched_process_job);
>>                          if (r == -ENOENT)
>> -                                drm_sched_process_job(fence, &s_fence->cb);
>> +                                drm_sched_process_job(fence, &s_job->cb);
>>                          else if (r)
>>                                  DRM_ERROR("fence add callback failed (%d)\n",
>>                                            r);
>>                  } else
>> -                        drm_sched_process_job(NULL, &s_fence->cb);
>> +                        drm_sched_process_job(NULL, &s_job->cb);
>>          }
>>
>>          drm_sched_start_timeout(sched);
>> -        spin_unlock_irqrestore(&sched->job_list_lock, flags);
>>
>>  unpark:
>>          kthread_unpark(sched->thread);
>> @@ -590,18 +579,27 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
>>   */
>>  static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb)
>>  {
>> -        struct drm_sched_fence *s_fence =
>> -                container_of(cb, struct drm_sched_fence, cb);
>> +        struct drm_sched_job *s_job = container_of(cb, struct drm_sched_job, cb);
>> +        struct drm_sched_fence *s_fence = s_job->s_fence;
>>          struct drm_gpu_scheduler *sched = s_fence->sched;
>> +        unsigned long flags;
>> +
>> +        cancel_delayed_work(&sched->work_tdr);
>>
>> -        dma_fence_get(&s_fence->finished);
>>          atomic_dec(&sched->hw_rq_count);
>>          atomic_dec(&sched->num_jobs);
>> +
>> +        spin_lock_irqsave(&sched->job_list_lock, flags);
>> +        /* remove job from ring_mirror_list */
>> +        list_del_init(&s_job->node);
>> +        spin_unlock_irqrestore(&sched->job_list_lock, flags);
>> +
>>          drm_sched_fence_finished(s_fence);
>>
>>          trace_drm_sched_process_job(s_fence);
>> -        dma_fence_put(&s_fence->finished);
>>          wake_up_interruptible(&sched->wake_up_worker);
>> +
>> +        schedule_work(&s_job->finish_work);
>>  }
>>
>>  /**
>> @@ -664,16 +662,16 @@ static int drm_sched_main(void *param)
>>
>>                  if (fence) {
>>                          s_fence->parent = dma_fence_get(fence);
>> -                        r = dma_fence_add_callback(fence, &s_fence->cb,
>> +                        r = dma_fence_add_callback(fence, &sched_job->cb,
>>                                                     drm_sched_process_job);
>>                          if (r == -ENOENT)
>> -                                drm_sched_process_job(fence, &s_fence->cb);
>> +                                drm_sched_process_job(fence, &sched_job->cb);
>>                          else if (r)
>>                                  DRM_ERROR("fence add callback failed (%d)\n",
>>                                            r);
>>                          dma_fence_put(fence);
>>                  } else
>> -                        drm_sched_process_job(NULL, &s_fence->cb);
>> +                        drm_sched_process_job(NULL, &sched_job->cb);
>>
>>                  wake_up(&sched->job_scheduled);
>>          }
>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>> index c94b592..f29aa1c 100644
>> --- a/include/drm/gpu_scheduler.h
>> +++ b/include/drm/gpu_scheduler.h
>> @@ -138,10 +138,6 @@ struct drm_sched_fence {
>>          struct dma_fence                finished;
>>
>>          /**
>> -         * @cb: the callback for the parent fence below.
>> -         */
>> -        struct dma_fence_cb             cb;
>> -        /**
>>           * @parent: the fence returned by &drm_sched_backend_ops.run_job
>>           * when scheduling the job on hardware. We signal the
>>           * &drm_sched_fence.finished fence once parent is signalled.
>> @@ -182,6 +178,7 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
>>   * be scheduled further.
>>   * @s_priority: the priority of the job.
>>   * @entity: the entity to which this job belongs.
>> + * @cb: the callback for the parent fence in s_fence.
>>   *
>>   * A job is created by the driver using drm_sched_job_init(), and
>>   * should call drm_sched_entity_push_job() once it wants the scheduler
>> @@ -199,6 +196,7 @@ struct drm_sched_job {
>>          atomic_t                        karma;
>>          enum drm_sched_priority         s_priority;
>>          struct drm_sched_entity         *entity;
>> +        struct dma_fence_cb             cb;
>>  };
>>
>>  static inline bool drm_sched_invalidate_job(struct drm_sched_job *s_job,
>> --
>> 2.7.4
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@xxxxxxxxxxxxxxxxxxxxx
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx