On 12/06/2018 01:33 PM, Christian König wrote: > Am 06.12.18 um 18:41 schrieb Andrey Grodzovsky: >> Decauple sched threads stop and start and ring mirror >> list handling from the policy of what to do about the >> guilty jobs. >> When stoppping the sched thread and detaching sched fences >> from non signaled HW fenes wait for all signaled HW fences >> to complete before rerunning the jobs. >> >> Suggested-by: Christian Koenig <Christian.Koenig@xxxxxxx> >> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx> > > Just briefly skimmed over this, but it looks exactly like what I had > in mind. > > Need to give that a more detailed thought tomorrow, > Christian. Please note I've already resent V2 after finding refactoring error. Andrey > >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--- >> drivers/gpu/drm/etnaviv/etnaviv_sched.c | 8 +-- >> drivers/gpu/drm/scheduler/sched_main.c | 86 >> +++++++++++++++++++----------- >> drivers/gpu/drm/v3d/v3d_sched.c | 11 ++-- >> include/drm/gpu_scheduler.h | 10 ++-- >> 5 files changed, 83 insertions(+), 49 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index ef36cc5..42111d5 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -3292,17 +3292,16 @@ static int >> amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, >> /* block all schedulers and reset given job's ring */ >> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { >> struct amdgpu_ring *ring = adev->rings[i]; >> + bool park_only = job && job->base.sched != &ring->sched; >> if (!ring || !ring->sched.thread) >> continue; >> - kthread_park(ring->sched.thread); >> + drm_sched_stop(&ring->sched, job ? &job->base : NULL, >> park_only); >> - if (job && job->base.sched != &ring->sched) >> + if (park_only) >> continue; >> - drm_sched_hw_job_reset(&ring->sched, job ? &job->base : >> NULL); >> - >> /* after all hw jobs are reset, hw fence is meaningless, so >> force_completion */ >> amdgpu_fence_driver_force_completion(ring); >> } >> @@ -3445,6 +3444,7 @@ static void >> amdgpu_device_post_asic_reset(struct amdgpu_device *adev, >> struct amdgpu_job *job) >> { >> int i; >> + bool unpark_only; >> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { >> struct amdgpu_ring *ring = adev->rings[i]; >> @@ -3456,10 +3456,13 @@ static void >> amdgpu_device_post_asic_reset(struct amdgpu_device *adev, >> * or all rings (in the case @job is NULL) >> * after above amdgpu_reset accomplished >> */ >> - if ((!job || job->base.sched == &ring->sched) && >> !adev->asic_reset_res) >> - drm_sched_job_recovery(&ring->sched); >> + unpark_only = (job && job->base.sched != &ring->sched) || >> + adev->asic_reset_res; >> + >> + if (!unpark_only) >> + drm_sched_resubmit_jobs(&ring->sched); >> - kthread_unpark(ring->sched.thread); >> + drm_sched_start(&ring->sched, unpark_only); >> } >> if (!amdgpu_device_has_dc_support(adev)) { >> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c >> b/drivers/gpu/drm/etnaviv/etnaviv_sched.c >> index 49a6763..fab3b51 100644 >> --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c >> +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c >> @@ -109,16 +109,16 @@ static void etnaviv_sched_timedout_job(struct >> drm_sched_job *sched_job) >> } >> /* block scheduler */ >> - kthread_park(gpu->sched.thread); >> - drm_sched_hw_job_reset(&gpu->sched, sched_job); >> + drm_sched_stop(&gpu->sched, sched_job, false); >> /* get the GPU back into the init state */ >> etnaviv_core_dump(gpu); >> etnaviv_gpu_recover_hang(gpu); >> + drm_sched_resubmit_jobs(&gpu->sched); >> + >> /* restart scheduler after GPU is usable again */ >> - drm_sched_job_recovery(&gpu->sched); >> - kthread_unpark(gpu->sched.thread); >> + drm_sched_start(&gpu->sched); >> } >> static void etnaviv_sched_free_job(struct drm_sched_job *sched_job) >> diff --git a/drivers/gpu/drm/scheduler/sched_main.c >> b/drivers/gpu/drm/scheduler/sched_main.c >> index dbb6906..8fb7f86 100644 >> --- a/drivers/gpu/drm/scheduler/sched_main.c >> +++ b/drivers/gpu/drm/scheduler/sched_main.c >> @@ -60,8 +60,6 @@ >> static void drm_sched_process_job(struct dma_fence *f, struct >> dma_fence_cb *cb); >> -static void drm_sched_expel_job_unlocked(struct drm_sched_job >> *s_job); >> - >> /** >> * drm_sched_rq_init - initialize a given run queue struct >> * >> @@ -342,13 +340,21 @@ static void drm_sched_job_timedout(struct >> work_struct *work) >> * @bad: bad scheduler job >> * >> */ >> -void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, struct >> drm_sched_job *bad) >> +void drm_sched_stop(struct drm_gpu_scheduler *sched, struct >> drm_sched_job *bad, >> + bool park_only) >> { >> struct drm_sched_job *s_job; >> struct drm_sched_entity *entity, *tmp; >> unsigned long flags; >> + struct list_head wait_list; >> int i; >> + kthread_park(sched->thread); >> + if (park_only) >> + return; >> + >> + INIT_LIST_HEAD(&wait_list); >> + >> spin_lock_irqsave(&sched->job_list_lock, flags); >> list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, >> node) { >> if (s_job->s_fence->parent && >> @@ -358,9 +364,24 @@ void drm_sched_hw_job_reset(struct >> drm_gpu_scheduler *sched, struct drm_sched_jo >> s_job->s_fence->parent = NULL; >> atomic_dec(&sched->hw_rq_count); >> } >> + else { >> + /* TODO Is it get/put neccessey here ? */ >> + dma_fence_get(&s_job->s_fence->finished); >> + list_add(&s_job->finish_node, &wait_list); >> + } >> } >> spin_unlock_irqrestore(&sched->job_list_lock, flags); >> + /* >> + * Verify all the signaled jobs in mirror list are removed from >> the ring >> + * We rely on the fact that any finish_work in progress will >> wait for this >> + * handler to complete before releasing all of the jobs we iterate. >> + */ >> + list_for_each_entry(s_job, &wait_list, finish_node) { >> + dma_fence_wait(&s_job->s_fence->finished, false); >> + dma_fence_put(&s_job->s_fence->finished); >> + } >> + >> if (bad && bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) { >> atomic_inc(&bad->karma); >> /* don't increase @bad's karma if it's from KERNEL RQ, >> @@ -385,7 +406,7 @@ void drm_sched_hw_job_reset(struct >> drm_gpu_scheduler *sched, struct drm_sched_jo >> } >> } >> } >> -EXPORT_SYMBOL(drm_sched_hw_job_reset); >> +EXPORT_SYMBOL(drm_sched_stop); >> /** >> * drm_sched_job_recovery - recover jobs after a reset >> @@ -393,14 +414,17 @@ EXPORT_SYMBOL(drm_sched_hw_job_reset); >> * @sched: scheduler instance >> * >> */ >> -void drm_sched_job_recovery(struct drm_gpu_scheduler *sched) >> +void drm_sched_start(struct drm_gpu_scheduler *sched, bool unpark_only) >> { >> struct drm_sched_job *s_job, *tmp; >> bool found_guilty = false; >> unsigned long flags; >> int r; >> - spin_lock_irqsave(&sched->job_list_lock, flags); >> + if (unpark_only) >> + goto unpark; >> + >> + spin_lock_irqsave(&sched->job_list_lock, flags); >> list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, >> node) { >> struct drm_sched_fence *s_fence = s_job->s_fence; >> struct dma_fence *fence; >> @@ -414,12 +438,9 @@ void drm_sched_job_recovery(struct >> drm_gpu_scheduler *sched) >> if (found_guilty && s_job->s_fence->scheduled.context == >> guilty_context) >> dma_fence_set_error(&s_fence->finished, -ECANCELED); >> - spin_unlock_irqrestore(&sched->job_list_lock, flags); >> - fence = sched->ops->run_job(s_job); >> - atomic_inc(&sched->hw_rq_count); >> + fence = s_job->s_fence->parent; >> if (fence) { >> - s_fence->parent = dma_fence_get(fence); >> r = dma_fence_add_callback(fence, &s_fence->cb, >> drm_sched_process_job); >> if (r == -ENOENT) >> @@ -427,18 +448,35 @@ void drm_sched_job_recovery(struct >> drm_gpu_scheduler *sched) >> else if (r) >> DRM_ERROR("fence add callback failed (%d)\n", >> r); >> - dma_fence_put(fence); >> - } else { >> - if (s_fence->finished.error < 0) >> - drm_sched_expel_job_unlocked(s_job); >> + } else >> drm_sched_process_job(NULL, &s_fence->cb); >> - } >> - spin_lock_irqsave(&sched->job_list_lock, flags); >> } >> + >> drm_sched_start_timeout(sched); >> spin_unlock_irqrestore(&sched->job_list_lock, flags); >> + >> +unpark: >> + kthread_unpark(sched->thread); >> } >> -EXPORT_SYMBOL(drm_sched_job_recovery); >> +EXPORT_SYMBOL(drm_sched_start); >> + >> +/** >> + * drm_sched_resubmit_jobs - helper to relunch job from mirror ring >> list >> + * >> + * @sched: scheduler instance >> + * >> + */ >> +void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched) >> +{ >> + struct drm_sched_job *s_job, *tmp; >> + >> + /*TODO DO we need spinlock here ? */ >> + list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, >> node) { >> + s_job->s_fence->parent = sched->ops->run_job(s_job); >> + atomic_inc(&sched->hw_rq_count); >> + } >> +} >> +EXPORT_SYMBOL(drm_sched_resubmit_jobs); >> /** >> * drm_sched_job_init - init a scheduler job >> @@ -634,26 +672,14 @@ static int drm_sched_main(void *param) >> DRM_ERROR("fence add callback failed (%d)\n", >> r); >> dma_fence_put(fence); >> - } else { >> - if (s_fence->finished.error < 0) >> - drm_sched_expel_job_unlocked(sched_job); >> + } else >> drm_sched_process_job(NULL, &s_fence->cb); >> - } >> wake_up(&sched->job_scheduled); >> } >> return 0; >> } >> -static void drm_sched_expel_job_unlocked(struct drm_sched_job *s_job) >> -{ >> - struct drm_gpu_scheduler *sched = s_job->sched; >> - >> - spin_lock(&sched->job_list_lock); >> - list_del_init(&s_job->node); >> - spin_unlock(&sched->job_list_lock); >> -} >> - >> /** >> * drm_sched_init - Init a gpu scheduler instance >> * >> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c >> b/drivers/gpu/drm/v3d/v3d_sched.c >> index 445b2ef..f99346a 100644 >> --- a/drivers/gpu/drm/v3d/v3d_sched.c >> +++ b/drivers/gpu/drm/v3d/v3d_sched.c >> @@ -178,18 +178,19 @@ v3d_job_timedout(struct drm_sched_job *sched_job) >> for (q = 0; q < V3D_MAX_QUEUES; q++) { >> struct drm_gpu_scheduler *sched = &v3d->queue[q].sched; >> - kthread_park(sched->thread); >> - drm_sched_hw_job_reset(sched, (sched_job->sched == sched ? >> - sched_job : NULL)); >> + drm_sched_stop(sched, (sched_job->sched == sched ? >> + sched_job : NULL), false); >> } >> /* get the GPU back into the init state */ >> v3d_reset(v3d); >> + for (q = 0; q < V3D_MAX_QUEUES; q++) >> + drm_sched_resubmit_jobs(sched_job->sched); >> + >> /* Unblock schedulers and restart their jobs. */ >> for (q = 0; q < V3D_MAX_QUEUES; q++) { >> - drm_sched_job_recovery(&v3d->queue[q].sched); >> - kthread_unpark(v3d->queue[q].sched.thread); >> + drm_sched_start(&v3d->queue[q].sched, false); >> } >> mutex_unlock(&v3d->reset_lock); >> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h >> index 47e1979..c94b592 100644 >> --- a/include/drm/gpu_scheduler.h >> +++ b/include/drm/gpu_scheduler.h >> @@ -175,6 +175,7 @@ struct drm_sched_fence *to_drm_sched_fence(struct >> dma_fence *f); >> * finished to remove the job from the >> * @drm_gpu_scheduler.ring_mirror_list. >> * @node: used to append this struct to the >> @drm_gpu_scheduler.ring_mirror_list. >> + * @finish_node: used in a list to wait on before resetting the >> scheduler >> * @id: a unique id assigned to each job scheduled on the scheduler. >> * @karma: increment on every hang caused by this job. If this >> exceeds the hang >> * limit of the scheduler then the job is marked guilty and >> will not >> @@ -193,6 +194,7 @@ struct drm_sched_job { >> struct dma_fence_cb finish_cb; >> struct work_struct finish_work; >> struct list_head node; >> + struct list_head finish_node; >> uint64_t id; >> atomic_t karma; >> enum drm_sched_priority s_priority; >> @@ -298,9 +300,11 @@ int drm_sched_job_init(struct drm_sched_job *job, >> void *owner); >> void drm_sched_job_cleanup(struct drm_sched_job *job); >> void drm_sched_wakeup(struct drm_gpu_scheduler *sched); >> -void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, >> - struct drm_sched_job *job); >> -void drm_sched_job_recovery(struct drm_gpu_scheduler *sched); >> +void drm_sched_stop(struct drm_gpu_scheduler *sched, >> + struct drm_sched_job *job, >> + bool park_only); >> +void drm_sched_start(struct drm_gpu_scheduler *sched, bool >> unpark_only); >> +void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched); >> bool drm_sched_dependency_optimized(struct dma_fence* fence, >> struct drm_sched_entity *entity); >> void drm_sched_fault(struct drm_gpu_scheduler *sched); > _______________________________________________ dri-devel mailing list dri-devel@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/dri-devel