On 12/19/2018 11:21 AM, Christian König wrote: > Am 17.12.18 um 20:51 schrieb Andrey Grodzovsky: >> Decouple sched threads stop and start and ring mirror >> list handling from the policy of what to do about the >> guilty jobs. >> When stopping the sched thread and detaching sched fences >> from non-signaled HW fences, wait for all signaled HW fences >> to complete before rerunning the jobs. >> >> v2: Fix resubmission of guilty job into HW after refactoring. >> >> v4: >> Full restart for all the jobs, not only from guilty ring. >> Extract karma increase into standalone function. >> >> Suggested-by: Christian Koenig <Christian.Koenig@xxxxxxx> >> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +-- >> drivers/gpu/drm/etnaviv/etnaviv_sched.c | 8 +- >> drivers/gpu/drm/scheduler/sched_main.c | 164 >> +++++++++++++++++------------ >> drivers/gpu/drm/v3d/v3d_sched.c | 9 +- >> include/drm/gpu_scheduler.h | 9 +- >> 5 files changed, 118 insertions(+), 89 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index 8a078f4..8ac4f43 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -3298,12 +3298,7 @@ static int amdgpu_device_pre_asic_reset(struct >> amdgpu_device *adev, >> if (!ring || !ring->sched.thread) >> continue; >> - kthread_park(ring->sched.thread); >> - >> - if (job && job->base.sched != &ring->sched) >> - continue; >> - >> - drm_sched_hw_job_reset(&ring->sched, job ? &job->base : NULL); >> + drm_sched_stop(&ring->sched, job ? &job->base : NULL); >> /* after all hw jobs are reset, hw fence is meaningless, >> so force_completion */ >> amdgpu_fence_driver_force_completion(ring); >> @@ -3454,14 +3449,10 @@ static void >> amdgpu_device_post_asic_reset(struct amdgpu_device *adev, >> if (!ring || !ring->sched.thread) >> continue; >> - /* only need recovery sched of the given job's ring >> - * or all rings (in the case @job is NULL) >> - * after above amdgpu_reset accomplished >> - */ >> - if ((!job || job->base.sched == &ring->sched) && >> !adev->asic_reset_res) >> - drm_sched_job_recovery(&ring->sched); >> + if (!adev->asic_reset_res) >> + drm_sched_resubmit_jobs(&ring->sched); >> - kthread_unpark(ring->sched.thread); >> + drm_sched_start(&ring->sched, !adev->asic_reset_res); >> } >> if (!amdgpu_device_has_dc_support(adev)) { >> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c >> b/drivers/gpu/drm/etnaviv/etnaviv_sched.c >> index 49a6763..d7075cd 100644 >> --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c >> +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c >> @@ -109,16 +109,16 @@ static void etnaviv_sched_timedout_job(struct >> drm_sched_job *sched_job) >> } >> /* block scheduler */ >> - kthread_park(gpu->sched.thread); >> - drm_sched_hw_job_reset(&gpu->sched, sched_job); >> + drm_sched_stop(&gpu->sched, sched_job); >> /* get the GPU back into the init state */ >> etnaviv_core_dump(gpu); >> etnaviv_gpu_recover_hang(gpu); >> + drm_sched_resubmit_jobs(&gpu->sched); >> + >> /* restart scheduler after GPU is usable again */ >> - drm_sched_job_recovery(&gpu->sched); >> - kthread_unpark(gpu->sched.thread); >> + drm_sched_start(&gpu->sched, true); >> } >> static void etnaviv_sched_free_job(struct drm_sched_job *sched_job) >> diff --git a/drivers/gpu/drm/scheduler/sched_main.c >> b/drivers/gpu/drm/scheduler/sched_main.c >> index dbb6906..1cf9541 100644 >> --- a/drivers/gpu/drm/scheduler/sched_main.c >>
+++ b/drivers/gpu/drm/scheduler/sched_main.c >> @@ -60,8 +60,6 @@ >> static void drm_sched_process_job(struct dma_fence *f, struct >> dma_fence_cb *cb); >> -static void drm_sched_expel_job_unlocked(struct drm_sched_job >> *s_job); >> - >> /** >> * drm_sched_rq_init - initialize a given run queue struct >> * >> @@ -335,6 +333,41 @@ static void drm_sched_job_timedout(struct >> work_struct *work) >> spin_unlock_irqrestore(&sched->job_list_lock, flags); >> } >> +static void drm_sched_increase_karma(struct drm_sched_job *bad) >> +{ >> + int i; >> + struct drm_sched_entity *tmp; >> + struct drm_sched_entity *entity; >> + struct drm_gpu_scheduler *sched = bad->sched; >> + >> + /* don't increase @bad's karma if it's from KERNEL RQ, >> + * because sometimes GPU hang would cause kernel jobs (like VM >> updating jobs) >> + * corrupt but keep in mind that kernel jobs always considered >> good. >> + */ >> + if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) { >> + atomic_inc(&bad->karma); >> + for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL; >> + i++) { >> + struct drm_sched_rq *rq = &sched->sched_rq[i]; >> + >> + spin_lock(&rq->lock); >> + list_for_each_entry_safe(entity, tmp, &rq->entities, >> list) { >> + if (bad->s_fence->scheduled.context == >> + entity->fence_context) { >> + if (atomic_read(&bad->karma) > >> + bad->sched->hang_limit) >> + if (entity->guilty) >> + atomic_set(entity->guilty, 1); >> + break; >> + } >> + } >> + spin_unlock(&rq->lock); >> + if (&entity->list != &rq->entities) >> + break; >> + } >> + } >> +} >> + >> /** >> * drm_sched_hw_job_reset - stop the scheduler if it contains the >> bad job >> * >> @@ -342,12 +375,15 @@ static void drm_sched_job_timedout(struct >> work_struct *work) >> * @bad: bad scheduler job >> * >> */ >> -void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, struct >> drm_sched_job *bad) >> +void drm_sched_stop(struct drm_gpu_scheduler *sched, struct >> drm_sched_job *bad) >> { >> struct drm_sched_job *s_job; >> - struct drm_sched_entity *entity, *tmp; >> unsigned long flags; >> - int i; >> + struct list_head wait_list; >> + >> + kthread_park(sched->thread); >> + >> + INIT_LIST_HEAD(&wait_list); >> spin_lock_irqsave(&sched->job_list_lock, flags); >> list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, >> node) { >> @@ -357,35 +393,28 @@ void drm_sched_hw_job_reset(struct >> drm_gpu_scheduler *sched, struct drm_sched_jo >> dma_fence_put(s_job->s_fence->parent); >> s_job->s_fence->parent = NULL; >> atomic_dec(&sched->hw_rq_count); >> + } else { >> + /* TODO Is it get/put necessary here ? */ >> + dma_fence_get(&s_job->s_fence->finished); >> + list_add(&s_job->finish_node, &wait_list); >> } >> } >> spin_unlock_irqrestore(&sched->job_list_lock, flags); >> - if (bad && bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) { >> - atomic_inc(&bad->karma); >> - /* don't increase @bad's karma if it's from KERNEL RQ, >> - * becuase sometimes GPU hang would cause kernel jobs (like >> VM updating jobs) >> - * corrupt but keep in mind that kernel jobs always >> considered good.
>> - */ >> - for (i = DRM_SCHED_PRIORITY_MIN; i < >> DRM_SCHED_PRIORITY_KERNEL; i++ ) { >> - struct drm_sched_rq *rq = &sched->sched_rq[i]; >> - >> - spin_lock(&rq->lock); >> - list_for_each_entry_safe(entity, tmp, &rq->entities, >> list) { >> - if (bad->s_fence->scheduled.context == >> entity->fence_context) { >> - if (atomic_read(&bad->karma) > >> bad->sched->hang_limit) >> - if (entity->guilty) >> - atomic_set(entity->guilty, 1); >> - break; >> - } >> - } >> - spin_unlock(&rq->lock); >> - if (&entity->list != &rq->entities) >> - break; >> - } >> + /* >> + * Verify all the signaled jobs in mirror list are removed from >> the ring >> + * We rely on the fact that any finish_work in progress will >> wait for this >> + * handler to complete before releasing all of the jobs we iterate. >> + */ >> + list_for_each_entry(s_job, &wait_list, finish_node) { >> + dma_fence_wait(&s_job->s_fence->finished, false); >> + dma_fence_put(&s_job->s_fence->finished); >> } > > Mhm, thinking more about it, that approach won't work. > > Only the scheduler instance with the timeout will block on the GPU > reset to finish before freeing up s_job. > > The job_list_lock will protect us, but as soon as that is dropped we > are potentially working with freed up jobs here. > > Maybe your idea of just waiting for the last finished fence to signal > wasn't so bad after all. This way you only need to grab the last one > and wait for it. I was concerned about relying on the assumption that if the last job (its fence, actually) in ring_mirror_list is signaled then all the previous jobs/fences in the list have necessarily signaled as well - does the order of submission into the HW ring (same as the order of insertion into ring_mirror_list) imply the same order of job/fence completion ? I know we talked about it already, but I am still not sure this is ALWAYS true. What about, instead of using the jobs as elements of the wait_list, we just locally and dynamically allocate something like struct fence_cont { struct list_head node; struct dma_fence *finished; } and then iterate that list and wait for each fence to signal, without worrying about the jobs being released in the meantime ? (a rough sketch follows below, after the rest of Christian's comments) Andrey > > And since fences are reference counted we also don't run into any > problems because of that. > >> + >> + if (bad) >> + drm_sched_increase_karma(bad); > > Better export drm_sched_increase_karma() and let the drivers call that > implicitly. > > Regards, > Christian.
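To make the fence_cont idea above a bit more concrete, here is roughly what I mean for the stop path (untested sketch only - fence_cont, fc and wait_list are just illustration names, and the allocation failure handling is simplified):

struct fence_cont {
	struct list_head node;
	struct dma_fence *finished;
};

void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
{
	struct drm_sched_job *s_job;
	struct fence_cont *fc, *tmp;
	unsigned long flags;
	LIST_HEAD(wait_list);

	kthread_park(sched->thread);

	spin_lock_irqsave(&sched->job_list_lock, flags);
	list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, node) {
		if (s_job->s_fence->parent &&
		    dma_fence_remove_callback(s_job->s_fence->parent,
					      &s_job->s_fence->cb)) {
			dma_fence_put(s_job->s_fence->parent);
			s_job->s_fence->parent = NULL;
			atomic_dec(&sched->hw_rq_count);
		} else {
			/* HW fence already signaled or is about to - remember
			 * only the reference counted finished fence so we never
			 * touch s_job again after dropping the lock.
			 */
			fc = kmalloc(sizeof(*fc), GFP_ATOMIC);
			if (!fc)
				continue; /* simplified, real code needs a fallback */
			fc->finished = dma_fence_get(&s_job->s_fence->finished);
			list_add(&fc->node, &wait_list);
		}
	}
	spin_unlock_irqrestore(&sched->job_list_lock, flags);

	/* Wait on the collected fences, not on the jobs, so a concurrent
	 * free of the jobs by their finish_work is harmless.
	 */
	list_for_each_entry_safe(fc, tmp, &wait_list, node) {
		dma_fence_wait(fc->finished, false);
		dma_fence_put(fc->finished);
		list_del(&fc->node);
		kfree(fc);
	}

	if (bad)
		drm_sched_increase_karma(bad);
}

That would also make the new finish_node member in struct drm_sched_job unnecessary.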
> >> } >> -EXPORT_SYMBOL(drm_sched_hw_job_reset); >> +EXPORT_SYMBOL(drm_sched_stop); >> /** >> * drm_sched_job_recovery - recover jobs after a reset >> @@ -393,33 +422,21 @@ EXPORT_SYMBOL(drm_sched_hw_job_reset); >> * @sched: scheduler instance >> * >> */ >> -void drm_sched_job_recovery(struct drm_gpu_scheduler *sched) >> +void drm_sched_start(struct drm_gpu_scheduler *sched, bool >> full_recovery) >> { >> struct drm_sched_job *s_job, *tmp; >> - bool found_guilty = false; >> unsigned long flags; >> int r; >> + if (!full_recovery) >> + goto unpark; >> + >> spin_lock_irqsave(&sched->job_list_lock, flags); >> list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, >> node) { >> struct drm_sched_fence *s_fence = s_job->s_fence; >> - struct dma_fence *fence; >> - uint64_t guilty_context; >> - >> - if (!found_guilty && atomic_read(&s_job->karma) > >> sched->hang_limit) { >> - found_guilty = true; >> - guilty_context = s_job->s_fence->scheduled.context; >> - } >> - >> - if (found_guilty && s_job->s_fence->scheduled.context == >> guilty_context) >> - dma_fence_set_error(&s_fence->finished, -ECANCELED); >> - >> - spin_unlock_irqrestore(&sched->job_list_lock, flags); >> - fence = sched->ops->run_job(s_job); >> - atomic_inc(&sched->hw_rq_count); >> + struct dma_fence *fence = s_job->s_fence->parent; >> if (fence) { >> - s_fence->parent = dma_fence_get(fence); >> r = dma_fence_add_callback(fence, &s_fence->cb, >> drm_sched_process_job); >> if (r == -ENOENT) >> @@ -427,18 +444,47 @@ void drm_sched_job_recovery(struct >> drm_gpu_scheduler *sched) >> else if (r) >> DRM_ERROR("fence add callback failed (%d)\n", >> r); >> - dma_fence_put(fence); >> - } else { >> - if (s_fence->finished.error < 0) >> - drm_sched_expel_job_unlocked(s_job); >> + } else >> drm_sched_process_job(NULL, &s_fence->cb); >> - } >> - spin_lock_irqsave(&sched->job_list_lock, flags); >> } >> + >> drm_sched_start_timeout(sched); >> spin_unlock_irqrestore(&sched->job_list_lock, flags); >> + >> +unpark: >> + kthread_unpark(sched->thread); >> } >> -EXPORT_SYMBOL(drm_sched_job_recovery); >> +EXPORT_SYMBOL(drm_sched_start); >> + >> +/** >> + * drm_sched_resubmit_jobs - helper to relaunch jobs from the ring mirror >> list >> + * >> + * @sched: scheduler instance >> + * >> + */ >> +void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched) >> +{ >> + struct drm_sched_job *s_job, *tmp; >> + uint64_t guilty_context; >> + bool found_guilty = false; >> + >> + /*TODO DO we need spinlock here ?
*/ >> + list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, >> node) { >> + struct drm_sched_fence *s_fence = s_job->s_fence; >> + >> + if (!found_guilty && atomic_read(&s_job->karma) > >> sched->hang_limit) { >> + found_guilty = true; >> + guilty_context = s_job->s_fence->scheduled.context; >> + } >> + >> + if (found_guilty && s_job->s_fence->scheduled.context == >> guilty_context) >> + dma_fence_set_error(&s_fence->finished, -ECANCELED); >> + >> + s_job->s_fence->parent = sched->ops->run_job(s_job); >> + atomic_inc(&sched->hw_rq_count); >> + } >> +} >> +EXPORT_SYMBOL(drm_sched_resubmit_jobs); >> /** >> * drm_sched_job_init - init a scheduler job >> @@ -634,26 +680,14 @@ static int drm_sched_main(void *param) >> DRM_ERROR("fence add callback failed (%d)\n", >> r); >> dma_fence_put(fence); >> - } else { >> - if (s_fence->finished.error < 0) >> - drm_sched_expel_job_unlocked(sched_job); >> + } else >> drm_sched_process_job(NULL, &s_fence->cb); >> - } >> wake_up(&sched->job_scheduled); >> } >> return 0; >> } >> -static void drm_sched_expel_job_unlocked(struct drm_sched_job *s_job) >> -{ >> - struct drm_gpu_scheduler *sched = s_job->sched; >> - >> - spin_lock(&sched->job_list_lock); >> - list_del_init(&s_job->node); >> - spin_unlock(&sched->job_list_lock); >> -} >> - >> /** >> * drm_sched_init - Init a gpu scheduler instance >> * >> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c >> b/drivers/gpu/drm/v3d/v3d_sched.c >> index 445b2ef..aaa10b0 100644 >> --- a/drivers/gpu/drm/v3d/v3d_sched.c >> +++ b/drivers/gpu/drm/v3d/v3d_sched.c >> @@ -178,18 +178,19 @@ v3d_job_timedout(struct drm_sched_job *sched_job) >> for (q = 0; q < V3D_MAX_QUEUES; q++) { >> struct drm_gpu_scheduler *sched = &v3d->queue[q].sched; >> - kthread_park(sched->thread); >> - drm_sched_hw_job_reset(sched, (sched_job->sched == sched ? >> + drm_sched_stop(sched, (sched_job->sched == sched ? >> sched_job : NULL)); >> } >> /* get the GPU back into the init state */ >> v3d_reset(v3d); >> + for (q = 0; q < V3D_MAX_QUEUES; q++) >> + drm_sched_resubmit_jobs(sched_job->sched); >> + >> /* Unblock schedulers and restart their jobs. */ >> for (q = 0; q < V3D_MAX_QUEUES; q++) { >> - drm_sched_job_recovery(&v3d->queue[q].sched); >> - kthread_unpark(v3d->queue[q].sched.thread); >> + drm_sched_start(&v3d->queue[q].sched, true); >> } >> mutex_unlock(&v3d->reset_lock); >> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h >> index 47e1979..441384c 100644 >> --- a/include/drm/gpu_scheduler.h >> +++ b/include/drm/gpu_scheduler.h >> @@ -175,6 +175,7 @@ struct drm_sched_fence *to_drm_sched_fence(struct >> dma_fence *f); >> * finished to remove the job from the >> * @drm_gpu_scheduler.ring_mirror_list. >> * @node: used to append this struct to the >> @drm_gpu_scheduler.ring_mirror_list. >> + * @finish_node: used in a list to wait on before resetting the >> scheduler >> * @id: a unique id assigned to each job scheduled on the scheduler. >> * @karma: increment on every hang caused by this job. 
If this >> exceeds the hang >> * limit of the scheduler then the job is marked guilty and >> will not >> @@ -193,6 +194,7 @@ struct drm_sched_job { >> struct dma_fence_cb finish_cb; >> struct work_struct finish_work; >> struct list_head node; >> + struct list_head finish_node; >> uint64_t id; >> atomic_t karma; >> enum drm_sched_priority s_priority; >> @@ -298,9 +300,10 @@ int drm_sched_job_init(struct drm_sched_job *job, >> void *owner); >> void drm_sched_job_cleanup(struct drm_sched_job *job); >> void drm_sched_wakeup(struct drm_gpu_scheduler *sched); >> -void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, >> - struct drm_sched_job *job); >> -void drm_sched_job_recovery(struct drm_gpu_scheduler *sched); >> +void drm_sched_stop(struct drm_gpu_scheduler *sched, >> + struct drm_sched_job *job); >> +void drm_sched_start(struct drm_gpu_scheduler *sched, bool >> full_recovery); >> +void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched); >> bool drm_sched_dependency_optimized(struct dma_fence* fence, >> struct drm_sched_entity *entity); >> void drm_sched_fault(struct drm_gpu_scheduler *sched);
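For reference, with this patch the driver side of a timeout/reset ends up following roughly the sequence below. This just restates the etnaviv hunk above as a generic example - example_timedout_job and the reset step are placeholders, amdgpu and v3d follow the same pattern looped over their rings/queues:

#include <drm/gpu_scheduler.h>

static void example_timedout_job(struct drm_sched_job *sched_job)
{
	struct drm_gpu_scheduler *sched = sched_job->sched;

	/* Park the scheduler thread, detach scheduler fences from not yet
	 * signaled HW fences and wait for the already signaled jobs.
	 */
	drm_sched_stop(sched, sched_job);

	/* Driver specific HW recovery goes here, e.g.
	 * etnaviv_gpu_recover_hang() or the amdgpu ASIC reset path.
	 */

	/* Push the pending jobs from the ring mirror list back to the HW
	 * ring(s); jobs from the guilty context get -ECANCELED set on
	 * their finished fence.
	 */
	drm_sched_resubmit_jobs(sched);

	/* Re-add the HW fence callbacks, restart the timeout and unpark
	 * the scheduler thread.
	 */
	drm_sched_start(sched, true);
}

Andrey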