On 01/07/2019 09:13 AM, Christian König wrote:
> On 03.01.19 at 18:42, Grodzovsky, Andrey wrote:
>>
>> On 01/03/2019 11:20 AM, Grodzovsky, Andrey wrote:
>>> On 01/03/2019 03:54 AM, Koenig, Christian wrote:
>>>> On 21.12.18 at 21:36, Grodzovsky, Andrey wrote:
>>>>> On 12/21/2018 01:37 PM, Christian König wrote:
>>>>>> On 20.12.18 at 20:23, Andrey Grodzovsky wrote:
>>>>>>> Decouple sched threads stop and start and ring mirror
>>>>>>> list handling from the policy of what to do about the
>>>>>>> guilty jobs.
>>>>>>> When stopping the sched thread and detaching sched fences
>>>>>>> from non-signaled HW fences, wait for all signaled HW fences
>>>>>>> to complete before rerunning the jobs.
>>>>>>>
>>>>>>> v2: Fix resubmission of guilty job into HW after refactoring.
>>>>>>>
>>>>>>> v4:
>>>>>>> Full restart for all the jobs, not only from guilty ring.
>>>>>>> Extract karma increase into standalone function.
>>>>>>>
>>>>>>> v5:
>>>>>>> Rework waiting for signaled jobs without relying on the job
>>>>>>> struct itself, as those might already be freed for the
>>>>>>> non-'guilty' jobs' schedulers.
>>>>>>> Expose karma increase to drivers.
>>>>>>>
>>>>>>> Suggested-by: Christian Koenig <Christian.Koenig@xxxxxxx>
>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx>
>>>>>>> ---
>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 18 +--
>>>>>>> drivers/gpu/drm/etnaviv/etnaviv_sched.c | 11 +-
>>>>>>> drivers/gpu/drm/scheduler/sched_main.c | 188 +++++++++++++++++++----------
>>>>>>> drivers/gpu/drm/v3d/v3d_sched.c | 12 +-
>>>>>>> include/drm/gpu_scheduler.h | 10 +-
>>>>>>> 5 files changed, 151 insertions(+), 88 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> index 8a078f4..a4bd2d3 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> @@ -3298,12 +3298,10 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>>>>> if (!ring || !ring->sched.thread)
>>>>>>> continue;
>>>>>>> - kthread_park(ring->sched.thread);
>>>>>>> + drm_sched_stop(&ring->sched, job ? &job->base : NULL);
>>>>>>> - if (job && job->base.sched != &ring->sched)
>>>>>>> - continue;
>>>>>>> -
>>>>>>> - drm_sched_hw_job_reset(&ring->sched, job ? &job->base : NULL);
>>>>>>> + if(job)
>>>>>>> + drm_sched_increase_karma(&job->base);
>>>>>> Since we dropped the "job && job->base.sched != &ring->sched" check
>>>>>> above, this will now increase the job's karma multiple times.
>>>>>>
>>>>>> Maybe just move that outside of the loop.
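
Just to make sure we mean the same thing, an untested sketch of that rework
inside amdgpu_device_pre_asic_reset() - the loop body stays as in the hunk
above, only the karma bump moves out in front of the loop:

    /* untested sketch, not part of the patch: karma is bumped once for
     * the whole reset instead of once per ring iteration */
    if (job)
        drm_sched_increase_karma(&job->base);

    for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
        struct amdgpu_ring *ring = adev->rings[i];

        if (!ring || !ring->sched.thread)
            continue;

        drm_sched_stop(&ring->sched, job ? &job->base : NULL);

        /* after all hw jobs are reset, hw fence is meaningless,
         * so force_completion */
        amdgpu_fence_driver_force_completion(ring);
    }
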
>>>>>>
>>>>>>> /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>>>>>>> amdgpu_fence_driver_force_completion(ring);
>>>>>>> @@ -3454,14 +3452,10 @@ static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev,
>>>>>>> if (!ring || !ring->sched.thread)
>>>>>>> continue;
>>>>>>> - /* only need recovery sched of the given job's ring
>>>>>>> - * or all rings (in the case @job is NULL)
>>>>>>> - * after above amdgpu_reset accomplished
>>>>>>> - */
>>>>>>> - if ((!job || job->base.sched == &ring->sched) && !adev->asic_reset_res)
>>>>>>> - drm_sched_job_recovery(&ring->sched);
>>>>>>> + if (!adev->asic_reset_res)
>>>>>>> + drm_sched_resubmit_jobs(&ring->sched);
>>>>>>> - kthread_unpark(ring->sched.thread);
>>>>>>> + drm_sched_start(&ring->sched, !adev->asic_reset_res);
>>>>>>> }
>>>>>>> if (!amdgpu_device_has_dc_support(adev)) {
>>>>>>> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
>>>>>>> index 49a6763..6f1268f 100644
>>>>>>> --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
>>>>>>> +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
>>>>>>> @@ -109,16 +109,19 @@ static void etnaviv_sched_timedout_job(struct drm_sched_job *sched_job)
>>>>>>> }
>>>>>>> /* block scheduler */
>>>>>>> - kthread_park(gpu->sched.thread);
>>>>>>> - drm_sched_hw_job_reset(&gpu->sched, sched_job);
>>>>>>> + drm_sched_stop(&gpu->sched, sched_job);
>>>>>>> +
>>>>>>> + if(sched_job)
>>>>>>> + drm_sched_increase_karma(sched_job);
>>>>>>> /* get the GPU back into the init state */
>>>>>>> etnaviv_core_dump(gpu);
>>>>>>> etnaviv_gpu_recover_hang(gpu);
>>>>>>> + drm_sched_resubmit_jobs(&gpu->sched);
>>>>>>> +
>>>>>>> /* restart scheduler after GPU is usable again */
>>>>>>> - drm_sched_job_recovery(&gpu->sched);
>>>>>>> - kthread_unpark(gpu->sched.thread);
>>>>>>> + drm_sched_start(&gpu->sched, true);
>>>>>>> }
>>>>>>> static void etnaviv_sched_free_job(struct drm_sched_job *sched_job)
>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>> index dbb6906..b5c5bee 100644
>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>> @@ -60,8 +60,6 @@
>>>>>>> static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb);
>>>>>>> -static void drm_sched_expel_job_unlocked(struct drm_sched_job *s_job);
>>>>>>> -
>>>>>>> /**
>>>>>>> * drm_sched_rq_init - initialize a given run queue struct
>>>>>>> *
>>>>>>> @@ -335,6 +333,42 @@ static void drm_sched_job_timedout(struct work_struct *work)
>>>>>>> spin_unlock_irqrestore(&sched->job_list_lock, flags);
>>>>>>> }
>>>>>> Kernel doc here would be nice to have.
>>>>>>
>>>>>>> +void drm_sched_increase_karma(struct drm_sched_job *bad)
>>>>>>> +{
>>>>>>> + int i;
>>>>>>> + struct drm_sched_entity *tmp;
>>>>>>> + struct drm_sched_entity *entity;
>>>>>>> + struct drm_gpu_scheduler *sched = bad->sched;
>>>>>>> +
>>>>>>> + /* don't increase @bad's karma if it's from KERNEL RQ,
>>>>>>> + * because sometimes GPU hang would cause kernel jobs (like VM updating jobs)
>>>>>>> + * corrupt but keep in mind that kernel jobs always considered good.
>>>>>>> + */
>>>>>>> + if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
>>>>>>> + atomic_inc(&bad->karma);
>>>>>>> + for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL;
>>>>>>> + i++) {
>>>>>>> + struct drm_sched_rq *rq = &sched->sched_rq[i];
>>>>>>> +
>>>>>>> + spin_lock(&rq->lock);
>>>>>>> + list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
>>>>>>> + if (bad->s_fence->scheduled.context ==
>>>>>>> + entity->fence_context) {
>>>>>>> + if (atomic_read(&bad->karma) >
>>>>>>> + bad->sched->hang_limit)
>>>>>>> + if (entity->guilty)
>>>>>>> + atomic_set(entity->guilty, 1);
>>>>>>> + break;
>>>>>>> + }
>>>>>>> + }
>>>>>>> + spin_unlock(&rq->lock);
>>>>>>> + if (&entity->list != &rq->entities)
>>>>>>> + break;
>>>>>>> + }
>>>>>>> + }
>>>>>>> +}
>>>>>>> +EXPORT_SYMBOL(drm_sched_increase_karma);
>>>>>>> +
>>>>>>> /**
>>>>>>> * drm_sched_hw_job_reset - stop the scheduler if it contains the bad job
>>>>>>> *
>>>>>>> @@ -342,13 +376,22 @@ static void drm_sched_job_timedout(struct work_struct *work)
>>>>>>> * @bad: bad scheduler job
>>>>>>> *
>>>>>>> */
>>>>>>> -void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>>>>>> +void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>>>>>> {
>>>>>>> - struct drm_sched_job *s_job;
>>>>>>> - struct drm_sched_entity *entity, *tmp;
>>>>>>> + struct drm_sched_job *s_job, *last_job;
>>>>>>> unsigned long flags;
>>>>>>> - int i;
>>>>>>> + struct dma_fence *wait_fence = NULL;
>>>>>>> + int r;
>>>>>>> +
>>>>>>> + kthread_park(sched->thread);
>>>>>>> + /*
>>>>>>> + * Verify all the signaled jobs in mirror list are removed from the ring
>>>>>>> + * by waiting for their respective scheduler fences to signal.
>>>>>>> + * Continually repeat traversing the ring mirror list until no more signaled
>>>>>>> + * fences are found
>>>>>>> + */
>>>>>>> +retry_wait:
>>>>>>> spin_lock_irqsave(&sched->job_list_lock, flags);
>>>>>>> list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, node) {
>>>>>>> if (s_job->s_fence->parent &&
>>>>>>> @@ -357,35 +400,45 @@ void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, struct drm_sched_jo
>>>>>>> dma_fence_put(s_job->s_fence->parent);
>>>>>>> s_job->s_fence->parent = NULL;
>>>>>>> atomic_dec(&sched->hw_rq_count);
>>>>>>> + } else {
>>>>>>> + wait_fence = dma_fence_get(&s_job->s_fence->finished);
>>>>>>> + last_job = s_job;
>>>>>>> + break;
>>>>>>> }
>>>>>>> }
>>>>>>> - spin_unlock_irqrestore(&sched->job_list_lock, flags);
>>>>>>> - if (bad && bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
>>>>>>> - atomic_inc(&bad->karma);
>>>>>>> - /* don't increase @bad's karma if it's from KERNEL RQ,
>>>>>>> - * becuase sometimes GPU hang would cause kernel jobs (like VM updating jobs)
>>>>>>> - * corrupt but keep in mind that kernel jobs always considered good.
>>>>>>> - */
>>>>>>> - for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL; i++ ) {
>>>>>>> - struct drm_sched_rq *rq = &sched->sched_rq[i];
>>>>>>> + /* No signaled jobs in the ring, its safe to proceed to ASIC reset */
>>>>>>> + if (!wait_fence) {
>>>>>>> + spin_unlock_irqrestore(&sched->job_list_lock, flags);
>>>>>>> + goto done;
>>>>>>> + }
>>>>>>> - spin_lock(&rq->lock);
>>>>>>> - list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
>>>>>>> - if (bad->s_fence->scheduled.context == entity->fence_context) {
>>>>>>> - if (atomic_read(&bad->karma) > bad->sched->hang_limit)
>>>>>>> - if (entity->guilty)
>>>>>>> - atomic_set(entity->guilty, 1);
>>>>>>> - break;
>>>>>>> - }
>>>>>>> - }
>>>>>>> - spin_unlock(&rq->lock);
>>>>>>> - if (&entity->list != &rq->entities)
>>>>>>> - break;
>>>>>>> + /* Restore removed cb since removing again already removed cb is undefined */
>>>>>>> + list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, node) {
>>>>>>> + if(s_job == last_job)
>>>>>>> + break;
>>>>>> Need to double check after the holidays, but you should be able to use
>>>>>> list_for_each_entry_continue here.
>>>>> I think it should work - kind of traversing back over all the jobs whose
>>>>> callbacks we just removed.
>>>> Wrapping your head around stuff again after the holidays sometimes shows
>>>> new problems we have completely missed :)
>>>>
>>>> Adding the callbacks again won't work because we have freed up our
>>>> reference to the hardware fence above:
>>>>> dma_fence_put(s_job->s_fence->parent);
>>>>> s_job->s_fence->parent = NULL;
>>>> We need to drop this or we would never be able to re-add the fences.
>>> Yeah, that's a big miss on my side...
>>>
>>>> But I'm still not sure if we shouldn't rely on the correct order of
>>>> signaling and simplify this instead.
>>> As you said before, once we switched to signaling the parent from the
>>> interrupt context instead of from scheduled work there is no danger of
>>> a race there
>> Correction here - once we switched to removing the job from the
>> ring_mirror_list directly in interrupt context instead of later from
>> scheduled work
>
> Ok, so let's stick with the approach of only waiting for the first
> signaled one found.
>
> But we need to remove setting the parent fence to NULL or otherwise we
> won't be able to add the callback ever again.
>
> Christian.

But we will not be adding the cb back in drm_sched_stop anymore; now we
are only going to add the cb back in drm_sched_start, after rerunning
those jobs in drm_sched_resubmit_jobs and assigning them a new parent
there anyway.

Andrey

>
>>
>> Andrey
>>
>>> So what if we submit job A and after it job B, and B completes before A
>>> (like the sync dependency test in the libdrm amdgpu tests, but without
>>> adding an explicit dependency of the second command on the first)? I
>>> believe that still in this case job B's parent (HW) fence will not be
>>> signaled before job A completes, since the EOP event is not signaled
>>> until the entire pipe has completed and flushed its caches, including
>>> job A. So from this it seems to me that indeed it's enough to wait for
>>> the last inserted job's parent (HW) fence in the ring mirror list to
>>> signal.
>>> Let me know what you think on that.
>>>
>>> P.S. V5 is not the last iteration; there was a V6 series.
>>>
>>> Andrey
>>>
>>>> Regards,
>>>> Christian.
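
To make sure we are talking about the same simplification, here is a rough,
untested sketch of drm_sched_stop() with both points folded in - the parent
fence is kept (so the callback can be re-armed later from drm_sched_start()),
and we only wait once, for the first already signaled job found when walking
the mirror list from the tail, relying on the HW fences signaling in
submission order as discussed above:

    void drm_sched_stop(struct drm_gpu_scheduler *sched,
                        struct drm_sched_job *bad)
    {
        /* @bad is unused here now that the karma handling lives in
         * drm_sched_increase_karma() */
        struct drm_sched_job *s_job;
        struct dma_fence *wait_fence = NULL;
        unsigned long flags;

        kthread_park(sched->thread);

        spin_lock_irqsave(&sched->job_list_lock, flags);
        list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, node) {
            if (s_job->s_fence->parent &&
                dma_fence_remove_callback(s_job->s_fence->parent,
                                          &s_job->s_fence->cb)) {
                /* not signaled on the HW side yet - just detach the
                 * callback, but keep s_fence->parent so drm_sched_start()
                 * can re-arm it later */
                atomic_dec(&sched->hw_rq_count);
            } else {
                /* already signaled on the HW side - wait for its finished
                 * fence, so everything that signaled has also left the
                 * mirror list (relies on in-order signaling) */
                wait_fence = dma_fence_get(&s_job->s_fence->finished);
                break;
            }
        }
        spin_unlock_irqrestore(&sched->job_list_lock, flags);

        if (wait_fence) {
            dma_fence_wait(wait_fence, false);
            dma_fence_put(wait_fence);
        }
    }

Not meant as the final code, just to pin down the idea.
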
>>>>
>>>>> Andrey
>>>>>
>>>>>>> +
>>>>>>> + if (s_job->s_fence->parent) {
>>>>>>> + r = dma_fence_add_callback(s_job->s_fence->parent,
>>>>>>> + &s_job->s_fence->cb,
>>>>>>> + drm_sched_process_job);
>>>>>>> + if (r)
>>>>>>> + DRM_ERROR("fence restore callback failed (%d)\n",
>>>>>>> + r);
>>>>>> When you fail to add the callback this means that you need to call
>>>>>> drm_sched_process_job manually here.
>>>>>>
>>>>>>> }
>>>>>>> }
>>>>>>> + spin_unlock_irqrestore(&sched->job_list_lock, flags);
>>>>>>> +
>>>>>>> + dma_fence_wait(wait_fence, false);
>>>>>>> + dma_fence_put(wait_fence);
>>>>>>> + wait_fence = NULL;
>>>>>>> +
>>>>>>> + goto retry_wait;
>>>>>>> +
>>>>>>> +done:
>>>>>>> + return;
>>>>>> Drop the done: label and return directly above.
>>>>>>
>>>>>> Apart from all those nitpicks this starts to look like it should work,
>>>>>> Christian.
>>>>>>
>>>>>>> }
>>>>>>> -EXPORT_SYMBOL(drm_sched_hw_job_reset);
>>>>>>> +EXPORT_SYMBOL(drm_sched_stop);
>>>>>>> /**
>>>>>>> * drm_sched_job_recovery - recover jobs after a reset
>>>>>>> @@ -393,33 +446,21 @@ EXPORT_SYMBOL(drm_sched_hw_job_reset);
>>>>>>> * @sched: scheduler instance
>>>>>>> *
>>>>>>> */
>>>>>>> -void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
>>>>>>> +void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
>>>>>>> {
>>>>>>> struct drm_sched_job *s_job, *tmp;
>>>>>>> - bool found_guilty = false;
>>>>>>> unsigned long flags;
>>>>>>> int r;
>>>>>>> + if (!full_recovery)
>>>>>>> + goto unpark;
>>>>>>> +
>>>>>>> spin_lock_irqsave(&sched->job_list_lock, flags);
>>>>>>> list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>>>>>>> struct drm_sched_fence *s_fence = s_job->s_fence;
>>>>>>> - struct dma_fence *fence;
>>>>>>> - uint64_t guilty_context;
>>>>>>> -
>>>>>>> - if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
>>>>>>> - found_guilty = true;
>>>>>>> - guilty_context = s_job->s_fence->scheduled.context;
>>>>>>> - }
>>>>>>> -
>>>>>>> - if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
>>>>>>> - dma_fence_set_error(&s_fence->finished, -ECANCELED);
>>>>>>> -
>>>>>>> - spin_unlock_irqrestore(&sched->job_list_lock, flags);
>>>>>>> - fence = sched->ops->run_job(s_job);
>>>>>>> - atomic_inc(&sched->hw_rq_count);
>>>>>>> + struct dma_fence *fence = s_job->s_fence->parent;
>>>>>>> if (fence) {
>>>>>>> - s_fence->parent = dma_fence_get(fence);
>>>>>>> r = dma_fence_add_callback(fence, &s_fence->cb,
>>>>>>> drm_sched_process_job);
>>>>>>> if (r == -ENOENT)
>>>>>>> @@ -427,18 +468,47 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
>>>>>>> else if (r)
>>>>>>> DRM_ERROR("fence add callback failed (%d)\n",
>>>>>>> r);
>>>>>>> - dma_fence_put(fence);
>>>>>>> - } else {
>>>>>>> - if (s_fence->finished.error < 0)
>>>>>>> - drm_sched_expel_job_unlocked(s_job);
>>>>>>> + } else
>>>>>>> drm_sched_process_job(NULL, &s_fence->cb);
>>>>>>> - }
>>>>>>> - spin_lock_irqsave(&sched->job_list_lock, flags);
>>>>>>> }
>>>>>>> +
>>>>>>> drm_sched_start_timeout(sched);
>>>>>>> spin_unlock_irqrestore(&sched->job_list_lock, flags);
>>>>>>> +
>>>>>>> +unpark:
>>>>>>> + kthread_unpark(sched->thread);
>>>>>>> +}
>>>>>>> +EXPORT_SYMBOL(drm_sched_start);
>>>>>>> +
>>>>>>> +/**
>>>>>>> + * drm_sched_resubmit_jobs - helper to relunch job from mirror ring list
>>>>>>> + *
>>>>>>> + * @sched: scheduler instance
>>>>>>> + *
>>>>>>> + */
>>>>>>> +void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched)
>>>>>>> +{
>>>>>>> + struct drm_sched_job *s_job, *tmp;
>>>>>>> + uint64_t guilty_context;
>>>>>>> + bool found_guilty = false;
>>>>>>> +
>>>>>>> + /*TODO DO we need spinlock here ? */
>>>>>>> + list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>>>>>>> + struct drm_sched_fence *s_fence = s_job->s_fence;
>>>>>>> +
>>>>>>> + if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
>>>>>>> + found_guilty = true;
>>>>>>> + guilty_context = s_job->s_fence->scheduled.context;
>>>>>>> + }
>>>>>>> +
>>>>>>> + if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
>>>>>>> + dma_fence_set_error(&s_fence->finished, -ECANCELED);
>>>>>>> +
>>>>>>> + s_job->s_fence->parent = sched->ops->run_job(s_job);
>>>>>>> + atomic_inc(&sched->hw_rq_count);
>>>>>>> + }
>>>>>>> }
>>>>>>> -EXPORT_SYMBOL(drm_sched_job_recovery);
>>>>>>> +EXPORT_SYMBOL(drm_sched_resubmit_jobs);
>>>>>>> /**
>>>>>>> * drm_sched_job_init - init a scheduler job
>>>>>>> @@ -634,26 +704,14 @@ static int drm_sched_main(void *param)
>>>>>>> DRM_ERROR("fence add callback failed (%d)\n",
>>>>>>> r);
>>>>>>> dma_fence_put(fence);
>>>>>>> - } else {
>>>>>>> - if (s_fence->finished.error < 0)
>>>>>>> - drm_sched_expel_job_unlocked(sched_job);
>>>>>>> + } else
>>>>>>> drm_sched_process_job(NULL, &s_fence->cb);
>>>>>>> - }
>>>>>>> wake_up(&sched->job_scheduled);
>>>>>>> }
>>>>>>> return 0;
>>>>>>> }
>>>>>>> -static void drm_sched_expel_job_unlocked(struct drm_sched_job *s_job)
>>>>>>> -{
>>>>>>> - struct drm_gpu_scheduler *sched = s_job->sched;
>>>>>>> -
>>>>>>> - spin_lock(&sched->job_list_lock);
>>>>>>> - list_del_init(&s_job->node);
>>>>>>> - spin_unlock(&sched->job_list_lock);
>>>>>>> -}
>>>>>>> -
>>>>>>> /**
>>>>>>> * drm_sched_init - Init a gpu scheduler instance
>>>>>>> *
>>>>>>> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
>>>>>>> index 445b2ef..f76d9ed 100644
>>>>>>> --- a/drivers/gpu/drm/v3d/v3d_sched.c
>>>>>>> +++ b/drivers/gpu/drm/v3d/v3d_sched.c
>>>>>>> @@ -178,18 +178,22 @@ v3d_job_timedout(struct drm_sched_job *sched_job)
>>>>>>> for (q = 0; q < V3D_MAX_QUEUES; q++) {
>>>>>>> struct drm_gpu_scheduler *sched = &v3d->queue[q].sched;
>>>>>>> - kthread_park(sched->thread);
>>>>>>> - drm_sched_hw_job_reset(sched, (sched_job->sched == sched ?
>>>>>>> + drm_sched_stop(sched, (sched_job->sched == sched ?
>>>>>>> sched_job : NULL));
>>>>>>> +
>>>>>>> + if(sched_job)
>>>>>>> + drm_sched_increase_karma(sched_job);
>>>>>>> }
>>>>>>> /* get the GPU back into the init state */
>>>>>>> v3d_reset(v3d);
>>>>>>> + for (q = 0; q < V3D_MAX_QUEUES; q++)
>>>>>>> + drm_sched_resubmit_jobs(sched_job->sched);
>>>>>>> +
>>>>>>> /* Unblock schedulers and restart their jobs. */
>>>>>>> for (q = 0; q < V3D_MAX_QUEUES; q++) {
>>>>>>> - drm_sched_job_recovery(&v3d->queue[q].sched);
>>>>>>> - kthread_unpark(v3d->queue[q].sched.thread);
>>>>>>> + drm_sched_start(&v3d->queue[q].sched, true);
>>>>>>> }
>>>>>>> mutex_unlock(&v3d->reset_lock);
>>>>>>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>>>>>>> index 47e1979..5ab2d97 100644
>>>>>>> --- a/include/drm/gpu_scheduler.h
>>>>>>> +++ b/include/drm/gpu_scheduler.h
>>>>>>> @@ -175,6 +175,7 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
>>>>>>> * finished to remove the job from the
>>>>>>> * @drm_gpu_scheduler.ring_mirror_list.
>>>>>>> * @node: used to append this struct to the @drm_gpu_scheduler.ring_mirror_list.
>>>>>>> + * @finish_node: used in a list to wait on before resetting the scheduler
>>>>>>> * @id: a unique id assigned to each job scheduled on the scheduler.
>>>>>>> * @karma: increment on every hang caused by this job. If this exceeds the hang
>>>>>>> * limit of the scheduler then the job is marked guilty and will not
>>>>>>> @@ -193,6 +194,7 @@ struct drm_sched_job {
>>>>>>> struct dma_fence_cb finish_cb;
>>>>>>> struct work_struct finish_work;
>>>>>>> struct list_head node;
>>>>>>> + struct list_head finish_node;
>>>>>>> uint64_t id;
>>>>>>> atomic_t karma;
>>>>>>> enum drm_sched_priority s_priority;
>>>>>>> @@ -298,9 +300,11 @@ int drm_sched_job_init(struct drm_sched_job *job,
>>>>>>> void *owner);
>>>>>>> void drm_sched_job_cleanup(struct drm_sched_job *job);
>>>>>>> void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
>>>>>>> -void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched,
>>>>>>> - struct drm_sched_job *job);
>>>>>>> -void drm_sched_job_recovery(struct drm_gpu_scheduler *sched);
>>>>>>> +void drm_sched_stop(struct drm_gpu_scheduler *sched,
>>>>>>> + struct drm_sched_job *job);
>>>>>>> +void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery);
>>>>>>> +void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched);
>>>>>>> +void drm_sched_increase_karma(struct drm_sched_job *bad);
>>>>>>> bool drm_sched_dependency_optimized(struct dma_fence* fence,
>>>>>>> struct drm_sched_entity *entity);
>>>>>>> void drm_sched_fault(struct drm_gpu_scheduler *sched);
>
_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx