Just a ping. Andrey On 01/09/2019 10:18 AM, Andrey Grodzovsky wrote: > > > On 01/09/2019 05:22 AM, Christian König wrote: >> Am 07.01.19 um 20:47 schrieb Grodzovsky, Andrey: >>> >>> On 01/07/2019 09:13 AM, Christian König wrote: >>>> Am 03.01.19 um 18:42 schrieb Grodzovsky, Andrey: >>>>> On 01/03/2019 11:20 AM, Grodzovsky, Andrey wrote: >>>>>> On 01/03/2019 03:54 AM, Koenig, Christian wrote: >>>>>>> Am 21.12.18 um 21:36 schrieb Grodzovsky, Andrey: >>>>>>>> On 12/21/2018 01:37 PM, Christian König wrote: >>>>>>>>> Am 20.12.18 um 20:23 schrieb Andrey Grodzovsky: >>>>>>>>>> Decouple sched threads stop and start and ring mirror >>>>>>>>>> list handling from the policy of what to do about the >>>>>>>>>> guilty jobs. >>>>>>>>>> When stopping the sched thread and detaching sched fences >>>>>>>>>> from non-signaled HW fences, wait for all signaled HW fences >>>>>>>>>> to complete before rerunning the jobs. >>>>>>>>>> >>>>>>>>>> v2: Fix resubmission of guilty job into HW after refactoring. >>>>>>>>>> >>>>>>>>>> v4: >>>>>>>>>> Full restart for all the jobs, not only from guilty ring. >>>>>>>>>> Extract karma increase into standalone function. >>>>>>>>>> >>>>>>>>>> v5: >>>>>>>>>> Rework waiting for signaled jobs without relying on the job >>>>>>>>>> struct itself as those might already be freed for non 'guilty' >>>>>>>>>> job's schedulers. >>>>>>>>>> Expose karma increase to drivers. >>>>>>>>>> >>>>>>>>>> Suggested-by: Christian Koenig <Christian.Koenig@xxxxxxx> >>>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx> >>>>>>>>>> --- >>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 18 +-- >>>>>>>>>> drivers/gpu/drm/etnaviv/etnaviv_sched.c | 11 +- >>>>>>>>>> drivers/gpu/drm/scheduler/sched_main.c | 188 >>>>>>>>>> +++++++++++++++++++---------- >>>>>>>>>> drivers/gpu/drm/v3d/v3d_sched.c | 12 +- >>>>>>>>>> include/drm/gpu_scheduler.h | 10 +- >>>>>>>>>> 5 files changed, 151 insertions(+), 88 deletions(-) >>>>>>>>>> >>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>> index 8a078f4..a4bd2d3 100644 >>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>> @@ -3298,12 +3298,10 @@ static int >>>>>>>>>> amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, >>>>>>>>>> if (!ring || !ring->sched.thread) >>>>>>>>>> continue; >>>>>>>>>> - kthread_park(ring->sched.thread); >>>>>>>>>> + drm_sched_stop(&ring->sched, job ? &job->base : NULL); >>>>>>>>>> - if (job && job->base.sched != &ring->sched) >>>>>>>>>> - continue; >>>>>>>>>> - >>>>>>>>>> - drm_sched_hw_job_reset(&ring->sched, job ? &job->base : >>>>>>>>>> NULL); >>>>>>>>>> + if(job) >>>>>>>>>> + drm_sched_increase_karma(&job->base); >>>>>>>>> Since we dropped the "job && job->base.sched != &ring->sched" >>>>>>>>> check >>>>>>>>> above, this will now increase the job's karma multiple times. >>>>>>>>> >>>>>>>>> Maybe just move that outside of the loop.
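A minimal sketch of that suggestion for amdgpu_device_pre_asic_reset(), with the karma increase hoisted out of the per-ring loop so it runs exactly once; the loop header over adev->rings / AMDGPU_MAX_RINGS is assumed from the usual amdgpu iteration pattern and is not part of the quoted hunk:

	/* Increase karma once, before touching the individual rings,
	 * so dropping the per-ring sched check no longer multiplies it.
	 */
	if (job)
		drm_sched_increase_karma(&job->base);

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		/* Park the scheduler thread and detach the scheduler fences. */
		drm_sched_stop(&ring->sched, job ? &job->base : NULL);

		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
		amdgpu_fence_driver_force_completion(ring);
	}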
>>>>>>>>> >>>>>>>>>> /* after all hw jobs are reset, hw fence is >>>>>>>>>> meaningless, >>>>>>>>>> so force_completion */ >>>>>>>>>> amdgpu_fence_driver_force_completion(ring); >>>>>>>>>> @@ -3454,14 +3452,10 @@ static void >>>>>>>>>> amdgpu_device_post_asic_reset(struct amdgpu_device *adev, >>>>>>>>>> if (!ring || !ring->sched.thread) >>>>>>>>>> continue; >>>>>>>>>> - /* only need recovery sched of the given >>>>>>>>>> job's ring >>>>>>>>>> - * or all rings (in the case @job is NULL) >>>>>>>>>> - * after above amdgpu_reset accomplished >>>>>>>>>> - */ >>>>>>>>>> - if ((!job || job->base.sched == &ring->sched) && >>>>>>>>>> !adev->asic_reset_res) >>>>>>>>>> - drm_sched_job_recovery(&ring->sched); >>>>>>>>>> + if (!adev->asic_reset_res) >>>>>>>>>> + drm_sched_resubmit_jobs(&ring->sched); >>>>>>>>>> - kthread_unpark(ring->sched.thread); >>>>>>>>>> + drm_sched_start(&ring->sched, !adev->asic_reset_res); >>>>>>>>>> } >>>>>>>>>> if (!amdgpu_device_has_dc_support(adev)) { >>>>>>>>>> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c >>>>>>>>>> b/drivers/gpu/drm/etnaviv/etnaviv_sched.c >>>>>>>>>> index 49a6763..6f1268f 100644 >>>>>>>>>> --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c >>>>>>>>>> +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c >>>>>>>>>> @@ -109,16 +109,19 @@ static void >>>>>>>>>> etnaviv_sched_timedout_job(struct >>>>>>>>>> drm_sched_job *sched_job) >>>>>>>>>> } >>>>>>>>>> /* block scheduler */ >>>>>>>>>> - kthread_park(gpu->sched.thread); >>>>>>>>>> - drm_sched_hw_job_reset(&gpu->sched, sched_job); >>>>>>>>>> + drm_sched_stop(&gpu->sched, sched_job); >>>>>>>>>> + >>>>>>>>>> + if(sched_job) >>>>>>>>>> + drm_sched_increase_karma(sched_job); >>>>>>>>>> /* get the GPU back into the init state */ >>>>>>>>>> etnaviv_core_dump(gpu); >>>>>>>>>> etnaviv_gpu_recover_hang(gpu); >>>>>>>>>> + drm_sched_resubmit_jobs(&gpu->sched); >>>>>>>>>> + >>>>>>>>>> /* restart scheduler after GPU is usable again */ >>>>>>>>>> - drm_sched_job_recovery(&gpu->sched); >>>>>>>>>> - kthread_unpark(gpu->sched.thread); >>>>>>>>>> + drm_sched_start(&gpu->sched, true); >>>>>>>>>> } >>>>>>>>>> static void etnaviv_sched_free_job(struct drm_sched_job >>>>>>>>>> *sched_job) >>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c >>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c >>>>>>>>>> index dbb6906..b5c5bee 100644 >>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c >>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c >>>>>>>>>> @@ -60,8 +60,6 @@ >>>>>>>>>> static void drm_sched_process_job(struct dma_fence *f, >>>>>>>>>> struct >>>>>>>>>> dma_fence_cb *cb); >>>>>>>>>> -static void drm_sched_expel_job_unlocked(struct >>>>>>>>>> drm_sched_job >>>>>>>>>> *s_job); >>>>>>>>>> - >>>>>>>>>> /** >>>>>>>>>> * drm_sched_rq_init - initialize a given run queue >>>>>>>>>> struct >>>>>>>>>> * >>>>>>>>>> @@ -335,6 +333,42 @@ static void drm_sched_job_timedout(struct >>>>>>>>>> work_struct *work) >>>>>>>>>> spin_unlock_irqrestore(&sched->job_list_lock, flags); >>>>>>>>>> } >>>>>>>>> Kernel doc here would be nice to have. 
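One possible wording for that kernel doc, based on what the function below actually does; the exact phrasing is of course up to the author:

/**
 * drm_sched_increase_karma - increment the karma of a bad job
 *
 * @bad: the job suspected of causing the hang
 *
 * Increments @bad's karma unless the job was submitted from the KERNEL
 * run queue, since kernel jobs are always considered good. If the karma
 * now exceeds the scheduler's hang limit, the entity owning the job is
 * marked guilty via the guilty atomic the driver passed at entity init,
 * if any.
 */
void drm_sched_increase_karma(struct drm_sched_job *bad);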
>>>>>>>>> >>>>>>>>>> +void drm_sched_increase_karma(struct drm_sched_job *bad) >>>>>>>>>> +{ >>>>>>>>>> + int i; >>>>>>>>>> + struct drm_sched_entity *tmp; >>>>>>>>>> + struct drm_sched_entity *entity; >>>>>>>>>> + struct drm_gpu_scheduler *sched = bad->sched; >>>>>>>>>> + >>>>>>>>>> + /* don't increase @bad's karma if it's from KERNEL RQ, >>>>>>>>>> + * because sometimes GPU hang would cause kernel jobs >>>>>>>>>> (like VM >>>>>>>>>> updating jobs) >>>>>>>>>> + * corrupt but keep in mind that kernel jobs always >>>>>>>>>> considered >>>>>>>>>> good. >>>>>>>>>> + */ >>>>>>>>>> + if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) { >>>>>>>>>> + atomic_inc(&bad->karma); >>>>>>>>>> + for (i = DRM_SCHED_PRIORITY_MIN; i < >>>>>>>>>> DRM_SCHED_PRIORITY_KERNEL; >>>>>>>>>> + i++) { >>>>>>>>>> + struct drm_sched_rq *rq = &sched->sched_rq[i]; >>>>>>>>>> + >>>>>>>>>> + spin_lock(&rq->lock); >>>>>>>>>> + list_for_each_entry_safe(entity, tmp, >>>>>>>>>> &rq->entities, >>>>>>>>>> list) { >>>>>>>>>> + if (bad->s_fence->scheduled.context == >>>>>>>>>> + entity->fence_context) { >>>>>>>>>> + if (atomic_read(&bad->karma) > >>>>>>>>>> + bad->sched->hang_limit) >>>>>>>>>> + if (entity->guilty) >>>>>>>>>> + atomic_set(entity->guilty, 1); >>>>>>>>>> + break; >>>>>>>>>> + } >>>>>>>>>> + } >>>>>>>>>> + spin_unlock(&rq->lock); >>>>>>>>>> + if (&entity->list != &rq->entities) >>>>>>>>>> + break; >>>>>>>>>> + } >>>>>>>>>> + } >>>>>>>>>> +} >>>>>>>>>> +EXPORT_SYMBOL(drm_sched_increase_karma); >>>>>>>>>> + >>>>>>>>>> /** >>>>>>>>>> * drm_sched_hw_job_reset - stop the scheduler if it >>>>>>>>>> contains the >>>>>>>>>> bad job >>>>>>>>>> * >>>>>>>>>> @@ -342,13 +376,22 @@ static void drm_sched_job_timedout(struct >>>>>>>>>> work_struct *work) >>>>>>>>>> * @bad: bad scheduler job >>>>>>>>>> * >>>>>>>>>> */ >>>>>>>>>> -void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, >>>>>>>>>> struct >>>>>>>>>> drm_sched_job *bad) >>>>>>>>>> +void drm_sched_stop(struct drm_gpu_scheduler *sched, struct >>>>>>>>>> drm_sched_job *bad) >>>>>>>>>> { >>>>>>>>>> - struct drm_sched_job *s_job; >>>>>>>>>> - struct drm_sched_entity *entity, *tmp; >>>>>>>>>> + struct drm_sched_job *s_job, *last_job; >>>>>>>>>> unsigned long flags; >>>>>>>>>> - int i; >>>>>>>>>> + struct dma_fence *wait_fence = NULL; >>>>>>>>>> + int r; >>>>>>>>>> + >>>>>>>>>> + kthread_park(sched->thread); >>>>>>>>>> + /* >>>>>>>>>> + * Verify all the signaled jobs in mirror list are removed >>>>>>>>>> from >>>>>>>>>> the ring >>>>>>>>>> + * by waiting for their respective scheduler fences to >>>>>>>>>> signal. 
>>>>>>>>>> + * Continually repeat traversing the ring mirror list >>>>>>>>>> until no >>>>>>>>>> more signaled >>>>>>>>>> + * fences are found >>>>>>>>>> + */ >>>>>>>>>> +retry_wait: >>>>>>>>>> spin_lock_irqsave(&sched->job_list_lock, flags); >>>>>>>>>> list_for_each_entry_reverse(s_job, >>>>>>>>>> &sched->ring_mirror_list, >>>>>>>>>> node) { >>>>>>>>>> if (s_job->s_fence->parent && >>>>>>>>>> @@ -357,35 +400,45 @@ void drm_sched_hw_job_reset(struct >>>>>>>>>> drm_gpu_scheduler *sched, struct drm_sched_jo >>>>>>>>>> dma_fence_put(s_job->s_fence->parent); >>>>>>>>>> s_job->s_fence->parent = NULL; >>>>>>>>>> atomic_dec(&sched->hw_rq_count); >>>>>>>>>> + } else { >>>>>>>>>> + wait_fence = >>>>>>>>>> dma_fence_get(&s_job->s_fence->finished); >>>>>>>>>> + last_job = s_job; >>>>>>>>>> + break; >>>>>>>>>> } >>>>>>>>>> } >>>>>>>>>> - spin_unlock_irqrestore(&sched->job_list_lock, flags); >>>>>>>>>> - if (bad && bad->s_priority != >>>>>>>>>> DRM_SCHED_PRIORITY_KERNEL) { >>>>>>>>>> - atomic_inc(&bad->karma); >>>>>>>>>> - /* don't increase @bad's karma if it's from KERNEL RQ, >>>>>>>>>> - * becuase sometimes GPU hang would cause kernel jobs >>>>>>>>>> (like >>>>>>>>>> VM updating jobs) >>>>>>>>>> - * corrupt but keep in mind that kernel jobs always >>>>>>>>>> considered good. >>>>>>>>>> - */ >>>>>>>>>> - for (i = DRM_SCHED_PRIORITY_MIN; i < >>>>>>>>>> DRM_SCHED_PRIORITY_KERNEL; i++ ) { >>>>>>>>>> - struct drm_sched_rq *rq = &sched->sched_rq[i]; >>>>>>>>>> + /* No signaled jobs in the ring, its safe to proceed to >>>>>>>>>> ASIC >>>>>>>>>> reset */ >>>>>>>>>> + if (!wait_fence) { >>>>>>>>>> + spin_unlock_irqrestore(&sched->job_list_lock, flags); >>>>>>>>>> + goto done; >>>>>>>>>> + } >>>>>>>>>> - spin_lock(&rq->lock); >>>>>>>>>> - list_for_each_entry_safe(entity, tmp, >>>>>>>>>> &rq->entities, >>>>>>>>>> list) { >>>>>>>>>> - if (bad->s_fence->scheduled.context == >>>>>>>>>> entity->fence_context) { >>>>>>>>>> - if (atomic_read(&bad->karma) > >>>>>>>>>> bad->sched->hang_limit) >>>>>>>>>> - if (entity->guilty) >>>>>>>>>> - atomic_set(entity->guilty, 1); >>>>>>>>>> - break; >>>>>>>>>> - } >>>>>>>>>> - } >>>>>>>>>> - spin_unlock(&rq->lock); >>>>>>>>>> - if (&entity->list != &rq->entities) >>>>>>>>>> - break; >>>>>>>>>> + /* Restore removed cb since removing again already removed >>>>>>>>>> cb is >>>>>>>>>> undefined */ >>>>>>>>>> + list_for_each_entry_reverse(s_job, >>>>>>>>>> &sched->ring_mirror_list, >>>>>>>>>> node) { >>>>>>>>>> + if(s_job == last_job) >>>>>>>>>> + break; >>>>>>>>> Need to double check after the holidays, but you should be able >>>>>>>>> to use >>>>>>>>> list_for_each_entry_continue here. >>>>>>>> I think it should work - kind of traversing back over all the jobs >>>>>>>> whose callbacks we just removed. >>>>>>> Wrapping your head around stuff again after the holidays sometimes >>>>>>> shows >>>>>>> new problems we have completely missed :) >>>>>>> >>>>>>> Adding the callbacks again won't work because we have freed up our >>>>>>> reference to the hardware fence above: >>>>>>>> dma_fence_put(s_job->s_fence->parent); >>>>>>>> s_job->s_fence->parent = NULL; >>>>>>> We need to drop this or we would never be able to re-add the fences >>>>>> Yeah, that's a big miss on my side... >>>>>> >>>>>>> . >>>>>>> >>>>>>> But I'm still not sure if we shouldn't rely on the correct order of >>>>>>> signaling and simplify this instead.
>>>>>> As you said before, once we switched to signaling the parent from >>>>>> the >>>>>> interrupt context instead of scheduled work there is no danger of a race there >>>>> Correction here - once we switched to removing the job from the mirror_ring >>>>> list directly in interrupt context instead of later from scheduled work >>>> Ok, so let's stick with the approach of only waiting for the first >>>> signaled one found. >>>> >>>> But we need to remove setting the parent fence to NULL or otherwise we >>>> won't be able to add the callback ever again. >>>> >>>> Christian. >>> But we will not be adding the cb back in drm_sched_stop anymore, now we >>> are only going to add back the cb in drm_sched_start after rerunning >>> those jobs in drm_sched_resubmit_jobs and assign them a new parent >>> there >>> anyway. >> >> Yeah, but when we find that we don't need to reset anything anymore >> then adding the callbacks again won't be possible any more. >> >> Christian. > > I am not sure I understand it, can you point me to an example of how this > will happen? I am attaching my latest patches with waiting only for > the last job's fence here just so we are on the same page regarding the code. > > Andrey > >> >>> >>> Andrey >>> >>>>> Andrey >>>>> >>>>>> , so >>>>>> what if we submit job A and after it job B and B completes before A >>>>>> (like the sync dependency test in libdrm amdgpu tests but without >>>>>> adding >>>>>> an explicit dependency to the second command on the first) I believe >>>>>> that >>>>>> still in this case job B's parent (HW) fence will not be signaled >>>>>> before >>>>>> job A completes since the EOP event is not signaled until the entire >>>>>> pipe >>>>>> has completed and flushed its caches, including job A. So from this >>>>>> it seems to >>>>>> me that indeed it's enough to wait for the last inserted job's >>>>>> parent >>>>>> (HW) fence in the ring mirror list to signal. >>>>>> Let me know what you think on that. >>>>>> >>>>>> P.S. V5 is not the last iteration and there was a V6 series. >>>>>> >>>>>> Andrey >>>>>> >>>>>>> Regards, >>>>>>> Christian. >>>>>>> >>>>>>>> Andrey >>>>>>>> >>>>>>>>>> + >>>>>>>>>> + if (s_job->s_fence->parent) { >>>>>>>>>> + r = dma_fence_add_callback(s_job->s_fence->parent, >>>>>>>>>> + &s_job->s_fence->cb, >>>>>>>>>> + drm_sched_process_job); >>>>>>>>>> + if (r) >>>>>>>>>> + DRM_ERROR("fence restore callback failed >>>>>>>>>> (%d)\n", >>>>>>>>>> + r); >>>>>>>>> When you fail to add the callback this means that you need to >>>>>>>>> call drm_sched_process_job manually here. >>>>>>>>> >>>>>>>>>> } >>>>>>>>>> } >>>>>>>>>> + spin_unlock_irqrestore(&sched->job_list_lock, flags); >>>>>>>>>> + >>>>>>>>>> + dma_fence_wait(wait_fence, false); >>>>>>>>>> + dma_fence_put(wait_fence); >>>>>>>>>> + wait_fence = NULL; >>>>>>>>>> + >>>>>>>>>> + goto retry_wait; >>>>>>>>>> + >>>>>>>>>> +done: >>>>>>>>>> + return; >>>>>>>>> Drop the done: label and return directly above. >>>>>>>>> >>>>>>>>> Apart from all those nit picks this starts to look like it should >>>>>>>>> work, >>>>>>>>> Christian.
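For reference, a sketch of the callback-restore loop from the quoted hunk with the fallback Christian asks for above; the -ENOENT handling follows the pattern the scheduler already uses when a fence has signaled before the callback could be installed, and the surrounding drm_sched_stop() context is abbreviated:

	list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, node) {
		if (s_job == last_job)
			break;

		if (s_job->s_fence->parent) {
			r = dma_fence_add_callback(s_job->s_fence->parent,
						   &s_job->s_fence->cb,
						   drm_sched_process_job);
			if (r == -ENOENT)
				/* parent already signaled, complete the job manually */
				drm_sched_process_job(s_job->s_fence->parent,
						      &s_job->s_fence->cb);
			else if (r)
				DRM_ERROR("fence restore callback failed (%d)\n",
					  r);
		}
	}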
>>>>>>>>> >>>>>>>>>> } >>>>>>>>>> -EXPORT_SYMBOL(drm_sched_hw_job_reset); >>>>>>>>>> +EXPORT_SYMBOL(drm_sched_stop); >>>>>>>>>> /** >>>>>>>>>> * drm_sched_job_recovery - recover jobs after a reset >>>>>>>>>> @@ -393,33 +446,21 @@ EXPORT_SYMBOL(drm_sched_hw_job_reset); >>>>>>>>>> * @sched: scheduler instance >>>>>>>>>> * >>>>>>>>>> */ >>>>>>>>>> -void drm_sched_job_recovery(struct drm_gpu_scheduler *sched) >>>>>>>>>> +void drm_sched_start(struct drm_gpu_scheduler *sched, bool >>>>>>>>>> full_recovery) >>>>>>>>>> { >>>>>>>>>> struct drm_sched_job *s_job, *tmp; >>>>>>>>>> - bool found_guilty = false; >>>>>>>>>> unsigned long flags; >>>>>>>>>> int r; >>>>>>>>>> + if (!full_recovery) >>>>>>>>>> + goto unpark; >>>>>>>>>> + >>>>>>>>>> spin_lock_irqsave(&sched->job_list_lock, flags); >>>>>>>>>> list_for_each_entry_safe(s_job, tmp, >>>>>>>>>> &sched->ring_mirror_list, >>>>>>>>>> node) { >>>>>>>>>> struct drm_sched_fence *s_fence = s_job->s_fence; >>>>>>>>>> - struct dma_fence *fence; >>>>>>>>>> - uint64_t guilty_context; >>>>>>>>>> - >>>>>>>>>> - if (!found_guilty && atomic_read(&s_job->karma) > >>>>>>>>>> sched->hang_limit) { >>>>>>>>>> - found_guilty = true; >>>>>>>>>> - guilty_context = s_job->s_fence->scheduled.context; >>>>>>>>>> - } >>>>>>>>>> - >>>>>>>>>> - if (found_guilty && >>>>>>>>>> s_job->s_fence->scheduled.context == >>>>>>>>>> guilty_context) >>>>>>>>>> - dma_fence_set_error(&s_fence->finished, -ECANCELED); >>>>>>>>>> - >>>>>>>>>> - spin_unlock_irqrestore(&sched->job_list_lock, flags); >>>>>>>>>> - fence = sched->ops->run_job(s_job); >>>>>>>>>> - atomic_inc(&sched->hw_rq_count); >>>>>>>>>> + struct dma_fence *fence = s_job->s_fence->parent; >>>>>>>>>> if (fence) { >>>>>>>>>> - s_fence->parent = dma_fence_get(fence); >>>>>>>>>> r = dma_fence_add_callback(fence, >>>>>>>>>> &s_fence->cb, >>>>>>>>>> drm_sched_process_job); >>>>>>>>>> if (r == -ENOENT) >>>>>>>>>> @@ -427,18 +468,47 @@ void drm_sched_job_recovery(struct >>>>>>>>>> drm_gpu_scheduler *sched) >>>>>>>>>> else if (r) >>>>>>>>>> DRM_ERROR("fence add callback failed >>>>>>>>>> (%d)\n", >>>>>>>>>> r); >>>>>>>>>> - dma_fence_put(fence); >>>>>>>>>> - } else { >>>>>>>>>> - if (s_fence->finished.error < 0) >>>>>>>>>> - drm_sched_expel_job_unlocked(s_job); >>>>>>>>>> + } else >>>>>>>>>> drm_sched_process_job(NULL, &s_fence->cb); >>>>>>>>>> - } >>>>>>>>>> - spin_lock_irqsave(&sched->job_list_lock, flags); >>>>>>>>>> } >>>>>>>>>> + >>>>>>>>>> drm_sched_start_timeout(sched); >>>>>>>>>> spin_unlock_irqrestore(&sched->job_list_lock, flags); >>>>>>>>>> + >>>>>>>>>> +unpark: >>>>>>>>>> + kthread_unpark(sched->thread); >>>>>>>>>> +} >>>>>>>>>> +EXPORT_SYMBOL(drm_sched_start); >>>>>>>>>> + >>>>>>>>>> +/** >>>>>>>>>> + * drm_sched_resubmit_jobs - helper to relunch job from mirror >>>>>>>>>> ring >>>>>>>>>> list >>>>>>>>>> + * >>>>>>>>>> + * @sched: scheduler instance >>>>>>>>>> + * >>>>>>>>>> + */ >>>>>>>>>> +void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched) >>>>>>>>>> +{ >>>>>>>>>> + struct drm_sched_job *s_job, *tmp; >>>>>>>>>> + uint64_t guilty_context; >>>>>>>>>> + bool found_guilty = false; >>>>>>>>>> + >>>>>>>>>> + /*TODO DO we need spinlock here ? 
*/ >>>>>>>>>> + list_for_each_entry_safe(s_job, tmp, >>>>>>>>>> &sched->ring_mirror_list, >>>>>>>>>> node) { >>>>>>>>>> + struct drm_sched_fence *s_fence = s_job->s_fence; >>>>>>>>>> + >>>>>>>>>> + if (!found_guilty && atomic_read(&s_job->karma) > >>>>>>>>>> sched->hang_limit) { >>>>>>>>>> + found_guilty = true; >>>>>>>>>> + guilty_context = s_job->s_fence->scheduled.context; >>>>>>>>>> + } >>>>>>>>>> + >>>>>>>>>> + if (found_guilty && >>>>>>>>>> s_job->s_fence->scheduled.context == >>>>>>>>>> guilty_context) >>>>>>>>>> + dma_fence_set_error(&s_fence->finished, -ECANCELED); >>>>>>>>>> + >>>>>>>>>> + s_job->s_fence->parent = sched->ops->run_job(s_job); >>>>>>>>>> + atomic_inc(&sched->hw_rq_count); >>>>>>>>>> + } >>>>>>>>>> } >>>>>>>>>> -EXPORT_SYMBOL(drm_sched_job_recovery); >>>>>>>>>> +EXPORT_SYMBOL(drm_sched_resubmit_jobs); >>>>>>>>>> /** >>>>>>>>>> * drm_sched_job_init - init a scheduler job >>>>>>>>>> @@ -634,26 +704,14 @@ static int drm_sched_main(void *param) >>>>>>>>>> DRM_ERROR("fence add callback failed >>>>>>>>>> (%d)\n", >>>>>>>>>> r); >>>>>>>>>> dma_fence_put(fence); >>>>>>>>>> - } else { >>>>>>>>>> - if (s_fence->finished.error < 0) >>>>>>>>>> - drm_sched_expel_job_unlocked(sched_job); >>>>>>>>>> + } else >>>>>>>>>> drm_sched_process_job(NULL, &s_fence->cb); >>>>>>>>>> - } >>>>>>>>>> wake_up(&sched->job_scheduled); >>>>>>>>>> } >>>>>>>>>> return 0; >>>>>>>>>> } >>>>>>>>>> -static void drm_sched_expel_job_unlocked(struct >>>>>>>>>> drm_sched_job *s_job) >>>>>>>>>> -{ >>>>>>>>>> - struct drm_gpu_scheduler *sched = s_job->sched; >>>>>>>>>> - >>>>>>>>>> - spin_lock(&sched->job_list_lock); >>>>>>>>>> - list_del_init(&s_job->node); >>>>>>>>>> - spin_unlock(&sched->job_list_lock); >>>>>>>>>> -} >>>>>>>>>> - >>>>>>>>>> /** >>>>>>>>>> * drm_sched_init - Init a gpu scheduler instance >>>>>>>>>> * >>>>>>>>>> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c >>>>>>>>>> b/drivers/gpu/drm/v3d/v3d_sched.c >>>>>>>>>> index 445b2ef..f76d9ed 100644 >>>>>>>>>> --- a/drivers/gpu/drm/v3d/v3d_sched.c >>>>>>>>>> +++ b/drivers/gpu/drm/v3d/v3d_sched.c >>>>>>>>>> @@ -178,18 +178,22 @@ v3d_job_timedout(struct drm_sched_job >>>>>>>>>> *sched_job) >>>>>>>>>> for (q = 0; q < V3D_MAX_QUEUES; q++) { >>>>>>>>>> struct drm_gpu_scheduler *sched = >>>>>>>>>> &v3d->queue[q].sched; >>>>>>>>>> - kthread_park(sched->thread); >>>>>>>>>> - drm_sched_hw_job_reset(sched, (sched_job->sched == >>>>>>>>>> sched ? >>>>>>>>>> + drm_sched_stop(sched, (sched_job->sched == sched ? >>>>>>>>>> sched_job : NULL)); >>>>>>>>>> + >>>>>>>>>> + if(sched_job) >>>>>>>>>> + drm_sched_increase_karma(sched_job); >>>>>>>>>> } >>>>>>>>>> /* get the GPU back into the init state */ >>>>>>>>>> v3d_reset(v3d); >>>>>>>>>> + for (q = 0; q < V3D_MAX_QUEUES; q++) >>>>>>>>>> + drm_sched_resubmit_jobs(sched_job->sched); >>>>>>>>>> + >>>>>>>>>> /* Unblock schedulers and restart their jobs. 
*/ >>>>>>>>>> for (q = 0; q < V3D_MAX_QUEUES; q++) { >>>>>>>>>> - drm_sched_job_recovery(&v3d->queue[q].sched); >>>>>>>>>> - kthread_unpark(v3d->queue[q].sched.thread); >>>>>>>>>> + drm_sched_start(&v3d->queue[q].sched, true); >>>>>>>>>> } >>>>>>>>>> mutex_unlock(&v3d->reset_lock); >>>>>>>>>> diff --git a/include/drm/gpu_scheduler.h >>>>>>>>>> b/include/drm/gpu_scheduler.h >>>>>>>>>> index 47e1979..5ab2d97 100644 >>>>>>>>>> --- a/include/drm/gpu_scheduler.h >>>>>>>>>> +++ b/include/drm/gpu_scheduler.h >>>>>>>>>> @@ -175,6 +175,7 @@ struct drm_sched_fence >>>>>>>>>> *to_drm_sched_fence(struct >>>>>>>>>> dma_fence *f); >>>>>>>>>> * finished to remove the job from the >>>>>>>>>> * @drm_gpu_scheduler.ring_mirror_list. >>>>>>>>>> * @node: used to append this struct to the >>>>>>>>>> @drm_gpu_scheduler.ring_mirror_list. >>>>>>>>>> + * @finish_node: used in a list to wait on before resetting the >>>>>>>>>> scheduler >>>>>>>>>> * @id: a unique id assigned to each job scheduled on the >>>>>>>>>> scheduler. >>>>>>>>>> * @karma: increment on every hang caused by this job. If >>>>>>>>>> this >>>>>>>>>> exceeds the hang >>>>>>>>>> * limit of the scheduler then the job is marked >>>>>>>>>> guilty and >>>>>>>>>> will not >>>>>>>>>> @@ -193,6 +194,7 @@ struct drm_sched_job { >>>>>>>>>> struct dma_fence_cb finish_cb; >>>>>>>>>> struct work_struct finish_work; >>>>>>>>>> struct list_head node; >>>>>>>>>> + struct list_head finish_node; >>>>>>>>>> uint64_t id; >>>>>>>>>> atomic_t karma; >>>>>>>>>> enum drm_sched_priority s_priority; >>>>>>>>>> @@ -298,9 +300,11 @@ int drm_sched_job_init(struct drm_sched_job >>>>>>>>>> *job, >>>>>>>>>> void *owner); >>>>>>>>>> void drm_sched_job_cleanup(struct drm_sched_job *job); >>>>>>>>>> void drm_sched_wakeup(struct drm_gpu_scheduler *sched); >>>>>>>>>> -void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, >>>>>>>>>> - struct drm_sched_job *job); >>>>>>>>>> -void drm_sched_job_recovery(struct drm_gpu_scheduler *sched); >>>>>>>>>> +void drm_sched_stop(struct drm_gpu_scheduler *sched, >>>>>>>>>> + struct drm_sched_job *job); >>>>>>>>>> +void drm_sched_start(struct drm_gpu_scheduler *sched, bool >>>>>>>>>> full_recovery); >>>>>>>>>> +void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched); >>>>>>>>>> +void drm_sched_increase_karma(struct drm_sched_job *bad); >>>>>>>>>> bool drm_sched_dependency_optimized(struct dma_fence* >>>>>>>>>> fence, >>>>>>>>>> struct drm_sched_entity *entity); >>>>>>>>>> void drm_sched_fault(struct drm_gpu_scheduler *sched); _______________________________________________ dri-devel mailing list dri-devel@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/dri-devel