[why]
A gfx job may be processed but not yet finished when a reset begins due
to a compute job timeout. drm_sched_resubmit_jobs_ext() in sched_main
assumes a submitted job is unsignaled and always puts the parent fence.
Resubmitting such a job causes a refcount underflow. This fix is done in
device reset to avoid changing drm sched_main.

[how]
Check whether the job to submit has already signaled, and skip the
submission if it has, in device reset for both advanced TDR and normal
job resume.

Signed-off-by: Yiqing Yao <yiqing.yao@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 72 ++++++++++++----------
 1 file changed, 41 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f16f105a737b..29b307af97eb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4980,39 +4980,43 @@ static void amdgpu_device_recheck_guilty_jobs(
 		/* for the real bad job, it will be resubmitted twice, adding a dma_fence_get
 		 * to make sure fence is balanced */
 		dma_fence_get(s_job->s_fence->parent);
-		drm_sched_resubmit_jobs_ext(&ring->sched, 1);
 
-		ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
-		if (ret == 0) { /* timeout */
-			DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
-						ring->sched.name, s_job->id);
+		/* avoid submission for signaled hw fence */
+		if (!dma_fence_is_signaled(s_job->s_fence->parent)) {
 
-			/* set guilty */
-			drm_sched_increase_karma(s_job);
+			drm_sched_resubmit_jobs_ext(&ring->sched, 1);
+
+			ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
+			if (ret == 0) { /* timeout */
+				DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
+							ring->sched.name, s_job->id);
+
+				/* set guilty */
+				drm_sched_increase_karma(s_job);
 retry:
-			/* do hw reset */
-			if (amdgpu_sriov_vf(adev)) {
-				amdgpu_virt_fini_data_exchange(adev);
-				r = amdgpu_device_reset_sriov(adev, false);
-				if (r)
-					adev->asic_reset_res = r;
-			} else {
-				clear_bit(AMDGPU_SKIP_HW_RESET,
-					  &reset_context->flags);
-				r = amdgpu_do_asic_reset(device_list_handle,
-							 reset_context);
-				if (r && r == -EAGAIN)
-					goto retry;
-			}
+				/* do hw reset */
+				if (amdgpu_sriov_vf(adev)) {
+					amdgpu_virt_fini_data_exchange(adev);
+					r = amdgpu_device_reset_sriov(adev, false);
+					if (r)
+						adev->asic_reset_res = r;
+				} else {
+					clear_bit(AMDGPU_SKIP_HW_RESET,
+						  &reset_context->flags);
+					r = amdgpu_do_asic_reset(device_list_handle,
+								 reset_context);
+					if (r && r == -EAGAIN)
+						goto retry;
+				}
 
-			/*
-			 * add reset counter so that the following
-			 * resubmitted job could flush vmid
-			 */
-			atomic_inc(&adev->gpu_reset_counter);
-			continue;
+				/*
+				 * add reset counter so that the following
+				 * resubmitted job could flush vmid
+				 */
+				atomic_inc(&adev->gpu_reset_counter);
+				continue;
+			}
 		}
-
 		/* got the hw fence, signal finished fence */
 		atomic_dec(ring->sched.score);
 		dma_fence_put(s_job->s_fence->parent);
@@ -5221,13 +5225,19 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
 
 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 		struct amdgpu_ring *ring = tmp_adev->rings[i];
-
+		struct drm_sched_job *s_job;
 		if (!ring || !ring->sched.thread)
 			continue;
 
+		s_job = list_first_entry_or_null(&ring->sched.pending_list,
+				struct drm_sched_job, list);
+
+		if (s_job) {
 		/* No point to resubmit jobs if we didn't HW reset*/
-		if (!tmp_adev->asic_reset_res && !job_signaled)
-			drm_sched_resubmit_jobs(&ring->sched);
+		if (!tmp_adev->asic_reset_res && !job_signaled
+			&& !dma_fence_is_signaled(s_job->s_fence->parent))
+			drm_sched_resubmit_jobs(&ring->sched);
+		}
 
 		drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
 	}
-- 
2.25.1