In order to get more accurate engine hang detection, pass the hang job to check_soft_reset to find the hang engine instead of check register status. Signed-off-by: Victor Zhao <Victor.Zhao@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 18 +++++++++++------- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +- drivers/gpu/drm/amd/amdgpu/dce_v10_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 2 +- drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 2 +- drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 2 +- drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/tonga_ih.c | 2 +- drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/vce_v3_0.c | 2 +- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 2 +- drivers/gpu/drm/amd/include/amd_shared.h | 2 +- 17 files changed, 27 insertions(+), 23 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 22b846c359b4..4d6fd69f1bb4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1359,7 +1359,7 @@ int emu_soc_asic_init(struct amdgpu_device *adev); /* Common functions */ bool amdgpu_device_has_job_running(struct amdgpu_device *adev); -bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev); +bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev, struct amdgpu_job *job); int amdgpu_device_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job* job); void amdgpu_device_pci_config_reset(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 7a513e5f7156..4819aedbfe62 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -238,7 +238,7 @@ void amdgpu_amdkfd_gpu_reset(struct kgd_dev *kgd) { struct amdgpu_device *adev = (struct amdgpu_device *)kgd; - if (amdgpu_device_should_recover_gpu(adev)) + if (amdgpu_device_should_recover_gpu(adev, NULL)) amdgpu_device_gpu_recover(adev, NULL); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index a78a39e3d6e7..c82dc3621781 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4162,7 +4162,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon) * if the asic is still hung or not. * Returns true if any of the IPs are still in a hung state, false if not. */ -static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) +static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev, struct amdgpu_job *job) { int i; bool asic_hang = false; @@ -4178,7 +4178,7 @@ static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) continue; if (adev->ip_blocks[i].version->funcs->check_soft_reset) adev->ip_blocks[i].status.hang = - adev->ip_blocks[i].version->funcs->check_soft_reset(adev); + adev->ip_blocks[i].version->funcs->check_soft_reset(adev, job); if (adev->ip_blocks[i].status.hang) { dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); asic_hang = true; @@ -4466,11 +4466,15 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev) * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover * a hung GPU. */ -bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) +bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev, struct amdgpu_job *job) { - if (!amdgpu_device_ip_check_soft_reset(adev)) { - dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); - return false; + if (!amdgpu_device_ip_check_soft_reset(adev, job)) { + if (job) { + dev_info(adev->dev, "No matching soft reset found, do whole gpu recovery\n"); + } else { + dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); + return false; + } } if (amdgpu_gpu_recovery == 0) @@ -4616,7 +4620,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, amdgpu_device_ip_pre_soft_reset(adev); r = amdgpu_device_ip_soft_reset(adev); amdgpu_device_ip_post_soft_reset(adev); - if (r || amdgpu_device_ip_check_soft_reset(adev)) { + if (r || amdgpu_device_ip_check_soft_reset(adev, NULL)) { dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); need_full_reset = true; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index e0730ea56a8c..dc9a2390cf39 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -62,7 +62,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) DRM_ERROR("Process information: process %s pid %d thread %s pid %d\n", ti.process_name, ti.tgid, ti.task_name, ti.pid); - if (amdgpu_device_should_recover_gpu(ring->adev)) { + if (amdgpu_device_should_recover_gpu(ring->adev, job)) { amdgpu_device_gpu_recover(ring->adev, job); } else { drm_sched_suspend_timeout(&ring->sched); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index b642dde8833b..f2aa65016d1c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1871,7 +1871,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) amdgpu_put_xgmi_hive(hive); } - if (amdgpu_device_should_recover_gpu(ras->adev)) + if (amdgpu_device_should_recover_gpu(ras->adev, NULL)) amdgpu_device_gpu_recover(ras->adev, NULL); atomic_set(&ras->in_recovery, 0); } diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c index 22f9fedadbb8..cc042db4c823 100644 --- a/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c @@ -3033,7 +3033,7 @@ static int dce_v10_0_wait_for_idle(void *handle) return 0; } -static bool dce_v10_0_check_soft_reset(void *handle) +static bool dce_v10_0_check_soft_reset(void *handle, struct amdgpu_job *job) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index 1a476de20d08..a55d4a7914bc 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -4974,7 +4974,7 @@ static int gfx_v8_0_resume(void *handle) return gfx_v8_0_hw_init(handle); } -static bool gfx_v8_0_check_soft_reset(void *handle) +static bool gfx_v8_0_check_soft_reset(void *handle, struct amdgpu_job *job) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; u32 grbm_soft_reset = 0, srbm_soft_reset = 0; diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c index 9a3fc0926903..a3cbdb5703b7 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c @@ -1311,7 +1311,7 @@ static int gmc_v8_0_wait_for_idle(void *handle) } -static bool gmc_v8_0_check_soft_reset(void *handle) +static bool gmc_v8_0_check_soft_reset(void *handle, struct amdgpu_job *job) { u32 srbm_soft_reset = 0; struct amdgpu_device *adev = (struct amdgpu_device *)handle; diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index f8a9b5e4f5e4..a3d5529b3642 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -274,7 +274,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) up_write(&adev->reset_sem); /* Trigger recovery for world switch failure if no TDR */ - if (amdgpu_device_should_recover_gpu(adev) + if (amdgpu_device_should_recover_gpu(adev, NULL) && (!amdgpu_device_has_job_running(adev) || adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT)) amdgpu_device_gpu_recover(adev, NULL); diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index 1b0a1550a352..708f9aa3ab23 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -303,7 +303,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) up_write(&adev->reset_sem); /* Trigger recovery for world switch failure if no TDR */ - if (amdgpu_device_should_recover_gpu(adev) + if (amdgpu_device_should_recover_gpu(adev, NULL) && (!amdgpu_device_has_job_running(adev) || adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT || adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT || diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c index aef9d059ae52..1321c70bbccc 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c @@ -520,7 +520,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work) } /* Trigger recovery due to world switch failure */ - if (amdgpu_device_should_recover_gpu(adev)) + if (amdgpu_device_should_recover_gpu(adev, NULL)) amdgpu_device_gpu_recover(adev, NULL); } diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c index 135727b59c41..223f7411ea71 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c @@ -1250,7 +1250,7 @@ static int sdma_v3_0_wait_for_idle(void *handle) return -ETIMEDOUT; } -static bool sdma_v3_0_check_soft_reset(void *handle) +static bool sdma_v3_0_check_soft_reset(void *handle, struct amdgpu_job *job) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; u32 srbm_soft_reset = 0; diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c index b08905d1c00f..b222a003a9b8 100644 --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c @@ -380,7 +380,7 @@ static int tonga_ih_wait_for_idle(void *handle) return -ETIMEDOUT; } -static bool tonga_ih_check_soft_reset(void *handle) +static bool tonga_ih_check_soft_reset(void *handle, struct amdgpu_job *job) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; u32 srbm_soft_reset = 0; diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c index 2d558c2f417d..b2f5f15ea575 100644 --- a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c @@ -1159,7 +1159,7 @@ static int uvd_v6_0_wait_for_idle(void *handle) } #define AMDGPU_UVD_STATUS_BUSY_MASK 0xfd -static bool uvd_v6_0_check_soft_reset(void *handle) +static bool uvd_v6_0_check_soft_reset(void *handle, struct amdgpu_job *job) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; u32 srbm_soft_reset = 0; diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c index 142e291983b4..2e69ba269e5f 100644 --- a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c @@ -629,7 +629,7 @@ static int vce_v3_0_wait_for_idle(void *handle) #define AMDGPU_VCE_STATUS_BUSY_MASK (VCE_STATUS_VCPU_REPORT_AUTO_BUSY_MASK | \ VCE_STATUS_VCPU_REPORT_RB0_BUSY_MASK) -static bool vce_v3_0_check_soft_reset(void *handle) +static bool vce_v3_0_check_soft_reset(void *handle, struct amdgpu_job *job) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; u32 srbm_soft_reset = 0; diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 4a5523c3e75b..732d0d660c0f 100755 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -317,7 +317,7 @@ static int dm_wait_for_idle(void *handle) return 0; } -static bool dm_check_soft_reset(void *handle) +static bool dm_check_soft_reset(void *handle, struct amdgpu_job *job) { return false; } diff --git a/drivers/gpu/drm/amd/include/amd_shared.h b/drivers/gpu/drm/amd/include/amd_shared.h index f1a46d16f7ea..c202ebc2b231 100644 --- a/drivers/gpu/drm/amd/include/amd_shared.h +++ b/drivers/gpu/drm/amd/include/amd_shared.h @@ -287,7 +287,7 @@ struct amd_ip_funcs { int (*resume)(void *handle); bool (*is_idle)(void *handle); int (*wait_for_idle)(void *handle); - bool (*check_soft_reset)(void *handle); + bool (*check_soft_reset)(void *handle, struct amdgpu_job *job); int (*pre_soft_reset)(void *handle); int (*soft_reset)(void *handle); int (*post_soft_reset)(void *handle); -- 2.25.1