Check the GPU status first: if MC/VMC/DISPLAY hangs, directly trigger a full reset. If an engine hangs, trigger an engine soft reset; if the soft reset fails, fall back to a full reset. Change-Id: I6f946db3624cd950e11e669f5dc80be58dad4711 Signed-off-by: Chunming Zhou <David1.Zhou at amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 ++++++++++++++++++++++++----- drivers/gpu/drm/amd/include/amd_shared.h | 2 + 2 files changed, 87 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index c5b01df..3dd1467 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1975,7 +1975,8 @@ int amdgpu_pre_soft_reset(struct amdgpu_device *adev) for (i = 0; i < adev->num_ip_blocks; i++) { if (!adev->ip_block_status[i].valid) continue; - if (adev->ip_blocks[i].funcs->pre_soft_reset) { + if (adev->ip_block_status[i].hang && + adev->ip_blocks[i].funcs->pre_soft_reset) { r = adev->ip_blocks[i].funcs->pre_soft_reset(adev); if (r) return r; @@ -1985,6 +1986,58 @@ int amdgpu_pre_soft_reset(struct amdgpu_device *adev) return 0; } +static bool amdgpu_need_full_reset(struct amdgpu_device *adev) +{ + if (adev->ip_block_status[AMD_IP_BLOCK_TYPE_GMC].hang || + adev->ip_block_status[AMD_IP_BLOCK_TYPE_IH].hang || + adev->ip_block_status[AMD_IP_BLOCK_TYPE_SMC].hang || + adev->ip_block_status[AMD_IP_BLOCK_TYPE_GFX].hang || + adev->ip_block_status[AMD_IP_BLOCK_TYPE_SDMA].hang || + adev->ip_block_status[AMD_IP_BLOCK_TYPE_UVD].hang || + adev->ip_block_status[AMD_IP_BLOCK_TYPE_VCE].hang || + adev->ip_block_status[AMD_IP_BLOCK_TYPE_ACP].hang || + adev->ip_block_status[AMD_IP_BLOCK_TYPE_DCE].hang) { + DRM_INFO("Some block need full reset!\n"); + return true; + } + return false; +} + +static int amdgpu_soft_reset(struct amdgpu_device *adev) +{ + int i, r = 0; + + for (i = 0; i < adev->num_ip_blocks; i++) { + if (!adev->ip_block_status[i].valid) + continue; + if 
(adev->ip_block_status[i].hang && + adev->ip_blocks[i].funcs->soft_reset) { + r = adev->ip_blocks[i].funcs->soft_reset(adev); + if (r) + return r; + } + } + + return 0; +} + +static int amdgpu_post_soft_reset(struct amdgpu_device *adev) +{ + int i, r = 0; + + for (i = 0; i < adev->num_ip_blocks; i++) { + if (!adev->ip_block_status[i].valid) + continue; + if (adev->ip_block_status[i].hang && + adev->ip_blocks[i].funcs->post_soft_reset) + r = adev->ip_blocks[i].funcs->post_soft_reset(adev); + if (r) + return r; + } + + return 0; +} + /** * amdgpu_gpu_reset - reset the asic * @@ -1998,6 +2051,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev) struct drm_atomic_state *state = NULL; int i, r; int resched; + bool need_full_reset; if (!amdgpu_check_soft_reset(adev)) { DRM_INFO("No hardware hang detected. Did some blocks stall?\n"); @@ -2024,27 +2078,41 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev) if (amdgpu_device_has_dal_support(adev)) state = drm_atomic_helper_suspend(adev->ddev); - /* save scratch */ - amdgpu_atombios_scratch_regs_save(adev); - r = amdgpu_suspend(adev); + need_full_reset = amdgpu_need_full_reset(adev); -retry: - /* Disable fb access */ - if (adev->mode_info.num_crtc) { - struct amdgpu_mode_mc_save save; - amdgpu_display_stop_mc_access(adev, &save); + if (!need_full_reset) { + amdgpu_pre_soft_reset(adev); + r = amdgpu_soft_reset(adev); + amdgpu_post_soft_reset(adev); + if (r || amdgpu_check_soft_reset(adev)) { + DRM_INFO("soft reset failed, will fallback to full reset!\n"); + need_full_reset = true; + } } - r = amdgpu_asic_reset(adev); - /* post card */ - amdgpu_atom_asic_init(adev->mode_info.atom_context); + if (need_full_reset) { + /* save scratch */ + amdgpu_atombios_scratch_regs_save(adev); + r = amdgpu_suspend(adev); - if (!r) { - dev_info(adev->dev, "GPU reset succeeded, trying to resume\n"); - r = amdgpu_resume(adev); +retry: + /* Disable fb access */ + if (adev->mode_info.num_crtc) { + struct amdgpu_mode_mc_save save; + 
amdgpu_display_stop_mc_access(adev, &save); + } + + r = amdgpu_asic_reset(adev); + /* post card */ + amdgpu_atom_asic_init(adev->mode_info.atom_context); + + if (!r) { + dev_info(adev->dev, "GPU reset succeeded, trying to resume\n"); + r = amdgpu_resume(adev); + } + /* restore scratch */ + amdgpu_atombios_scratch_regs_restore(adev); } - /* restore scratch */ - amdgpu_atombios_scratch_regs_restore(adev); if (!r) { r = amdgpu_ib_ring_tests(adev); if (r) { diff --git a/drivers/gpu/drm/amd/include/amd_shared.h b/drivers/gpu/drm/amd/include/amd_shared.h index 7104085..d938c2a 100644 --- a/drivers/gpu/drm/amd/include/amd_shared.h +++ b/drivers/gpu/drm/amd/include/amd_shared.h @@ -174,6 +174,8 @@ struct amd_ip_funcs { int (*pre_soft_reset)(void *handle); /* soft reset the IP block */ int (*soft_reset)(void *handle); + /* post soft reset the IP block */ + int (*post_soft_reset)(void *handle); /* enable/disable cg for the IP block */ int (*set_clockgating_state)(void *handle, enum amd_clockgating_state state); -- 1.9.1