When GFX RAS poison consumption triggers a GPU reset on GFX v11_0_3, the reset sequence is "soft reset -> mode2 reset -> mode1 reset": if one reset method fails, the driver falls back to the next method in the sequence. Signed-off-by: YiPeng Chai <YiPeng.Chai@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 40 ++++++++++++++++------ 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index a5086be4d7dd..c8d2a281098f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4770,13 +4770,6 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, if (job && job->vm) drm_sched_increase_karma(&job->base); - r = amdgpu_reset_prepare_hwcontext(adev, reset_context); - /* If reset handler not implemented, continue; otherwise return */ - if (r == -ENOSYS) - r = 0; - else - return r; - /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ if (!amdgpu_sriov_vf(adev)) { @@ -4789,12 +4782,23 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, r = amdgpu_device_ip_soft_reset(adev); amdgpu_device_ip_post_soft_reset(adev); if (r || amdgpu_device_ip_check_soft_reset(adev)) { - dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + if (ras->reset_by_gfx_poison) { + reset_context->method = AMD_RESET_METHOD_MODE2; + dev_info(adev->dev, "soft reset failed, will fallback to mode2 reset!\n"); + } else { + dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); + } need_full_reset = true; } } - if (need_full_reset) + /* IP suspend will affect mode2 reset, so ip suspend is skipped + * when mode2 reset is enabled. 
+ */ + if (need_full_reset && + (reset_context->method != AMD_RESET_METHOD_MODE2)) r = amdgpu_device_ip_suspend(adev); if (need_full_reset) set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); @@ -4803,6 +4807,11 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, &reset_context->flags); } + r = amdgpu_reset_prepare_hwcontext(adev, reset_context); + /* If reset handler not implemented, continue; otherwise return */ + if (r == -ENOSYS) + r = 0; + return r; } @@ -4892,7 +4901,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, /* If reset handler not implemented, continue; otherwise return */ if (r == -ENOSYS) r = 0; - else + else if (!r) /* Mode2 reset successful, return */ return r; /* Reset handler not implemented, use the default method */ @@ -4904,6 +4913,17 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); + /* If mode2 reset is enabled, ip suspend is skipped in previous + * amdgpu_device_pre_asic_reset function. but for mode1 reset, + * ip suspend must be called. + */ + if (need_full_reset && + (reset_context->method == AMD_RESET_METHOD_MODE2)) { + list_for_each_entry(tmp_adev, device_list_handle, reset_list) { + amdgpu_device_ip_suspend(tmp_adev); + } + } + /* * ASIC reset has to be done on all XGMI hive nodes ASAP * to allow proper links negotiation in FW (within 1 sec) -- 2.34.1