Enhance amdgpu_ras_pre_fini() to better support suspend/resume by: 1) fix possible resource leakage. amdgpu_release_ras_context() only calls kfree(con) but doesn't release resources associated with the con object. 2) call amdgpu_ras_pre_fini() in amdgpu_device_suspend() to undo what has been done by amdgpu_ras_late_init(), because amdgpu_device_resume() will invoke amdgpu_ras_late_init() on resume. 3) move amdgpu_ras_recovery_fini() from amdgpu_ras_pre_fini() to amdgpu_ras_fini(). 4) move calling of `obj->ras_fini()` from amdgpu_ras_fini() to amdgpu_ras_pre_fini(). Signed-off-by: Jiang Liu <gerry@xxxxxxxxxxxxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 45 ++++++++++++++-------- 2 files changed, 32 insertions(+), 19 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 602322bd7cb8..7ee8f9d73c3c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4599,6 +4599,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, return 0; release_ras_con: + amdgpu_ras_pre_fini(adev); + amdgpu_ras_fini(adev); if (amdgpu_sriov_vf(adev)) amdgpu_virt_release_full_gpu(adev, true); @@ -4613,8 +4615,6 @@ int amdgpu_device_init(struct amdgpu_device *adev, adev->virt.ops = NULL; r = -EAGAIN; } - amdgpu_release_ras_context(adev); - failed: amdgpu_vf_error_trans_all(adev); @@ -4875,6 +4875,8 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) cancel_delayed_work_sync(&adev->delayed_init_work); + /* ras features must be disabled before hw fini */ + amdgpu_ras_pre_fini(adev); amdgpu_ras_suspend(adev); amdgpu_device_ip_suspend_phase1(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 040969d56541..bc90035ee25b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -4064,42 +4064,50 @@ int
amdgpu_ras_late_init(struct amdgpu_device *adev) int amdgpu_ras_pre_fini(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct amdgpu_ras_block_list *node, *tmp; + struct amdgpu_ras_block_object *obj; - if (!adev->ras_enabled || !con) - return 0; + if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_ras_telemetry_en(adev)) + goto disable; + list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { + obj = node->ras_obj; + if (!obj) { + continue; + } + + if (!amdgpu_ras_is_supported(adev, obj->ras_comm.block)) + continue; + + if (obj->ras_fini) { + obj->ras_fini(adev, &obj->ras_comm); + } else + amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm); + } +disable: /* Need disable ras on all IPs here before ip [hw/sw]fini */ - if (AMDGPU_RAS_GET_FEATURES(con->features)) + if (con && AMDGPU_RAS_GET_FEATURES(con->features)) amdgpu_ras_disable_all_features(adev, 0); - amdgpu_ras_recovery_fini(adev); + return 0; } int amdgpu_ras_fini(struct amdgpu_device *adev) { struct amdgpu_ras_block_list *ras_node, *tmp; - struct amdgpu_ras_block_object *obj = NULL; struct amdgpu_ras *con = amdgpu_ras_get_context(adev); if (!adev->ras_enabled || !con) - return 0; + goto out_free_context; list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) { - if (ras_node->ras_obj) { - obj = ras_node->ras_obj; - if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) && - obj->ras_fini) - obj->ras_fini(adev, &obj->ras_comm); - else - amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm); - } - /* Clear ras blocks from ras_list and free ras block list node */ list_del(&ras_node->node); kfree(ras_node); } + amdgpu_ras_recovery_fini(adev); amdgpu_ras_fs_fini(adev); amdgpu_ras_interrupt_remove_all(adev); @@ -4117,8 +4125,11 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) cancel_delayed_work_sync(&con->ras_counte_delay_work); - amdgpu_ras_set_context(adev, NULL); - kfree(con); +out_free_context: + if (con) { + amdgpu_ras_set_context(adev, NULL); + 
kfree(con); + } return 0; } -- 2.43.5