On 1/13/2025 7:12 AM, Jiang Liu wrote: > Enhance amdgpu_ras_pre_fini() to better support suspend/resume by: > 1) fix possible resource leakage. amdgpu_release_ras_context() only > kfree(con) but doesn't release resources associated with the con > object. > 2) call amdgpu_ras_pre_fini() in amdgpu_device_suspend() to undo what > has been done by amdgpu_ras_late_init(), because amdgpu_device_resume() > will invoke amdgpu_ras_late_init() on resume. > 3) move amdgpu_ras_recovery_fini() from amdgpu_ras_pre_fini() to > amdgpu_ras_fini() > 4) move calling of `obj->ras_fini()` from amdgpu_ras_fini() to > amdgpu_ras_pre_fini(). > > Signed-off-by: Jiang Liu <gerry@xxxxxxxxxxxxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 ++- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 44 +++++++++++++--------- > 2 files changed, 31 insertions(+), 19 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 0a121aab5c74..2bfe113e17c7 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -4613,6 +4613,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, > return 0; > > release_ras_con: > + amdgpu_ras_pre_fini(adev); > + amdgpu_ras_fini(adev); > if (amdgpu_sriov_vf(adev)) > amdgpu_virt_release_full_gpu(adev, true); > > @@ -4627,8 +4629,6 @@ int amdgpu_device_init(struct amdgpu_device *adev, > adev->virt.ops = NULL; > r = -EAGAIN; > } > - amdgpu_release_ras_context(adev); > - > failed: > amdgpu_vf_error_trans_all(adev); > > @@ -4921,6 +4921,8 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) > > cancel_delayed_work_sync(&adev->delayed_init_work); > > + /* disable ras feature must before hw fini */ > + amdgpu_ras_pre_fini(adev); > amdgpu_ras_suspend(adev); Based on the usages above, it makes more sense to keep amdgpu_ras_pre_fini as a static function and call in ras_fini/ras_suspend (contain the calls at ras layer and avoid another new public interface). Copying Tao to take a look. Thanks, Lijo > > amdgpu_device_ip_suspend_phase1(adev); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 7bbab7297c97..5ac63f9cffda 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -4270,42 +4270,49 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev) > int amdgpu_ras_pre_fini(struct amdgpu_device *adev) > { > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > + struct amdgpu_ras_block_list *node, *tmp; > + struct amdgpu_ras_block_object *obj; > > - if (!adev->ras_enabled || !con) > - return 0; > + if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_ras_telemetry_en(adev)) > + goto disable; > > + list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { > + obj = node->ras_obj; > + if (!obj) > + continue; > + > + if (!amdgpu_ras_is_supported(adev, obj->ras_comm.block)) > + continue; > + > + if (obj->ras_fini) > + obj->ras_fini(adev, &obj->ras_comm); > + else > + amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm); > + } > > +disable: > /* Need disable ras on all IPs here before ip [hw/sw]fini */ > - if (AMDGPU_RAS_GET_FEATURES(con->features)) > + if (con && AMDGPU_RAS_GET_FEATURES(con->features)) > amdgpu_ras_disable_all_features(adev, 0); > - amdgpu_ras_recovery_fini(adev); > + > return 0; > } > > int amdgpu_ras_fini(struct amdgpu_device *adev) > { > struct amdgpu_ras_block_list *ras_node, *tmp; > - struct amdgpu_ras_block_object *obj = NULL; > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > > if (!adev->ras_enabled || !con) > - return 0; > + goto out_free_context; > > list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) { > - if (ras_node->ras_obj) { > - obj = ras_node->ras_obj; > - if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) && > - obj->ras_fini) > - obj->ras_fini(adev, &obj->ras_comm); > - else > - amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm); > - } > - > /* Clear ras blocks from ras_list and free ras block list node */ > list_del(&ras_node->node); > kfree(ras_node); > } > > + amdgpu_ras_recovery_fini(adev); > amdgpu_ras_fs_fini(adev); > amdgpu_ras_interrupt_remove_all(adev); > > @@ -4323,8 +4330,11 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) > > cancel_delayed_work_sync(&con->ras_counte_delay_work); > > - amdgpu_ras_set_context(adev, NULL); > - kfree(con); > +out_free_context: > + if (con) { > + amdgpu_ras_set_context(adev, NULL); > + kfree(con); > + } > > return 0; > }