On 1/13/2025 7:12 AM, Jiang Liu wrote: > Make the device state machine work in stack like way to better support > suspend/resume by following changes: > > 1. amdgpu_driver_load_kms() > amdgpu_device_init() > amdgpu_device_ip_early_init() > ip_blocks[i].early_init() > ip_blocks[i].status.valid = true > amdgpu_device_ip_init() > amdgpu_ras_init() > ip_blocks[i].sw_init() > ip_blocks[i].status.sw = true > ip_blocks[i].hw_init() > ip_blocks[i].status.hw = true > amdgpu_device_ip_late_init() > ip_blocks[i].late_init() > ip_blocks[i].status.late_initialized = true > amdgpu_ras_late_init() > ras_blocks[i].ras_late_init() > amdgpu_ras_feature_enable_on_boot() > > 2. amdgpu_pmops_suspend()/amdgpu_pmops_freeze()/amdgpu_pmops_poweroff() > amdgpu_device_suspend() > amdgpu_ras_early_fini() > ras_blocks[i].ras_early_fini() > amdgpu_ras_feature_disable() > amdgpu_ras_suspend() > amdgpu_ras_disable_all_features() > +++ ip_blocks[i].early_fini() > +++ ip_blocks[i].status.late_initialized = false As said in the previous patch, please don't add confusion. You could maintain a state machine like early fini done/late fini done etc, but please don't introduce this kind of confusing things. Thanks, Lijo > ip_blocks[i].suspend() > > 3. amdgpu_pmops_resume()/amdgpu_pmops_thaw()/amdgpu_pmops_restore() > amdgpu_device_resume() > amdgpu_device_ip_resume() > ip_blocks[i].resume() > amdgpu_device_ip_late_init() > ip_blocks[i].late_init() > ip_blocks[i].status.late_initialized = true > amdgpu_ras_late_init() > ras_blocks[i].ras_late_init() > amdgpu_ras_feature_enable_on_boot() > amdgpu_ras_resume() > amdgpu_ras_enable_all_features() > > 4. amdgpu_driver_unload_kms() > amdgpu_device_fini_hw() > amdgpu_ras_early_fini() > ras_blocks[i].ras_early_fini() > +++ ip_blocks[i].early_fini() > +++ ip_blocks[i].status.late_initialized = false > ip_blocks[i].hw_fini() > ip_blocks[i].status.hw = false > > 5. amdgpu_driver_release_kms() > amdgpu_device_fini_sw() > amdgpu_device_ip_fini() > ip_blocks[i].sw_fini() > ip_blocks[i].status.sw = false > --- ip_blocks[i].status.valid = false > +++ amdgpu_ras_fini() > ip_blocks[i].late_fini() > +++ ip_blocks[i].status.valid = false > --- ip_blocks[i].status.late_initialized = false > --- amdgpu_ras_fini() > > The main changes include: > 1) invoke ip_blocks[i].early_fini in amdgpu_pmops_suspend(). > 2) set ip_blocks[i].status.late_initialized to false after calling > callback `early_fini`. We have auditted all usages of the > late_initialized flag and no functional changes found. > 3) only set ip_blocks[i].status.valid = false after calling the > `late_fini` callback. > 4) call amdgpu_ras_fini() before invoking ip_blocks[i].late_fini. > > There's one more task left to analyze GPU reset related state machine > transitions. > > Signed-off-by: Jiang Liu <gerry@xxxxxxxxxxxxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 22 ++++++++++++++++++++-- > 1 file changed, 20 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 6b503fb7e366..c2e4057ecd82 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -3449,6 +3449,8 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev) > adev->ip_blocks[i].status.sw = false; > } > > + amdgpu_ras_fini(adev); > + > for (i = adev->num_ip_blocks - 1; i >= 0; i--) { > if (!adev->ip_blocks[i].status.valid) > continue; > @@ -3457,8 +3459,6 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev) > adev->ip_blocks[i].status.valid = false; > } > > - amdgpu_ras_fini(adev); > - > return 0; > } > > @@ -3516,6 +3516,24 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) > if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) > dev_warn(adev->dev, "Failed to disallow df cstate"); > > + for (i = adev->num_ip_blocks - 1; i >= 0; i--) { > + if (!adev->ip_blocks[i].status.valid) > + continue; > + if (!adev->ip_blocks[i].status.late_initialized) > + continue; > + > + if (adev->ip_blocks[i].version->funcs->early_fini) { > + r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); > + if (r) { > + DRM_ERROR(" of IP block <%s> failed %d\n", > + adev->ip_blocks[i].version->funcs->name, r); > + return r; > + } > + } > + > + adev->ip_blocks[i].status.late_initialized = false; > + } > + > for (i = adev->num_ip_blocks - 1; i >= 0; i--) { > if (!adev->ip_blocks[i].status.valid) > continue;