On 2/17/2025 3:26 PM, Lazar, Lijo wrote: > > > On 2/12/2025 9:41 PM, Alex Deucher wrote: >> From: "chr[]" <chris@xxxxxxxxxxx> >> >> resume and irq handler happily races in set_power_state() >> >> * amdgpu_legacy_dpm_compute_clocks() needs lock >> * protect irq work handler >> * fix dpm_enabled usage >> >> Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/2524 >> Fixes: 3712e7a49459 ("drm/amd/pm: unified lock protections in amdgpu_dpm.c") >> Signed-off-by: chr[] <chris@xxxxxxxxxxx> >> Signed-off-by: Alex Deucher <alexander.deucher@xxxxxxx> >> --- >> drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c | 19 ++++++++++++++---- >> .../gpu/drm/amd/pm/legacy-dpm/legacy_dpm.c | 2 ++ >> drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c | 20 +++++++++++++++---- >> 3 files changed, 33 insertions(+), 8 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c b/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c >> index 67a8e22b1126d..db0d3d4dba685 100644 >> --- a/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c >> +++ b/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c >> @@ -3042,6 +3042,7 @@ static int kv_dpm_hw_init(struct amdgpu_ip_block *ip_block) >> if (!amdgpu_dpm) >> return 0; >> >> + mutex_lock(&adev->pm.mutex); >> kv_dpm_setup_asic(adev); >> ret = kv_dpm_enable(adev); >> if (ret) >> @@ -3049,6 +3050,8 @@ static int kv_dpm_hw_init(struct amdgpu_ip_block *ip_block) >> else >> adev->pm.dpm_enabled = true; >> amdgpu_legacy_dpm_compute_clocks(adev); >> + mutex_unlock(&adev->pm.mutex); >> + >> return ret; >> } >> >> @@ -3067,10 +3070,13 @@ static int kv_dpm_suspend(struct amdgpu_ip_block *ip_block) >> struct amdgpu_device *adev = ip_block->adev; >> >> if (adev->pm.dpm_enabled) { >> + mutex_lock(&adev->pm.mutex); >> + adev->pm.dpm_enabled = false; >> /* disable dpm */ >> kv_dpm_disable(adev); >> /* reset the power state */ >> adev->pm.dpm.current_ps = adev->pm.dpm.requested_ps = adev->pm.dpm.boot_ps; >> + mutex_unlock(&adev->pm.mutex); >> } >> return 0; >> } >> @@ -3080,18 +3086,23 @@ static int kv_dpm_resume(struct amdgpu_ip_block *ip_block) >> int ret; >> struct amdgpu_device *adev = ip_block->adev; >> >> - if (adev->pm.dpm_enabled) { >> + if (!amdgpu_dpm) >> + return 0; >> + >> + if (!adev->pm.dpm_enabled) { >> + mutex_lock(&adev->pm.mutex); >> /* asic init will reset to the boot state */ >> kv_dpm_setup_asic(adev); >> ret = kv_dpm_enable(adev); >> if (ret) >> adev->pm.dpm_enabled = false; > > Need braces here as well since else part is with braces. >> - else >> + else { >> adev->pm.dpm_enabled = true; >> - if (adev->pm.dpm_enabled) >> amdgpu_legacy_dpm_compute_clocks(adev); >> + } >> + mutex_unlock(&adev->pm.mutex); >> } >> - return 0; >> + return ret; > > Possibility of returning unintialized value. > >> } >> >> static bool kv_dpm_is_idle(void *handle) >> diff --git a/drivers/gpu/drm/amd/pm/legacy-dpm/legacy_dpm.c b/drivers/gpu/drm/amd/pm/legacy-dpm/legacy_dpm.c >> index e861355ebd75b..e06c25c945007 100644 >> --- a/drivers/gpu/drm/amd/pm/legacy-dpm/legacy_dpm.c >> +++ b/drivers/gpu/drm/amd/pm/legacy-dpm/legacy_dpm.c >> @@ -1012,6 +1012,7 @@ void amdgpu_dpm_thermal_work_handler(struct work_struct *work) >> if (!adev->pm.dpm_enabled) >> return; >> >> + mutex_lock(&adev->pm.mutex); > > I think taking this lock should happen before the dpm_enabled variable > check. Otherwise, it could happen that part of this sequence might still > go through after a kv/si_dpm_suspend. One more comment - probably it's worthwhile to cancel any scheduled amdgpu_dpm_thermal_work at the beginning of *_dpm_suspend. Thanks, Lijo > >> if (!pp_funcs->read_sensor(adev->powerplay.pp_handle, >> AMDGPU_PP_SENSOR_GPU_TEMP, >> (void *)&temp, >> @@ -1033,4 +1034,5 @@ void amdgpu_dpm_thermal_work_handler(struct work_struct *work) >> adev->pm.dpm.state = dpm_state; >> >> amdgpu_legacy_dpm_compute_clocks(adev->powerplay.pp_handle); >> + mutex_unlock(&adev->pm.mutex); >> } >> diff --git a/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c b/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c >> index a87dcf0974bc1..c40ec4f74afb0 100644 >> --- a/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c >> +++ b/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c >> @@ -7786,6 +7786,7 @@ static int si_dpm_hw_init(struct amdgpu_ip_block *ip_block) >> if (!amdgpu_dpm) >> return 0; >> >> + mutex_lock(&adev->pm.mutex); >> si_dpm_setup_asic(adev); >> ret = si_dpm_enable(adev); >> if (ret) >> @@ -7793,6 +7794,7 @@ static int si_dpm_hw_init(struct amdgpu_ip_block *ip_block) >> else >> adev->pm.dpm_enabled = true; >> amdgpu_legacy_dpm_compute_clocks(adev); >> + mutex_unlock(&adev->pm.mutex); >> return ret; >> } >> >> @@ -7811,11 +7813,15 @@ static int si_dpm_suspend(struct amdgpu_ip_block *ip_block) >> struct amdgpu_device *adev = ip_block->adev; >> >> if (adev->pm.dpm_enabled) { >> + mutex_lock(&adev->pm.mutex); >> + adev->pm.dpm_enabled = false; >> /* disable dpm */ >> si_dpm_disable(adev); >> /* reset the power state */ >> adev->pm.dpm.current_ps = adev->pm.dpm.requested_ps = adev->pm.dpm.boot_ps; >> + mutex_unlock(&adev->pm.mutex); >> } >> + >> return 0; >> } >> >> @@ -7824,18 +7830,24 @@ static int si_dpm_resume(struct amdgpu_ip_block *ip_block) >> int ret; >> struct amdgpu_device *adev = ip_block->adev; >> >> - if (adev->pm.dpm_enabled) { >> + if (!amdgpu_dpm) >> + return 0; >> + >> + if (!adev->pm.dpm_enabled) { >> /* asic init will reset to the boot state */ >> + mutex_lock(&adev->pm.mutex); >> si_dpm_setup_asic(adev); >> ret = si_dpm_enable(adev); >> if (ret) >> adev->pm.dpm_enabled = false; > > Same 'braces' comment. > >> - else >> + else { >> adev->pm.dpm_enabled = true; >> - if (adev->pm.dpm_enabled) >> amdgpu_legacy_dpm_compute_clocks(adev); >> + } >> + mutex_unlock(&adev->pm.mutex); >> } >> - return 0; >> + >> + return ret; > > Possibility of returning unintialized value. > > Thanks, > Lijo > >> } >> >> static bool si_dpm_is_idle(void *handle) >