[AMD Official Use Only] > -----Original Message----- > From: Lazar, Lijo <Lijo.Lazar@xxxxxxx> > Sent: Monday, February 14, 2022 12:04 PM > To: Quan, Evan <Evan.Quan@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; > rui.huang@xxxxxxx > Subject: Re: [PATCH 07/12] drm/amd/pm: correct the checks for granting gpu > reset APIs > > > > On 2/11/2022 1:22 PM, Evan Quan wrote: > > Those gpu reset APIs can be granted when: > > - System is up and dpm features are enabled. > > - System is under resuming and dpm features are not yet enabled. > > Under such scenario, the PMFW is already alive and can support > > those gpu reset functionalities. > > > > Signed-off-by: Evan Quan <evan.quan@xxxxxxx> > > Change-Id: I8c2f07138921eb53a2bd7fb94f9b3622af0eacf8 > > --- > > .../gpu/drm/amd/include/kgd_pp_interface.h | 1 + > > drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 34 +++++++++++++++ > > .../gpu/drm/amd/pm/powerplay/amd_powerplay.c | 42 > +++++++++++++++---- > > .../drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c | 1 + > > .../drm/amd/pm/powerplay/hwmgr/smu8_hwmgr.c | 17 ++++++++ > > drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h | 1 + > > drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 32 +++++++------- > > 7 files changed, 101 insertions(+), 27 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/include/kgd_pp_interface.h > > b/drivers/gpu/drm/amd/include/kgd_pp_interface.h > > index a4c267f15959..892648a4a353 100644 > > --- a/drivers/gpu/drm/amd/include/kgd_pp_interface.h > > +++ b/drivers/gpu/drm/amd/include/kgd_pp_interface.h > > @@ -409,6 +409,7 @@ struct amd_pm_funcs { > > struct dpm_clocks *clock_table); > > int (*get_smu_prv_buf_details)(void *handle, void **addr, size_t > *size); > > void (*pm_compute_clocks)(void *handle); > > + bool (*is_smc_alive)(void *handle); > > }; > > > > struct metrics_table_header { > > diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c > > b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c > > index b46ae0063047..5f1d3342f87b 100644 > > --- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c > > +++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c > > @@ -120,12 +120,25 @@ int > amdgpu_dpm_set_powergating_by_smu(struct amdgpu_device *adev, > uint32_t block > > return ret; > > } > > > > +static bool amdgpu_dpm_is_smc_alive(struct amdgpu_device *adev) { > > + const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs; > > + > > + if (!pp_funcs || !pp_funcs->is_smc_alive) > > + return false; > > + > > + return pp_funcs->is_smc_alive; > > +} > > + > > int amdgpu_dpm_baco_enter(struct amdgpu_device *adev) > > { > > const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs; > > void *pp_handle = adev->powerplay.pp_handle; > > int ret = 0; > > > > + if (!amdgpu_dpm_is_smc_alive(adev)) > > + return -EOPNOTSUPP; > > + > > if (!pp_funcs || !pp_funcs->set_asic_baco_state) > > return -ENOENT; > > > > @@ -145,6 +158,9 @@ int amdgpu_dpm_baco_exit(struct amdgpu_device > *adev) > > void *pp_handle = adev->powerplay.pp_handle; > > int ret = 0; > > > > + if (!amdgpu_dpm_is_smc_alive(adev)) > > + return -EOPNOTSUPP; > > + > > if (!pp_funcs || !pp_funcs->set_asic_baco_state) > > return -ENOENT; > > > > @@ -164,6 +180,9 @@ int amdgpu_dpm_set_mp1_state(struct > amdgpu_device *adev, > > int ret = 0; > > const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs; > > > > + if (!amdgpu_dpm_is_smc_alive(adev)) > > + return -EOPNOTSUPP; > > + > > if (pp_funcs && pp_funcs->set_mp1_state) { > > mutex_lock(&adev->pm.mutex); > > > > @@ -184,6 +203,9 @@ bool amdgpu_dpm_is_baco_supported(struct > amdgpu_device *adev) > > bool baco_cap; > > int ret = 0; > > > > + if (!amdgpu_dpm_is_smc_alive(adev)) > > + return false; > > + > > if (!pp_funcs || !pp_funcs->get_asic_baco_capability) > > return false; > > > > @@ -203,6 +225,9 @@ int amdgpu_dpm_mode2_reset(struct > amdgpu_device *adev) > > void *pp_handle = adev->powerplay.pp_handle; > > int ret = 0; > > > > + if (!amdgpu_dpm_is_smc_alive(adev)) > > + return -EOPNOTSUPP; > > + > > if (!pp_funcs || !pp_funcs->asic_reset_mode_2) > > return -ENOENT; > > > > @@ -221,6 +246,9 @@ int amdgpu_dpm_baco_reset(struct > amdgpu_device *adev) > > void *pp_handle = adev->powerplay.pp_handle; > > int ret = 0; > > > > + if (!amdgpu_dpm_is_smc_alive(adev)) > > + return -EOPNOTSUPP; > > + > > if (!pp_funcs || !pp_funcs->set_asic_baco_state) > > return -ENOENT; > > > > @@ -244,6 +272,9 @@ bool > amdgpu_dpm_is_mode1_reset_supported(struct amdgpu_device *adev) > > struct smu_context *smu = adev->powerplay.pp_handle; > > bool support_mode1_reset = false; > > > > + if (!amdgpu_dpm_is_smc_alive(adev)) > > + return false; > > + > > if (is_support_sw_smu(adev)) { > > mutex_lock(&adev->pm.mutex); > > support_mode1_reset = > smu_mode1_reset_is_support(smu); @@ -258,6 > > +289,9 @@ int amdgpu_dpm_mode1_reset(struct amdgpu_device *adev) > > struct smu_context *smu = adev->powerplay.pp_handle; > > int ret = -EOPNOTSUPP; > > > > + if (!amdgpu_dpm_is_smc_alive(adev)) > > + return -EOPNOTSUPP; > > + > > if (is_support_sw_smu(adev)) { > > mutex_lock(&adev->pm.mutex); > > ret = smu_mode1_reset(smu); > > diff --git a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c > > b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c > > index bba923cfe08c..4c709f7bcd51 100644 > > --- a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c > > +++ b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c > > @@ -844,9 +844,6 @@ static int pp_dpm_set_mp1_state(void *handle, > enum pp_mp1_state mp1_state) > > if (!hwmgr) > > return -EINVAL; > > > > - if (!hwmgr->pm_en) > > - return 0; > > - > > if (hwmgr->hwmgr_func->set_mp1_state) > > return hwmgr->hwmgr_func->set_mp1_state(hwmgr, > mp1_state); > > > > @@ -1305,8 +1302,7 @@ static int pp_get_asic_baco_capability(void > *handle, bool *cap) > > if (!hwmgr) > > return -EINVAL; > > > > - if (!(hwmgr->not_vf && amdgpu_dpm) || > > - !hwmgr->hwmgr_func->get_asic_baco_capability) > > + if (!hwmgr->hwmgr_func->get_asic_baco_capability) > > return 0; > > > > hwmgr->hwmgr_func->get_asic_baco_capability(hwmgr, cap); @@ - > 1321,7 > > +1317,7 @@ static int pp_get_asic_baco_state(void *handle, int *state) > > if (!hwmgr) > > return -EINVAL; > > > > - if (!hwmgr->pm_en || !hwmgr->hwmgr_func->get_asic_baco_state) > > + if (!hwmgr->hwmgr_func->get_asic_baco_state) > > return 0; > > > > hwmgr->hwmgr_func->get_asic_baco_state(hwmgr, (enum > BACO_STATE > > *)state); @@ -1336,8 +1332,7 @@ static int pp_set_asic_baco_state(void > *handle, int state) > > if (!hwmgr) > > return -EINVAL; > > > > - if (!(hwmgr->not_vf && amdgpu_dpm) || > > - !hwmgr->hwmgr_func->set_asic_baco_state) > > + if (!hwmgr->hwmgr_func->set_asic_baco_state) > > return 0; > > > > hwmgr->hwmgr_func->set_asic_baco_state(hwmgr, (enum > > BACO_STATE)state); @@ -1379,7 +1374,7 @@ static int > pp_asic_reset_mode_2(void *handle) > > { > > struct pp_hwmgr *hwmgr = handle; > > > > - if (!hwmgr || !hwmgr->pm_en) > > + if (!hwmgr) > > return -EINVAL; > > > > if (hwmgr->hwmgr_func->asic_reset == NULL) { @@ -1517,6 > +1512,34 @@ > > static void pp_pm_compute_clocks(void *handle) > > NULL); > > } > > > > +/* MP Apertures */ > > +#define MP1_Public 0x03b00000 > > +#define smnMP1_FIRMWARE_FLAGS > 0x3010028 > > +#define MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK > 0x00000001L > > + > > +static bool pp_is_smc_alive(void *handle) { > > + struct pp_hwmgr *hwmgr = handle; > > + struct amdgpu_device *adev = hwmgr->adev; > > + uint32_t mp1_fw_flags; > > + > > + /* > > + * If some ASIC(e.g. smu7/smu8) needs special handling for > > + * checking smc alive, it should have its own implementation > > + * for ->is_smc_alive. > > + */ > > + if (hwmgr->hwmgr_func->is_smc_alive) > > + return hwmgr->hwmgr_func->is_smc_alive(hwmgr); > > + > > + mp1_fw_flags = RREG32_PCIE(MP1_Public | > > + (smnMP1_FIRMWARE_FLAGS & 0xffffffff)); > > + > > The flags check doesn't tell whether PMFW is hung or not. It is a minimal > thing that is set after PMFW boot. To call the API this condition is necessary in > an implicit way. Driver always check this on boot, if not driver aborts smu init. > > So better thing is to go ahead and send the message without any check, it will > tell the result whether PMFW is really working or not. > > In short this API is not needed. [Quan, Evan] It was not designed to cover "PMFW hung". Instead, it was designed to be support early phase of post-silicon bringup. At that time, the SMU may be not enabled/up. We need to prevent this API from wrongly called. BR Evan > > Thanks, > Lijo > > > + if (mp1_fw_flags & > MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK) > > + return true; > > + > > + return false; > > +} > > + > > static const struct amd_pm_funcs pp_dpm_funcs = { > > .load_firmware = pp_dpm_load_fw, > > .wait_for_fw_loading_complete = pp_dpm_fw_loading_complete, > @@ > > -1582,4 +1605,5 @@ static const struct amd_pm_funcs pp_dpm_funcs = { > > .gfx_state_change_set = pp_gfx_state_change_set, > > .get_smu_prv_buf_details = pp_get_prv_buffer_details, > > .pm_compute_clocks = pp_pm_compute_clocks, > > + .is_smc_alive = pp_is_smc_alive, > > }; > > diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c > > b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c > > index a1e11037831a..118039b96524 100644 > > --- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c > > +++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c > > @@ -5735,6 +5735,7 @@ static const struct pp_hwmgr_func > smu7_hwmgr_funcs = { > > .get_asic_baco_state = smu7_baco_get_state, > > .set_asic_baco_state = smu7_baco_set_state, > > .power_off_asic = smu7_power_off_asic, > > + .is_smc_alive = smu7_is_smc_ram_running, > > }; > > > > uint8_t smu7_get_sleep_divider_id_from_clock(uint32_t clock, diff > > --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu8_hwmgr.c > > b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu8_hwmgr.c > > index b50fd4a4a3d1..fc4d58329f6d 100644 > > --- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu8_hwmgr.c > > +++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu8_hwmgr.c > > @@ -2015,6 +2015,22 @@ static void smu8_dpm_powergate_vce(struct > pp_hwmgr *hwmgr, bool bgate) > > } > > } > > > > +#define ixMP1_FIRMWARE_FLAGS > 0x3008210 > > +#define MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK > 0x00000001L > > + > > +static bool smu8_is_smc_running(struct pp_hwmgr *hwmgr) { > > + struct amdgpu_device *adev = hwmgr->adev; > > + uint32_t mp1_fw_flags; > > + > > + mp1_fw_flags = RREG32_SMC(ixMP1_FIRMWARE_FLAGS); > > + > > + if (mp1_fw_flags & > MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK) > > + return true; > > + > > + return false; > > +} > > + > > static const struct pp_hwmgr_func smu8_hwmgr_funcs = { > > .backend_init = smu8_hwmgr_backend_init, > > .backend_fini = smu8_hwmgr_backend_fini, @@ -2047,6 +2063,7 > @@ > > static const struct pp_hwmgr_func smu8_hwmgr_funcs = { > > .dynamic_state_management_disable = smu8_disable_dpm_tasks, > > .notify_cac_buffer_info = smu8_notify_cac_buffer_info, > > .get_thermal_temperature_range = > > smu8_get_thermal_temperature_range, > > + .is_smc_alive = smu8_is_smc_running, > > }; > > > > int smu8_init_function_pointers(struct pp_hwmgr *hwmgr) diff --git > > a/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h > > b/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h > > index 4f7f2f455301..790fc387752c 100644 > > --- a/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h > > +++ b/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h > > @@ -364,6 +364,7 @@ struct pp_hwmgr_func { > > bool disable); > > ssize_t (*get_gpu_metrics)(struct pp_hwmgr *hwmgr, void **table); > > int (*gfx_state_change)(struct pp_hwmgr *hwmgr, uint32_t state); > > + bool (*is_smc_alive)(struct pp_hwmgr *hwmgr); > > }; > > > > struct pp_table_func { > > diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > > b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > > index 8b8feaf7aa0e..27a453fb4db7 100644 > > --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > > +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > > @@ -1845,9 +1845,6 @@ static int smu_set_mp1_state(void *handle, > > struct smu_context *smu = handle; > > int ret = 0; > > > > - if (!smu->pm_enabled) > > - return -EOPNOTSUPP; > > - > > if (smu->ppt_funcs && > > smu->ppt_funcs->set_mp1_state) > > ret = smu->ppt_funcs->set_mp1_state(smu, mp1_state); > @@ -2513,9 > > +2510,6 @@ static int smu_get_baco_capability(void *handle, bool *cap) > > > > *cap = false; > > > > - if (!smu->pm_enabled) > > - return 0; > > - > > if (smu->ppt_funcs && smu->ppt_funcs->baco_is_support) > > *cap = smu->ppt_funcs->baco_is_support(smu); > > > > @@ -2527,9 +2521,6 @@ static int smu_baco_set_state(void *handle, int > state) > > struct smu_context *smu = handle; > > int ret = 0; > > > > - if (!smu->pm_enabled) > > - return -EOPNOTSUPP; > > - > > if (state == 0) { > > if (smu->ppt_funcs->baco_exit) > > ret = smu->ppt_funcs->baco_exit(smu); @@ -2551,9 > +2542,6 @@ bool > > smu_mode1_reset_is_support(struct smu_context *smu) > > { > > bool ret = false; > > > > - if (!smu->pm_enabled) > > - return false; > > - > > if (smu->ppt_funcs && smu->ppt_funcs->mode1_reset_is_support) > > ret = smu->ppt_funcs->mode1_reset_is_support(smu); > > > > @@ -2564,9 +2552,6 @@ int smu_mode1_reset(struct smu_context *smu) > > { > > int ret = 0; > > > > - if (!smu->pm_enabled) > > - return -EOPNOTSUPP; > > - > > if (smu->ppt_funcs->mode1_reset) > > ret = smu->ppt_funcs->mode1_reset(smu); > > > > @@ -2578,9 +2563,6 @@ static int smu_mode2_reset(void *handle) > > struct smu_context *smu = handle; > > int ret = 0; > > > > - if (!smu->pm_enabled) > > - return -EOPNOTSUPP; > > - > > if (smu->ppt_funcs->mode2_reset) > > ret = smu->ppt_funcs->mode2_reset(smu); > > > > @@ -2712,6 +2694,19 @@ static int smu_get_prv_buffer_details(void > *handle, void **addr, size_t *size) > > return 0; > > } > > > > +static bool smu_is_smc_alive(void *handle) { > > + struct smu_context *smu = handle; > > + > > + if (!smu->ppt_funcs->check_fw_status) > > + return false; > > + > > + if (!smu->ppt_funcs->check_fw_status(smu)) > > + return true; > > + > > + return false; > > +} > > + > > static const struct amd_pm_funcs swsmu_pm_funcs = { > > /* export for sysfs */ > > .set_fan_control_mode = smu_set_fan_control_mode, > > @@ -2765,6 +2760,7 @@ static const struct amd_pm_funcs > swsmu_pm_funcs = { > > .get_uclk_dpm_states = smu_get_uclk_dpm_states, > > .get_dpm_clock_table = smu_get_dpm_clock_table, > > .get_smu_prv_buf_details = smu_get_prv_buffer_details, > > + .is_smc_alive = smu_is_smc_alive, > > }; > > > > int smu_wait_for_event(struct smu_context *smu, enum > smu_event_type > > event, > >