On 10/9/2024 9:15 AM, jiadong.zhu@xxxxxxx wrote: > From: Jiadong Zhu <Jiadong.Zhu@xxxxxxx> > > Implement sdma soft reset by sending MSG_ResetSDMA on smu 13.0.6. > > v2: add firmware version for the reset message. > > Signed-off-by: Jiadong Zhu <Jiadong.Zhu@xxxxxxx> > --- > drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 15 +++++++++++++ > drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h | 1 + > drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 10 +++++++++ > drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 6 ++++++ > drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h | 3 ++- > .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 21 +++++++++++++++++++ > 6 files changed, 55 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c > index 9dc82f4d7c93..9e7a652d119b 100644 > --- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c > +++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c > @@ -700,6 +700,21 @@ int amdgpu_dpm_send_rma_reason(struct amdgpu_device *adev) > return ret; > } > > +int amdgpu_dpm_reset_sdma(struct amdgpu_device *adev, uint32_t inst_mask) > +{ > + struct smu_context *smu = adev->powerplay.pp_handle; > + int ret; > + > + if (!is_support_sw_smu(adev)) > + return -EOPNOTSUPP; > + > + mutex_lock(&adev->pm.mutex); > + ret = smu_reset_sdma(smu, inst_mask); > + mutex_unlock(&adev->pm.mutex); > + > + return ret; > +} > + > int amdgpu_dpm_get_dpm_freq_range(struct amdgpu_device *adev, > enum pp_clock_type type, > uint32_t *min, > diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h > index f5bf41f21c41..41fb6ef984bf 100644 > --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h > +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h > @@ -597,5 +597,6 @@ int amdgpu_dpm_set_pm_policy(struct amdgpu_device *adev, int policy_type, > int policy_level); > ssize_t amdgpu_dpm_get_pm_policy_info(struct amdgpu_device *adev, > enum pp_pm_policy p_type, char *buf); > +int amdgpu_dpm_reset_sdma(struct amdgpu_device *adev, uint32_t inst_mask); 
> > #endif > diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > index 4a6b4ad97f06..590d004046ef 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > @@ -3820,3 +3820,13 @@ int smu_send_rma_reason(struct smu_context *smu) > > return ret; > } > + > +int smu_reset_sdma(struct smu_context *smu, uint32_t inst_mask) > +{ > + int ret = 0; > + > + if (smu->ppt_funcs && smu->ppt_funcs->reset_sdma) > + ret = smu->ppt_funcs->reset_sdma(smu, inst_mask); > + > + return ret; > +} > diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h > index b44a185d07e8..5487d9d84a4d 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h > +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h > @@ -1371,6 +1371,11 @@ struct pptable_funcs { > */ > int (*send_rma_reason)(struct smu_context *smu); > > + /** > + * @reset_sdma: message SMU to soft reset sdma instance. > + */ > + int (*reset_sdma)(struct smu_context *smu, uint32_t inst_mask); > + > /** > * @get_ecc_table: message SMU to get ECC INFO table. 
> */ > @@ -1630,6 +1635,7 @@ void amdgpu_smu_stb_debug_fs_init(struct amdgpu_device *adev); > int smu_send_hbm_bad_pages_num(struct smu_context *smu, uint32_t size); > int smu_send_hbm_bad_channel_flag(struct smu_context *smu, uint32_t size); > int smu_send_rma_reason(struct smu_context *smu); > +int smu_reset_sdma(struct smu_context *smu, uint32_t inst_mask); > int smu_set_pm_policy(struct smu_context *smu, enum pp_pm_policy p_type, > int level); > ssize_t smu_get_pm_policy_info(struct smu_context *smu, > diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h > index e71a721c12b9..855eb57c734d 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h > +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h > @@ -275,7 +275,8 @@ > __SMU_DUMMY_MAP(RmaDueToBadPageThreshold), \ > __SMU_DUMMY_MAP(SelectPstatePolicy), \ > __SMU_DUMMY_MAP(MALLPowerController), \ > - __SMU_DUMMY_MAP(MALLPowerState), > + __SMU_DUMMY_MAP(MALLPowerState), \ > + __SMU_DUMMY_MAP(ResetSDMA), > > #undef __SMU_DUMMY_MAP > #define __SMU_DUMMY_MAP(type) SMU_MSG_##type > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c > index 52f3c537bb3f..42c38ced209c 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c > @@ -182,6 +182,7 @@ static const struct cmn2asic_msg_mapping smu_v13_0_6_message_map[SMU_MSG_MAX_COU > MSG_MAP(SelectPLPDMode, PPSMC_MSG_SelectPLPDMode, 0), > MSG_MAP(RmaDueToBadPageThreshold, PPSMC_MSG_RmaDueToBadPageThreshold, 0), > MSG_MAP(SelectPstatePolicy, PPSMC_MSG_SelectPstatePolicy, 0), > + MSG_MAP(ResetSDMA, PPSMC_MSG_ResetSDMA, 0), > }; > > // clang-format on > @@ -2697,6 +2698,25 @@ static int smu_v13_0_6_send_rma_reason(struct smu_context *smu) > return ret; > } > > +static int smu_v13_0_6_reset_sdma(struct smu_context *smu, uint32_t inst_mask) > +{ > + struct amdgpu_device *adev = 
smu->adev; > + int ret = 0; > + > + /* the message is only valid on dGPU with pmfw 85.116.110 and above */ > + if ((adev->flags & AMD_IS_APU) || smu->smc_fw_version < 0x0055746E) This will need an IP version check, as this file also supports 13.0.14, which has a different FW version. > + return 0; > + > + ret = smu_cmn_send_smc_msg_with_param(smu, > + SMU_MSG_ResetSDMA, inst_mask, NULL); > + if (ret) > + dev_err(smu->adev->dev, > + "[%s] failed to send ResetSDMA event to SMU\n", > + __func__); Mostly, we will be interested in seeing the SDMA instance mask for which the failure happened rather than a function name. The function name is not necessary. Thanks, Lijo > + > + return ret; > +} > + > static int mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable) > { > struct smu_context *smu = adev->powerplay.pp_handle; > @@ -3342,6 +3362,7 @@ static const struct pptable_funcs smu_v13_0_6_ppt_funcs = { > .i2c_fini = smu_v13_0_6_i2c_control_fini, > .send_hbm_bad_pages_num = smu_v13_0_6_smu_send_hbm_bad_page_num, > .send_rma_reason = smu_v13_0_6_send_rma_reason, > + .reset_sdma = smu_v13_0_6_reset_sdma, > }; > > void smu_v13_0_6_set_ppt_funcs(struct smu_context *smu)