On 9/29/2024 4:18 PM, jiadong.zhu@xxxxxxx wrote: > From: Jiadong Zhu <Jiadong.Zhu@xxxxxxx> > > Implement sdma soft reset by sending MSG_ResetSDMA on smu 13.0.6. > > Signed-off-by: Jiadong Zhu <Jiadong.Zhu@xxxxxxx> > --- > drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 15 +++++++++++++++ > drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h | 1 + > drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 10 ++++++++++ > drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 6 ++++++ > drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h | 3 ++- > .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 16 ++++++++++++++++ > 6 files changed, 50 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c > index 9dc82f4d7c93..9e7a652d119b 100644 > --- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c > +++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c > @@ -700,6 +700,21 @@ int amdgpu_dpm_send_rma_reason(struct amdgpu_device *adev) > return ret; > } > > +int amdgpu_dpm_reset_sdma(struct amdgpu_device *adev, uint32_t inst_mask) > +{ > + struct smu_context *smu = adev->powerplay.pp_handle; > + int ret; > + > + if (!is_support_sw_smu(adev)) > + return -EOPNOTSUPP; > + > + mutex_lock(&adev->pm.mutex); > + ret = smu_reset_sdma(smu, inst_mask); > + mutex_unlock(&adev->pm.mutex); > + > + return ret; > +} > + > int amdgpu_dpm_get_dpm_freq_range(struct amdgpu_device *adev, > enum pp_clock_type type, > uint32_t *min, > diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h > index f5bf41f21c41..41fb6ef984bf 100644 > --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h > +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h > @@ -597,5 +597,6 @@ int amdgpu_dpm_set_pm_policy(struct amdgpu_device *adev, int policy_type, > int policy_level); > ssize_t amdgpu_dpm_get_pm_policy_info(struct amdgpu_device *adev, > enum pp_pm_policy p_type, char *buf); > +int amdgpu_dpm_reset_sdma(struct amdgpu_device *adev, uint32_t inst_mask); > > #endif > diff --git 
a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > index 4a6b4ad97f06..590d004046ef 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > @@ -3820,3 +3820,13 @@ int smu_send_rma_reason(struct smu_context *smu) > > return ret; > } > + > +int smu_reset_sdma(struct smu_context *smu, uint32_t inst_mask) > +{ > + int ret = 0; > + > + if (smu->ppt_funcs && smu->ppt_funcs->reset_sdma) > + ret = smu->ppt_funcs->reset_sdma(smu, inst_mask); > + > + return ret; > +} > diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h > index b44a185d07e8..5487d9d84a4d 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h > +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h > @@ -1371,6 +1371,11 @@ struct pptable_funcs { > */ > int (*send_rma_reason)(struct smu_context *smu); > > + /** > + * @reset_sdma: message SMU to soft reset sdma instance. > + */ > + int (*reset_sdma)(struct smu_context *smu, uint32_t inst_mask); > + > /** > * @get_ecc_table: message SMU to get ECC INFO table. 
> */ > @@ -1630,6 +1635,7 @@ void amdgpu_smu_stb_debug_fs_init(struct amdgpu_device *adev); > int smu_send_hbm_bad_pages_num(struct smu_context *smu, uint32_t size); > int smu_send_hbm_bad_channel_flag(struct smu_context *smu, uint32_t size); > int smu_send_rma_reason(struct smu_context *smu); > +int smu_reset_sdma(struct smu_context *smu, uint32_t inst_mask); > int smu_set_pm_policy(struct smu_context *smu, enum pp_pm_policy p_type, > int level); > ssize_t smu_get_pm_policy_info(struct smu_context *smu, > diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h > index e71a721c12b9..855eb57c734d 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h > +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h > @@ -275,7 +275,8 @@ > __SMU_DUMMY_MAP(RmaDueToBadPageThreshold), \ > __SMU_DUMMY_MAP(SelectPstatePolicy), \ > __SMU_DUMMY_MAP(MALLPowerController), \ > - __SMU_DUMMY_MAP(MALLPowerState), > + __SMU_DUMMY_MAP(MALLPowerState), \ > + __SMU_DUMMY_MAP(ResetSDMA), > > #undef __SMU_DUMMY_MAP > #define __SMU_DUMMY_MAP(type) SMU_MSG_##type > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c > index 52f3c537bb3f..6ca9bc6660cb 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c > @@ -182,6 +182,7 @@ static const struct cmn2asic_msg_mapping smu_v13_0_6_message_map[SMU_MSG_MAX_COU > MSG_MAP(SelectPLPDMode, PPSMC_MSG_SelectPLPDMode, 0), > MSG_MAP(RmaDueToBadPageThreshold, PPSMC_MSG_RmaDueToBadPageThreshold, 0), > MSG_MAP(SelectPstatePolicy, PPSMC_MSG_SelectPstatePolicy, 0), > + MSG_MAP(ResetSDMA, PPSMC_MSG_ResetSDMA, 0), > }; > > // clang-format on > @@ -2697,6 +2698,20 @@ static int smu_v13_0_6_send_rma_reason(struct smu_context *smu) > return ret; > } > > +static int smu_v13_0_6_reset_sdma(struct smu_context *smu, uint32_t inst_mask) > +{ > + int ret = 0; > + > + ret = 
smu_cmn_send_smc_msg_with_param(smu, > + SMU_MSG_ResetSDMA, inst_mask, NULL); This will need a check of the FW version and IP version. Not all FWs support the reset, and this file also handles other SMU 13.0.x IP versions. Without such a check, the error message below will always show up even when the reset is unsupported, which could be misleading. Thanks, Lijo > + if (ret) > + dev_err(smu->adev->dev, > + "[%s] failed to send ResetSDMA event to SMU\n", > + __func__); > + > + return ret; > +} > + > static int mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable) > { > struct smu_context *smu = adev->powerplay.pp_handle; > @@ -3342,6 +3357,7 @@ static const struct pptable_funcs smu_v13_0_6_ppt_funcs = { > .i2c_fini = smu_v13_0_6_i2c_control_fini, > .send_hbm_bad_pages_num = smu_v13_0_6_smu_send_hbm_bad_page_num, > .send_rma_reason = smu_v13_0_6_send_rma_reason, > + .reset_sdma = smu_v13_0_6_reset_sdma, > }; > > void smu_v13_0_6_set_ppt_funcs(struct smu_context *smu)