On 7/8/2024 7:01 PM, Danijel Slivka wrote: > Why: > If the reg mmMP1_SMN_C2PMSG_90 is being written to during amdgpu driver > load or driver unload, subsequent amdgpu driver load will fail at > smu_hw_init. The default of mmMP1_SMN_C2PMSG_90 register at a clean > environment is 0x1 and if value differs from expected, amdgpu driver > load will fail. > > How to fix: > Ignore the initial value in smu response register before the first smu > message is sent,if smc in SMU_FW_INIT state, just proceed further to > send the message. If register holds an unexpected value after smu message > was sent set, smc_state to SMU_FW_HANG state and no further smu messages > will be sent. > > v2: > Set SMU_FW_INIT state at the start of smu hw_init/resume. > Check smc_fw_state before sending smu message if in hang state skip > sending message. > Set SMU_FW_HANG only in case unexpected value is detected > > Signed-off-by: Danijel Slivka <danijel.slivka@xxxxxxx> Patch looks good to me Reviewed-by: Lijo Lazar <lijo.lazar@xxxxxxx> Copying Kenneth/Kevin as well. Thanks, Lijo > --- > drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 2 ++ > drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 7 ++++ > drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 34 ++++++++++++++++--- > 3 files changed, 38 insertions(+), 5 deletions(-) > > diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > index d79bdb1e8cdf..fb8643d25d1b 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > @@ -1755,6 +1755,8 @@ static int smu_start_smc_engine(struct smu_context *smu) > struct amdgpu_device *adev = smu->adev; > int ret = 0; > > + smu->smc_fw_state = SMU_FW_INIT; > + > if (adev->firmware.load_type != AMDGPU_FW_LOAD_PSP) { > if (amdgpu_ip_version(adev, MP1_HWIP, 0) < IP_VERSION(11, 0, 0)) { > if (smu->ppt_funcs->load_microcode) { > diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h > index a34c802f52be..b44a185d07e8 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h > +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h > @@ -495,6 +495,12 @@ struct stb_context { > spinlock_t lock; > }; > > +enum smu_fw_status { > + SMU_FW_INIT = 0, > + SMU_FW_RUNTIME, > + SMU_FW_HANG, > +}; > + > #define WORKLOAD_POLICY_MAX 7 > > /* > @@ -562,6 +568,7 @@ struct smu_context { > uint32_t smc_fw_if_version; > uint32_t smc_fw_version; > uint32_t smc_fw_caps; > + uint8_t smc_fw_state; > > bool uploading_custom_pp_table; > bool dc_controlled_by_gpio; > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c > index 5592fd825aa3..d7c983a1f3f5 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c > @@ -315,11 +315,20 @@ int smu_cmn_send_msg_without_waiting(struct smu_context *smu, > if (adev->no_hw_access) > return 0; > > - reg = __smu_cmn_poll_stat(smu); > - res = __smu_cmn_reg2errno(smu, reg); > - if (reg == SMU_RESP_NONE || > - res == -EREMOTEIO) > + if (smu->smc_fw_state == SMU_FW_HANG) { > + dev_err(adev->dev, "SMU is in hanged state, failed to send smu message!\n"); > goto Out; > + } > + > + if (smu->smc_fw_state == SMU_FW_INIT) { > + smu->smc_fw_state = SMU_FW_RUNTIME; > + } else { > + reg = __smu_cmn_poll_stat(smu); > + res = __smu_cmn_reg2errno(smu, reg); > + if (reg == SMU_RESP_NONE || res == -EREMOTEIO) > + goto Out; > + } > + > __smu_cmn_send_msg(smu, msg_index, param); > res = 0; > Out: > @@ -350,6 +359,9 @@ int smu_cmn_wait_for_response(struct smu_context *smu) > reg = __smu_cmn_poll_stat(smu); > res = __smu_cmn_reg2errno(smu, reg); > > + if (res == -EREMOTEIO) > + smu->smc_fw_state = SMU_FW_HANG; > + > if (unlikely(smu->adev->pm.smu_debug_mask & SMU_DEBUG_HALT_ON_ERROR) && > res && (res != -ETIME)) { > amdgpu_device_halt(smu->adev); > @@ -418,6 +430,15 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu, > goto Out; > } > > + if (smu->smc_fw_state == SMU_FW_HANG) { > + dev_err(adev->dev, "SMU is in hanged state, failed to send smu message!\n"); > + goto Out; > + } else if (smu->smc_fw_state == SMU_FW_INIT) { > + /* Ignore initial smu response register value */ > + poll = false; > + smu->smc_fw_state = SMU_FW_RUNTIME; > + } > + > if (poll) { > reg = __smu_cmn_poll_stat(smu); > res = __smu_cmn_reg2errno(smu, reg); > @@ -429,8 +450,11 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu, > __smu_cmn_send_msg(smu, (uint16_t) index, param); > reg = __smu_cmn_poll_stat(smu); > res = __smu_cmn_reg2errno(smu, reg); > - if (res != 0) > + if (res != 0) { > + if (res == -EREMOTEIO) > + smu->smc_fw_state = SMU_FW_HANG; > __smu_cmn_reg_print_error(smu, reg, index, param, msg); > + } > if (read_arg) { > smu_cmn_read_arg(smu, read_arg); > dev_dbg(adev->dev, "smu send message: %s(%d) param: 0x%08x, resp: 0x%08x,\