[AMD Official Use Only] > -----Original Message----- > From: Tuikov, Luben <Luben.Tuikov@xxxxxxx> > Sent: Monday, July 12, 2021 11:31 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Tuikov, Luben <Luben.Tuikov@xxxxxxx>; Deucher, Alexander > <Alexander.Deucher@xxxxxxx>; Quan, Evan <Evan.Quan@xxxxxxx> > Subject: [PATCH] drm/amd/pm: Fix a bug communicating with the SMU > > This fixes a bug which if we probe a non-existing > I2C device, and the SMU returns 0xFF, [Quan, Evan] Do we have to probe I2C device via issuing commands to SMU and check existence via SMU response? Is there other more friendly way? >from then on > we can never communicate with the SMU, because the > code before this patch reads and interprets 0xFF > as a terminal error, and thus we never write 0 > into register 90 to clear the status (and > subsequently send a new command to the SMU.) > > It is not an error that the SMU returns status > 0xFF. This means that the SMU executed the last > command successfully (execution status), but the > command result is an error of some sort (execution > result), depending on what the command was. > > When doing a status check of the SMU, before we > send a new command, the only status which > precludes us from sending a new command is 0--the > SMU hasn't finished executing a previous command, > and 0xFC--the SMU is busy. > > This bug was seen as the following line in the > kernel log, > > amdgpu: Msg issuing pre-check failed(0xff) and SMU may be not in the right > state! [Quan, Evan] This was designed to prevent failure scenario from ruin. Via this, we can prevent those SMU command/response related registers overwritten. Then PMFW team can known which command invoked the first error. BR Evan > > when subsequent SMU commands, not necessarily > related to I2C, were sent to the SMU. > > This patch fixes this bug. > > Cc: Alex Deucher <Alexander.Deucher@xxxxxxx> > Cc: Evan Quan <evan.quan@xxxxxxx> > Fixes: fcb1fe9c9e0031 ("drm/amd/powerplay: pre-check the SMU state > before issuing message") > Signed-off-by: Luben Tuikov <luben.tuikov@xxxxxxx> > --- > drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 196 > +++++++++++++++++++------ > drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h | 3 +- > 2 files changed, 152 insertions(+), 47 deletions(-) > > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c > b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c > index c902fdf322c1be..775eb50a2e49a6 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c > @@ -55,7 +55,7 @@ > > #undef __SMU_DUMMY_MAP > #define __SMU_DUMMY_MAP(type) #type > -static const char* __smu_message_names[] = { > +static const char * const __smu_message_names[] = { > SMU_MESSAGE_TYPES > }; > > @@ -76,46 +76,161 @@ static void smu_cmn_read_arg(struct smu_context > *smu, > *arg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82); > } > > -int smu_cmn_wait_for_response(struct smu_context *smu) > +/** > + * __smu_cmn_poll_stat -- poll for a status from the SMU > + * smu: a pointer to SMU context > + * > + * Returns the status of the SMU, which could be, > + * 0, the SMU is busy with your previous command; > + * 1, execution status: success, execution result: success; > + * 0xFF, execution status: success, execution result: failure; > + * 0xFE, unknown command; > + * 0xFD, valid command, but bad (command) prerequisites; > + * 0xFC, the command was rejected as the SMU is busy; > + * 0xFB, "SMC_Result_DebugDataDumpEnd". > + */ > +static u32 __smu_cmn_poll_stat(struct smu_context *smu) > { > struct amdgpu_device *adev = smu->adev; > - uint32_t cur_value, i, timeout = adev->usec_timeout * 20; > + int timeout = adev->usec_timeout * 20; > + u32 reg; > > - for (i = 0; i < timeout; i++) { > - cur_value = RREG32_SOC15(MP1, 0, > mmMP1_SMN_C2PMSG_90); > - if ((cur_value & MP1_C2PMSG_90__CONTENT_MASK) != 0) > - return cur_value; > + for ( ; timeout > 0; timeout--) { > + reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90); > + if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0) > + break; > > udelay(1); > } > > - /* timeout means wrong logic */ > - if (i == timeout) > - return -ETIME; > - > - return RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90); > + return reg; > } > > -int smu_cmn_send_msg_without_waiting(struct smu_context *smu, > - uint16_t msg, uint32_t param) > +static void __smu_cmn_reg_print_error(struct smu_context *smu, > + u32 reg_c2pmsg_90, > + int msg_index, > + u32 param, > + enum smu_message_type msg) > { > struct amdgpu_device *adev = smu->adev; > - int ret; > + const char *message = smu_get_message_name(smu, msg); > > - ret = smu_cmn_wait_for_response(smu); > - if (ret != 0x1) { > - dev_err(adev->dev, "Msg issuing pre-check failed(0x%x) and > " > - "SMU may be not in the right state!\n", ret); > - if (ret != -ETIME) > - ret = -EIO; > - return ret; > + switch (reg_c2pmsg_90) { > + case 0: > + dev_err_ratelimited(adev->dev, > + "SMU: I'm not done with your previous > command!"); > + break; > + case 1: > + /* The SMU executed the command. It completed with a > + * successful result. > + */ > + break; > + case 0xFF: > + /* The SMU executed the command. It completed with a > + * unsuccessful result. > + */ > + break; > + case 0xFE: > + dev_err_ratelimited(adev->dev, > + "SMU: unknown command: index:%d > param:0x%08X message:%s", > + msg_index, param, message); > + break; > + case 0xFD: > + dev_err_ratelimited(adev->dev, > + "SMU: valid command, bad prerequisites: > index:%d param:0x%08X message:%s", > + msg_index, param, message); > + break; > + case 0xFC: > + dev_err_ratelimited(adev->dev, > + "SMU: I'm very busy for your command: > index:%d param:0x%08X message:%s", > + msg_index, param, message); > + break; > + case 0xFB: > + dev_err_ratelimited(adev->dev, > + "SMU: I'm debugging!"); > + break; > + default: > + dev_err_ratelimited(adev->dev, > + "SMU: response:0x%08X for index:%d > param:0x%08X message:%s?", > + reg_c2pmsg_90, msg_index, param, > message); > + break; > + } > +} > + > +static int __smu_cmn_reg2errno(struct smu_context *smu, u32 > reg_c2pmsg_90) > +{ > + int res; > + > + switch (reg_c2pmsg_90) { > + case 0: > + res = -ETIME; > + break; > + case 1: > + res = 0; > + break; > + case 0xFF: > + res = -EIO; > + break; > + case 0xFE: > + res = -EOPNOTSUPP; > + break; > + case 0xFD: > + res = -EIO; > + break; > + case 0xFC: > + res = -EBUSY; > + break; > + case 0xFB: > + res = -EIO; > + break; > + default: > + res = -EIO; > + break; > } > > + return res; > +} > + > +static void __smu_cmn_send_msg(struct smu_context *smu, > + u16 msg, > + u32 param) > +{ > + struct amdgpu_device *adev = smu->adev; > + > WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90, 0); > WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82, param); > WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_66, msg); > +} > > - return 0; > +int smu_cmn_send_msg_without_waiting(struct smu_context *smu, > + uint16_t msg_index, > + uint32_t param) > +{ > + u32 reg; > + int res; > + > + if (smu->adev->in_pci_err_recovery) > + return 0; > + > + mutex_lock(&smu->message_lock); > + reg = __smu_cmn_poll_stat(smu); > + if (reg == 0 || reg == 0xFC) { > + res = __smu_cmn_reg2errno(smu, reg); > + goto Out; > + } > + __smu_cmn_send_msg(smu, msg_index, param); > + res = 0; > +Out: > + mutex_unlock(&smu->message_lock); > + return res; > +} > + > +int smu_cmn_wait_for_response(struct smu_context *smu) > +{ > + u32 reg; > + > + reg = __smu_cmn_poll_stat(smu); > + return __smu_cmn_reg2errno(smu, reg); > } > > int smu_cmn_send_smc_msg_with_param(struct smu_context *smu, > @@ -123,8 +238,8 @@ int smu_cmn_send_smc_msg_with_param(struct > smu_context *smu, > uint32_t param, > uint32_t *read_arg) > { > - struct amdgpu_device *adev = smu->adev; > - int ret = 0, index = 0; > + int res, index; > + u32 reg; > > if (smu->adev->in_pci_err_recovery) > return 0; > @@ -136,31 +251,20 @@ int smu_cmn_send_smc_msg_with_param(struct > smu_context *smu, > return index == -EACCES ? 0 : index; > > mutex_lock(&smu->message_lock); > - ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index, > param); > - if (ret) > - goto out; > - > - ret = smu_cmn_wait_for_response(smu); > - if (ret != 0x1) { > - if (ret == -ETIME) { > - dev_err(adev->dev, "message: %15s (%d) \tparam: > 0x%08x is timeout (no response)\n", > - smu_get_message_name(smu, msg), index, > param); > - } else { > - dev_err(adev->dev, "failed send message: %15s (%d) > \tparam: 0x%08x response %#x\n", > - smu_get_message_name(smu, msg), index, > param, > - ret); > - ret = -EIO; > - } > - goto out; > + reg = __smu_cmn_poll_stat(smu); > + if (reg == 0 || reg == 0xFC) { > + res = __smu_cmn_reg2errno(smu, reg); > + __smu_cmn_reg_print_error(smu, reg, index, param, msg); > + goto Out; > } > - > + __smu_cmn_send_msg(smu, (uint16_t) index, param); > + reg = __smu_cmn_poll_stat(smu); > + res = __smu_cmn_reg2errno(smu, reg); > if (read_arg) > smu_cmn_read_arg(smu, read_arg); > - > - ret = 0; /* 0 as driver return value */ > -out: > +Out: > mutex_unlock(&smu->message_lock); > - return ret; > + return res; > } > > int smu_cmn_send_smc_msg(struct smu_context *smu, > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h > b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h > index 9add5f16ff562a..16993daa2ae042 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h > @@ -27,7 +27,8 @@ > > #if defined(SWSMU_CODE_LAYER_L2) || defined(SWSMU_CODE_LAYER_L3) > || defined(SWSMU_CODE_LAYER_L4) > int smu_cmn_send_msg_without_waiting(struct smu_context *smu, > - uint16_t msg, uint32_t param); > + uint16_t msg_index, > + uint32_t param); > int smu_cmn_send_smc_msg_with_param(struct smu_context *smu, > enum smu_message_type msg, > uint32_t param, > -- > 2.32.0 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx