When a RAS fatal error is detected, PMFW will only process priority messages. Other messages won't be taken up for processing and therefore won't get any response in such a state. Add logic to filter out non-priority messages when RAS error is detected. Also, don't poll response status register before sending priority messages. Use firmware capability flag to determine whether to filter priority messages. Signed-off-by: Lijo Lazar <lijo.lazar@xxxxxxx> --- drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 65 +++++++++++++++++++++++--- 1 file changed, 59 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c index 3227e514e8ae..6d1c3af927ca 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c @@ -235,6 +235,50 @@ static void __smu_cmn_send_msg(struct smu_context *smu, WREG32(smu->msg_reg, msg); } +static inline uint32_t __smu_cmn_get_msg_flags(struct smu_context *smu, + enum smu_message_type msg) +{ + return smu->message_map[msg].flags; +} + +static int __smu_cmn_ras_filter_msg(struct smu_context *smu, + enum smu_message_type msg, bool *poll) +{ + struct amdgpu_device *adev = smu->adev; + uint32_t flags, resp; + bool fed_status; + + flags = __smu_cmn_get_msg_flags(smu, msg); + *poll = true; + + /* When there is RAS fatal error, FW won't process non-RAS priority + * messages. Don't allow any messages other than RAS priority messages. + */ + fed_status = amdgpu_ras_get_fed_status(adev); + if (fed_status) { + if (!(flags & SMU_MSG_RAS_PRI)) { + dev_dbg(adev->dev, + "RAS error detected, skip sending %s", + smu_get_message_name(smu, msg)); + return -EACCES; + } + + /* FW will ignore non-priority messages when a RAS fatal error + * is detected. Hence it is possible that a previous message + * wouldn't have got response. Allow to continue without polling + * for response status for priority messages. 
+ */ + resp = RREG32(smu->resp_reg); + dev_dbg(adev->dev, + "Sending RAS priority message %s response status: %x", + smu_get_message_name(smu, msg), resp); + if (resp == 0) + *poll = false; + } + + return 0; +} + static int __smu_cmn_send_debug_msg(struct smu_context *smu, u32 msg, u32 param) @@ -354,6 +398,7 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu, { struct amdgpu_device *adev = smu->adev; int res, index; + bool poll = true; u32 reg; if (adev->no_hw_access) @@ -366,12 +411,20 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu, return index == -EACCES ? 0 : index; mutex_lock(&smu->message_lock); - reg = __smu_cmn_poll_stat(smu); - res = __smu_cmn_reg2errno(smu, reg); - if (reg == SMU_RESP_NONE || - res == -EREMOTEIO) { - __smu_cmn_reg_print_error(smu, reg, index, param, msg); - goto Out; + + if (smu->smc_fw_caps & SMU_FW_CAP_RAS_PRI) { + res = __smu_cmn_ras_filter_msg(smu, msg, &poll); + if (res) + goto Out; + } + + if (poll) { + reg = __smu_cmn_poll_stat(smu); + res = __smu_cmn_reg2errno(smu, reg); + if (reg == SMU_RESP_NONE || res == -EREMOTEIO) { + __smu_cmn_reg_print_error(smu, reg, index, param, msg); + goto Out; + } } __smu_cmn_send_msg(smu, (uint16_t) index, param); reg = __smu_cmn_poll_stat(smu); -- 2.25.1