Just realized that the patch I pasted won't work. Outer loop exit needs to be like this. (reg & MP1_C2PMSG_90__CONTENT_MASK) != 0 && extended_wait-- >= 0 Anyway, that patch is only there to communicate what I really meant in the earlier comment. Thanks, Lijo -----Original Message----- From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Lazar, Lijo Sent: Wednesday, December 1, 2021 10:44 AM To: Yu, Lang <Lang.Yu@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Huang, Ray <Ray.Huang@xxxxxxx>; Koenig, Christian <Christian.Koenig@xxxxxxx> Subject: Re: [PATCH] drm/amdgpu: add SMU debug option support On 11/30/2021 10:47 AM, Lang Yu wrote: > To maintain system error state when SMU errors occurred, which will > aid in debugging SMU firmware issues, add SMU debug option support. > > It can be enabled or disabled via amdgpu_smu_debug debugfs file. When > enabled, it makes SMU errors fatal. > It is disabled by default. > > == Command Guide == > > 1, enable SMU debug option > > # echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug > > 2, disable SMU debug option > > # echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug > > v2: > - Resend command when timeout.(Lijo) > - Use debugfs file instead of module parameter. > > Signed-off-by: Lang Yu <lang.yu@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 32 +++++++++++++++++ > drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 39 +++++++++++++++++++-- > 2 files changed, 69 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > index 164d6a9e9fbb..f9412de86599 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > @@ -39,6 +39,8 @@ > > #if defined(CONFIG_DEBUG_FS) > > +extern int amdgpu_smu_debug; > + > /** > * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes > * > @@ -1152,6 +1154,8 @@ static ssize_t amdgpu_debugfs_gfxoff_read(struct file *f, char __user *buf, > return result; > } > > + > + > static const struct file_operations amdgpu_debugfs_regs2_fops = { > .owner = THIS_MODULE, > .unlocked_ioctl = amdgpu_debugfs_regs2_ioctl, @@ -1609,6 +1613,26 > @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL, > DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL, > amdgpu_debugfs_sclk_set, "%llu\n"); > > +static int amdgpu_debugfs_smu_debug_get(void *data, u64 *val) { > + *val = amdgpu_smu_debug; > + return 0; > +} > + > +static int amdgpu_debugfs_smu_debug_set(void *data, u64 val) { > + if (val != 0 && val != 1) > + return -EINVAL; > + > + amdgpu_smu_debug = val; > + return 0; > +} > + > +DEFINE_DEBUGFS_ATTRIBUTE(fops_smu_debug, > + amdgpu_debugfs_smu_debug_get, > + amdgpu_debugfs_smu_debug_set, > + "%llu\n"); > + > int amdgpu_debugfs_init(struct amdgpu_device *adev) > { > struct dentry *root = adev_to_drm(adev)->primary->debugfs_root; > @@ -1632,6 +1656,14 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev) > return PTR_ERR(ent); > } > > + ent = debugfs_create_file("amdgpu_smu_debug", 0600, root, adev, > + &fops_smu_debug); > + if (IS_ERR(ent)) { > + DRM_ERROR("unable to create amdgpu_smu_debug debugsfs file\n"); > + return PTR_ERR(ent); > + } > + > + > /* Register debugfs entries for amdgpu_ttm */ > amdgpu_ttm_debugfs_init(adev); > amdgpu_debugfs_pm_init(adev); > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c > b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c > index 048ca1673863..b3969d7933d3 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c > @@ -55,6 +55,14 @@ > > #undef __SMU_DUMMY_MAP > #define __SMU_DUMMY_MAP(type) #type > + > +/* > + * Used to enable SMU debug option. When enabled, it makes SMU errors fatal. > + * This will aid in debugging SMU firmware issues. > + * (0 = disabled (default), 1 = enabled) */ int amdgpu_smu_debug; > + > static const char * const __smu_message_names[] = { > SMU_MESSAGE_TYPES > }; > @@ -272,6 +280,11 @@ int smu_cmn_send_msg_without_waiting(struct smu_context *smu, > __smu_cmn_send_msg(smu, msg_index, param); > res = 0; > Out: > + if (unlikely(amdgpu_smu_debug == 1) && res) { > + mutex_unlock(&smu->message_lock); > + BUG(); > + } > + > return res; > } > > @@ -288,9 +301,17 @@ int smu_cmn_send_msg_without_waiting(struct smu_context *smu, > int smu_cmn_wait_for_response(struct smu_context *smu) > { > u32 reg; > + int res; > > reg = __smu_cmn_poll_stat(smu); > - return __smu_cmn_reg2errno(smu, reg); > + res = __smu_cmn_reg2errno(smu, reg); > + > + if (unlikely(amdgpu_smu_debug == 1) && res) { > + mutex_unlock(&smu->message_lock); > + BUG(); > + } > + > + return res; > } > > /** > @@ -328,6 +349,7 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu, > uint32_t param, > uint32_t *read_arg) > { > + int retry_count = 0; > int res, index; > u32 reg; > > @@ -349,15 +371,28 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu, > __smu_cmn_reg_print_error(smu, reg, index, param, msg); > goto Out; > } > +retry: > __smu_cmn_send_msg(smu, (uint16_t) index, param); > reg = __smu_cmn_poll_stat(smu); > res = __smu_cmn_reg2errno(smu, reg); > - if (res != 0) > + if (res != 0) { > __smu_cmn_reg_print_error(smu, reg, index, param, msg); > + if ((res == -ETIME) && (retry_count++ < 1)) { > + usleep_range(500, 1000); > + dev_err(smu->adev->dev, > + "SMU: resend command: index:%d param:0x%08X message:%s", > + index, param, smu_get_message_name(smu, msg)); > + goto retry; > + } > + goto Out; > + } Sorry, what I meant is to have an extended wait time in debug mode. Something like below, not a 'full retry' as in sending the message again. +#define MAX_DBG_WAIT_CNT 3 + /** * __smu_cmn_poll_stat -- poll for a status from the SMU * smu: a pointer to SMU context @@ -115,17 +117,24 @@ static void smu_cmn_read_arg(struct smu_context *smu, static u32 __smu_cmn_poll_stat(struct smu_context *smu) { struct amdgpu_device *adev = smu->adev; - int timeout = adev->usec_timeout * 20; + int timeout; u32 reg; + int extended_wait = smu_debug_mode ? MAX_DBG_WAIT_CNT : 0; - for ( ; timeout > 0; timeout--) { - reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90); - if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0) - break; + do { + timeout = adev->usec_timeout * 20; + for (; timeout > 0; timeout--) { + reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90); + if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0) + break; - udelay(1); - } + udelay(1); + } + } while (extended_wait-- >= 0); + if (extended_wait != MAX_DBG_WAIT_CNT && reg != SMU_RESP_NONE) + dev_err(adev->dev, + "SMU: Unexpected extended wait for response"); return reg; } Thanks, Lijo > if (read_arg) > smu_cmn_read_arg(smu, read_arg); > Out: > mutex_unlock(&smu->message_lock); > + > + BUG_ON(unlikely(amdgpu_smu_debug == 1) && res); > + > return res; > } > >