On 11/30/ , Christian KKKnig wrote: > Am 30.11.21 um 06:17 schrieb Lang Yu: > > To maintain system error state when SMU errors occurred, > > which will aid in debugging SMU firmware issues, > > add SMU debug option support. > > > > It can be enabled or disabled via amdgpu_smu_debug > > debugfs file. When enabled, it makes SMU errors fatal. > > It is disabled by default. > > > > == Command Guide == > > > > 1, enable SMU debug option > > > > # echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug > > > > 2, disable SMU debug option > > > > # echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug > > > > v2: > > - Resend command when timeout.(Lijo) > > - Use debugfs file instead of module parameter. > > > > Signed-off-by: Lang Yu <lang.yu@xxxxxxx> > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 32 +++++++++++++++++ > > drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 39 +++++++++++++++++++-- > > 2 files changed, 69 insertions(+), 2 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > > index 164d6a9e9fbb..f9412de86599 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > > @@ -39,6 +39,8 @@ > > #if defined(CONFIG_DEBUG_FS) > > +extern int amdgpu_smu_debug; > > + > > /** > > * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes > > * > > @@ -1152,6 +1154,8 @@ static ssize_t amdgpu_debugfs_gfxoff_read(struct file *f, char __user *buf, > > return result; > > } > > + > > + > > Unrelated change. Will remove it. > > static const struct file_operations amdgpu_debugfs_regs2_fops = { > > .owner = THIS_MODULE, > > .unlocked_ioctl = amdgpu_debugfs_regs2_ioctl, > > @@ -1609,6 +1613,26 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL, > > DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL, > > amdgpu_debugfs_sclk_set, "%llu\n"); > > +static int amdgpu_debugfs_smu_debug_get(void *data, u64 *val) > > +{ > > + *val = amdgpu_smu_debug; > > + return 0; > > +} > > + > > +static int amdgpu_debugfs_smu_debug_set(void *data, u64 val) > > +{ > > + if (val != 0 && val != 1) > > + return -EINVAL; > > + > > + amdgpu_smu_debug = val; > > + return 0; > > +} > > + > > +DEFINE_DEBUGFS_ATTRIBUTE(fops_smu_debug, > > + amdgpu_debugfs_smu_debug_get, > > + amdgpu_debugfs_smu_debug_set, > > + "%llu\n"); > > + > > That can be done much simpler. Take a look at the debugfs_create_bool() > function. Thanks for your advice. Will modify that. > > int amdgpu_debugfs_init(struct amdgpu_device *adev) > > { > > struct dentry *root = adev_to_drm(adev)->primary->debugfs_root; > > @@ -1632,6 +1656,14 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev) > > return PTR_ERR(ent); > > } > > + ent = debugfs_create_file("amdgpu_smu_debug", 0600, root, adev, > > + &fops_smu_debug); > > + if (IS_ERR(ent)) { > > + DRM_ERROR("unable to create amdgpu_smu_debug debugsfs file\n"); > > + return PTR_ERR(ent); > > + } > > + > > + > > /* Register debugfs entries for amdgpu_ttm */ > > amdgpu_ttm_debugfs_init(adev); > > amdgpu_debugfs_pm_init(adev); > > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c > > index 048ca1673863..b3969d7933d3 100644 > > --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c > > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c > > @@ -55,6 +55,14 @@ > > #undef __SMU_DUMMY_MAP > > #define __SMU_DUMMY_MAP(type) #type > > + > > +/* > > + * Used to enable SMU debug option. When enabled, it makes SMU errors fatal. > > + * This will aid in debugging SMU firmware issues. > > + * (0 = disabled (default), 1 = enabled) > > + */ > > +int amdgpu_smu_debug; > > Probably better to put that into amdgpu_device or similar structure. Ok. Thanks for your advice. Regards, Lang > Regards, > Christian. > > > + > > static const char * const __smu_message_names[] = { > > SMU_MESSAGE_TYPES > > }; > > @@ -272,6 +280,11 @@ int smu_cmn_send_msg_without_waiting(struct smu_context *smu, > > __smu_cmn_send_msg(smu, msg_index, param); > > res = 0; > > Out: > > + if (unlikely(amdgpu_smu_debug == 1) && res) { > > + mutex_unlock(&smu->message_lock); > > + BUG(); > > + } > > + > > return res; > > } > > @@ -288,9 +301,17 @@ int smu_cmn_send_msg_without_waiting(struct smu_context *smu, > > int smu_cmn_wait_for_response(struct smu_context *smu) > > { > > u32 reg; > > + int res; > > reg = __smu_cmn_poll_stat(smu); > > - return __smu_cmn_reg2errno(smu, reg); > > + res = __smu_cmn_reg2errno(smu, reg); > > + > > + if (unlikely(amdgpu_smu_debug == 1) && res) { > > + mutex_unlock(&smu->message_lock); > > + BUG(); > > + } > > + > > + return res; > > } > > /** > > @@ -328,6 +349,7 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu, > > uint32_t param, > > uint32_t *read_arg) > > { > > + int retry_count = 0; > > int res, index; > > u32 reg; > > @@ -349,15 +371,28 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu, > > __smu_cmn_reg_print_error(smu, reg, index, param, msg); > > goto Out; > > } > > +retry: > > __smu_cmn_send_msg(smu, (uint16_t) index, param); > > reg = __smu_cmn_poll_stat(smu); > > res = __smu_cmn_reg2errno(smu, reg); > > - if (res != 0) > > + if (res != 0) { > > __smu_cmn_reg_print_error(smu, reg, index, param, msg); > > + if ((res == -ETIME) && (retry_count++ < 1)) { > > + usleep_range(500, 1000); > > + dev_err(smu->adev->dev, > > + "SMU: resend command: index:%d param:0x%08X message:%s", > > + index, param, smu_get_message_name(smu, msg)); > > + goto retry; > > + } > > + goto Out; > > + } > > if (read_arg) > > smu_cmn_read_arg(smu, read_arg); > > Out: > > mutex_unlock(&smu->message_lock); > > + > > + BUG_ON(unlikely(amdgpu_smu_debug == 1) && res); > > + > > return res; > > } >