To maintain system error state when SMU errors occurred, which will aid in debugging SMU firmware issues, add SMU debug option support. It can be enabled or disabled via amdgpu_smu_debug debugfs file. When enabled, it makes SMU errors fatal. It is disabled by default. == Command Guide == 1, enable SMU debug option # echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug 2, disable SMU debug option # echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug v2: - Resend command when timeout.(Lijo) - Use debugfs file instead of module parameter. Signed-off-by: Lang Yu <lang.yu@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 32 +++++++++++++++++ drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 39 +++++++++++++++++++-- 2 files changed, 69 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 164d6a9e9fbb..f9412de86599 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -39,6 +39,8 @@ #if defined(CONFIG_DEBUG_FS) +extern int amdgpu_smu_debug; + /** * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes * @@ -1152,6 +1154,8 @@ static ssize_t amdgpu_debugfs_gfxoff_read(struct file *f, char __user *buf, return result; } + + static const struct file_operations amdgpu_debugfs_regs2_fops = { .owner = THIS_MODULE, .unlocked_ioctl = amdgpu_debugfs_regs2_ioctl, @@ -1609,6 +1613,26 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL, DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL, amdgpu_debugfs_sclk_set, "%llu\n"); +static int amdgpu_debugfs_smu_debug_get(void *data, u64 *val) +{ + *val = amdgpu_smu_debug; + return 0; +} + +static int amdgpu_debugfs_smu_debug_set(void *data, u64 val) +{ + if (val != 0 && val != 1) + return -EINVAL; + + amdgpu_smu_debug = val; + return 0; +} + +DEFINE_DEBUGFS_ATTRIBUTE(fops_smu_debug, + amdgpu_debugfs_smu_debug_get, + amdgpu_debugfs_smu_debug_set, + "%llu\n"); + int amdgpu_debugfs_init(struct amdgpu_device *adev) { struct dentry *root = adev_to_drm(adev)->primary->debugfs_root; @@ -1632,6 +1656,14 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev) return PTR_ERR(ent); } + ent = debugfs_create_file("amdgpu_smu_debug", 0600, root, adev, + &fops_smu_debug); + if (IS_ERR(ent)) { + DRM_ERROR("unable to create amdgpu_smu_debug debugsfs file\n"); + return PTR_ERR(ent); + } + + /* Register debugfs entries for amdgpu_ttm */ amdgpu_ttm_debugfs_init(adev); amdgpu_debugfs_pm_init(adev); diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c index 048ca1673863..b3969d7933d3 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c @@ -55,6 +55,14 @@ #undef __SMU_DUMMY_MAP #define __SMU_DUMMY_MAP(type) #type + +/* + * Used to enable SMU debug option. When enabled, it makes SMU errors fatal. + * This will aid in debugging SMU firmware issues. + * (0 = disabled (default), 1 = enabled) + */ +int amdgpu_smu_debug; + static const char * const __smu_message_names[] = { SMU_MESSAGE_TYPES }; @@ -272,6 +280,11 @@ int smu_cmn_send_msg_without_waiting(struct smu_context *smu, __smu_cmn_send_msg(smu, msg_index, param); res = 0; Out: + if (unlikely(amdgpu_smu_debug == 1) && res) { + mutex_unlock(&smu->message_lock); + BUG(); + } + return res; } @@ -288,9 +301,17 @@ int smu_cmn_send_msg_without_waiting(struct smu_context *smu, int smu_cmn_wait_for_response(struct smu_context *smu) { u32 reg; + int res; reg = __smu_cmn_poll_stat(smu); - return __smu_cmn_reg2errno(smu, reg); + res = __smu_cmn_reg2errno(smu, reg); + + if (unlikely(amdgpu_smu_debug == 1) && res) { + mutex_unlock(&smu->message_lock); + BUG(); + } + + return res; } /** @@ -328,6 +349,7 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu, uint32_t param, uint32_t *read_arg) { + int retry_count = 0; int res, index; u32 reg; @@ -349,15 +371,28 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu, __smu_cmn_reg_print_error(smu, reg, index, param, msg); goto Out; } +retry: __smu_cmn_send_msg(smu, (uint16_t) index, param); reg = __smu_cmn_poll_stat(smu); res = __smu_cmn_reg2errno(smu, reg); - if (res != 0) + if (res != 0) { __smu_cmn_reg_print_error(smu, reg, index, param, msg); + if ((res == -ETIME) && (retry_count++ < 1)) { + usleep_range(500, 1000); + dev_err(smu->adev->dev, + "SMU: resend command: index:%d param:0x%08X message:%s", + index, param, smu_get_message_name(smu, msg)); + goto retry; + } + goto Out; + } if (read_arg) smu_cmn_read_arg(smu, read_arg); Out: mutex_unlock(&smu->message_lock); + + BUG_ON(unlikely(amdgpu_smu_debug == 1) && res); + return res; } -- 2.25.1