-----Original Message-----
From: Lazar, Lijo <Lijo.Lazar@xxxxxxx>
Sent: Wednesday, December 1, 2021 1:44 PM
To: Lazar, Lijo <Lijo.Lazar@xxxxxxx>; Yu, Lang <Lang.Yu@xxxxxxx>; amd-
gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Huang, Ray
<Ray.Huang@xxxxxxx>; Koenig, Christian <Christian.Koenig@xxxxxxx>
Subject: RE: [PATCH] drm/amdgpu: add SMU debug option support
Just realized that the patch I pasted won't work. Outer loop exit needs to be like
this.
(reg & MP1_C2PMSG_90__CONTENT_MASK) != 0 && extended_wait-- >=
0
Anyway, that patch is only there to communicate what I really meant in the
earlier comment.
Thanks,
Lijo
-----Original Message-----
From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Lazar,
Lijo
Sent: Wednesday, December 1, 2021 10:44 AM
To: Yu, Lang <Lang.Yu@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Huang, Ray
<Ray.Huang@xxxxxxx>; Koenig, Christian <Christian.Koenig@xxxxxxx>
Subject: Re: [PATCH] drm/amdgpu: add SMU debug option support
On 11/30/2021 10:47 AM, Lang Yu wrote:
To maintain system error state when SMU errors occurred, which will
aid in debugging SMU firmware issues, add SMU debug option support.
It can be enabled or disabled via amdgpu_smu_debug debugfs file. When
enabled, it makes SMU errors fatal.
It is disabled by default.
== Command Guide ==
1, enable SMU debug option
# echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
2, disable SMU debug option
# echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
v2:
- Resend command when timeout.(Lijo)
- Use debugfs file instead of module parameter.
Signed-off-by: Lang Yu <lang.yu@xxxxxxx>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 32 +++++++++++++++++
drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 39 +++++++++++++++++++-
-
2 files changed, 69 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 164d6a9e9fbb..f9412de86599 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -39,6 +39,8 @@
#if defined(CONFIG_DEBUG_FS)
+extern int amdgpu_smu_debug;
+
/**
* amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
*
@@ -1152,6 +1154,8 @@ static ssize_t amdgpu_debugfs_gfxoff_read(struct
file *f, char __user *buf,
return result;
}
+
+
static const struct file_operations amdgpu_debugfs_regs2_fops = {
.owner = THIS_MODULE,
.unlocked_ioctl = amdgpu_debugfs_regs2_ioctl, @@ -1609,6 +1613,26
@@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
amdgpu_debugfs_sclk_set, "%llu\n");
+static int amdgpu_debugfs_smu_debug_get(void *data, u64 *val) {
+ *val = amdgpu_smu_debug;
+ return 0;
+}
+
+static int amdgpu_debugfs_smu_debug_set(void *data, u64 val) {
+ if (val != 0 && val != 1)
+ return -EINVAL;
+
+ amdgpu_smu_debug = val;
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fops_smu_debug,
+ amdgpu_debugfs_smu_debug_get,
+ amdgpu_debugfs_smu_debug_set,
+ "%llu\n");
+
int amdgpu_debugfs_init(struct amdgpu_device *adev)
{
struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
@@ -1632,6 +1656,14 @@ int amdgpu_debugfs_init(struct amdgpu_device
*adev)
return PTR_ERR(ent);
}
+ ent = debugfs_create_file("amdgpu_smu_debug", 0600, root, adev,
+ &fops_smu_debug);
+ if (IS_ERR(ent)) {
+ DRM_ERROR("unable to create amdgpu_smu_debug debugsfs
file\n");
+ return PTR_ERR(ent);
+ }
+
+
/* Register debugfs entries for amdgpu_ttm */
amdgpu_ttm_debugfs_init(adev);
amdgpu_debugfs_pm_init(adev);
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index 048ca1673863..b3969d7933d3 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -55,6 +55,14 @@
#undef __SMU_DUMMY_MAP
#define __SMU_DUMMY_MAP(type) #type
+
+/*
+ * Used to enable SMU debug option. When enabled, it makes SMU errors
fatal.
+ * This will aid in debugging SMU firmware issues.
+ * (0 = disabled (default), 1 = enabled) */ int amdgpu_smu_debug;
+
static const char * const __smu_message_names[] = {
SMU_MESSAGE_TYPES
};
@@ -272,6 +280,11 @@ int smu_cmn_send_msg_without_waiting(struct
smu_context *smu,
__smu_cmn_send_msg(smu, msg_index, param);
res = 0;
Out:
+ if (unlikely(amdgpu_smu_debug == 1) && res) {
+ mutex_unlock(&smu->message_lock);
+ BUG();
+ }
+
return res;
}
@@ -288,9 +301,17 @@ int smu_cmn_send_msg_without_waiting(struct
smu_context *smu,
int smu_cmn_wait_for_response(struct smu_context *smu)
{
u32 reg;
+ int res;
reg = __smu_cmn_poll_stat(smu);
- return __smu_cmn_reg2errno(smu, reg);
+ res = __smu_cmn_reg2errno(smu, reg);
+
+ if (unlikely(amdgpu_smu_debug == 1) && res) {
+ mutex_unlock(&smu->message_lock);
+ BUG();
+ }
+
+ return res;
}
/**
@@ -328,6 +349,7 @@ int smu_cmn_send_smc_msg_with_param(struct
smu_context *smu,
uint32_t param,
uint32_t *read_arg)
{
+ int retry_count = 0;
int res, index;
u32 reg;
@@ -349,15 +371,28 @@ int smu_cmn_send_smc_msg_with_param(struct
smu_context *smu,
__smu_cmn_reg_print_error(smu, reg, index, param, msg);
goto Out;
}
+retry:
__smu_cmn_send_msg(smu, (uint16_t) index, param);
reg = __smu_cmn_poll_stat(smu);
res = __smu_cmn_reg2errno(smu, reg);
- if (res != 0)
+ if (res != 0) {
__smu_cmn_reg_print_error(smu, reg, index, param, msg);
+ if ((res == -ETIME) && (retry_count++ < 1)) {
+ usleep_range(500, 1000);
+ dev_err(smu->adev->dev,
+ "SMU: resend command: index:%d
param:0x%08X message:%s",
+ index, param, smu_get_message_name(smu,
msg));
+ goto retry;
+ }
+ goto Out;
+ }
Sorry, what I meant is to have an extended wait time in debug mode.
Something like below, not a 'full retry' as in sending the message again.
+#define MAX_DBG_WAIT_CNT 3
+
/**
* __smu_cmn_poll_stat -- poll for a status from the SMU
* smu: a pointer to SMU context
@@ -115,17 +117,24 @@ static void smu_cmn_read_arg(struct smu_context
*smu,
static u32 __smu_cmn_poll_stat(struct smu_context *smu)
{
struct amdgpu_device *adev = smu->adev;
- int timeout = adev->usec_timeout * 20;
+ int timeout;
u32 reg;
+ int extended_wait = smu_debug_mode ? MAX_DBG_WAIT_CNT : 0;
- for ( ; timeout > 0; timeout--) {
- reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
- if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
- break;
+ do {
+ timeout = adev->usec_timeout * 20;
+ for (; timeout > 0; timeout--) {
+ reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
+ if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
+ break;
- udelay(1);
- }
+ udelay(1);
+ }
+ } while (extended_wait-- >= 0);
+ if (extended_wait != MAX_DBG_WAIT_CNT && reg != SMU_RESP_NONE)
+ dev_err(adev->dev,
+ "SMU: Unexpected extended wait for response");
return reg;
}
Thanks,
Lijo
if (read_arg)
smu_cmn_read_arg(smu, read_arg);
Out:
mutex_unlock(&smu->message_lock);
+
+ BUG_ON(unlikely(amdgpu_smu_debug == 1) && res);
+
return res;
}