Re: [PATCH] drm/amdgpu: add SMU debug option support

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 





On 12/1/2021 11:57 AM, Yu, Lang wrote:
[AMD Official Use Only]

Hi Lijo,

Thanks for your comments.
From my understanding, that just increases the timeout threshold and
could hide some potential issues which should be exposed and solved.

If current timeout threshold is not enough for some corner cases,
(1) Do we consider to increase the threshold to cover these cases?
(2) Or do we just expose them and request SMU FW to optimize them?

I think it doesn't make much sense to increase the threshold in debug mode.
How do you think? Thanks!

In normal cases, 2secs would be more than enough. If we hang immediately, then check the FW registers later, the response would have come. I thought we just need to note those cases and not to fail everytime. Just to mark as a red flag in the log to tell us that FW is unexpectedly busy processing something else when the message is sent.

There are some issues related to S0ix where we see the FW comes back with a response with an increased timeout under certain conditions.

Thanks,
Lijo


Regards,
Lang

-----Original Message-----
From: Lazar, Lijo <Lijo.Lazar@xxxxxxx>
Sent: Wednesday, December 1, 2021 1:44 PM
To: Lazar, Lijo <Lijo.Lazar@xxxxxxx>; Yu, Lang <Lang.Yu@xxxxxxx>; amd-
gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Huang, Ray
<Ray.Huang@xxxxxxx>; Koenig, Christian <Christian.Koenig@xxxxxxx>
Subject: RE: [PATCH] drm/amdgpu: add SMU debug option support

Just realized that the patch I pasted won't work. Outer loop exit needs to be like
this.
	(reg & MP1_C2PMSG_90__CONTENT_MASK) != 0 && extended_wait-- >=
0

Anyway, that patch is only there to communicate what I really meant in the
earlier comment.

Thanks,
Lijo

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Lazar,
Lijo
Sent: Wednesday, December 1, 2021 10:44 AM
To: Yu, Lang <Lang.Yu@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Huang, Ray
<Ray.Huang@xxxxxxx>; Koenig, Christian <Christian.Koenig@xxxxxxx>
Subject: Re: [PATCH] drm/amdgpu: add SMU debug option support



On 11/30/2021 10:47 AM, Lang Yu wrote:
To maintain system error state when SMU errors occurred, which will
aid in debugging SMU firmware issues, add SMU debug option support.

It can be enabled or disabled via amdgpu_smu_debug debugfs file. When
enabled, it makes SMU errors fatal.
It is disabled by default.

== Command Guide ==

1, enable SMU debug option

   # echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug

2, disable SMU debug option

   # echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug

v2:
   - Resend command when timeout.(Lijo)
   - Use debugfs file instead of module parameter.

Signed-off-by: Lang Yu <lang.yu@xxxxxxx>
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 32 +++++++++++++++++
   drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c      | 39 +++++++++++++++++++-
-
   2 files changed, 69 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 164d6a9e9fbb..f9412de86599 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -39,6 +39,8 @@

   #if defined(CONFIG_DEBUG_FS)

+extern int amdgpu_smu_debug;
+
   /**
    * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
    *
@@ -1152,6 +1154,8 @@ static ssize_t amdgpu_debugfs_gfxoff_read(struct
file *f, char __user *buf,
   	return result;
   }

+
+
   static const struct file_operations amdgpu_debugfs_regs2_fops = {
   	.owner = THIS_MODULE,
   	.unlocked_ioctl = amdgpu_debugfs_regs2_ioctl, @@ -1609,6 +1613,26
@@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
   			amdgpu_debugfs_sclk_set, "%llu\n");

+static int amdgpu_debugfs_smu_debug_get(void *data, u64 *val) {
+	*val = amdgpu_smu_debug;
+	return 0;
+}
+
+static int amdgpu_debugfs_smu_debug_set(void *data, u64 val) {
+	if (val != 0 && val != 1)
+		return -EINVAL;
+
+	amdgpu_smu_debug = val;
+	return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fops_smu_debug,
+			 amdgpu_debugfs_smu_debug_get,
+			 amdgpu_debugfs_smu_debug_set,
+			 "%llu\n");
+
   int amdgpu_debugfs_init(struct amdgpu_device *adev)
   {
   	struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
@@ -1632,6 +1656,14 @@ int amdgpu_debugfs_init(struct amdgpu_device
*adev)
   		return PTR_ERR(ent);
   	}

+	ent = debugfs_create_file("amdgpu_smu_debug", 0600, root, adev,
+				  &fops_smu_debug);
+	if (IS_ERR(ent)) {
+		DRM_ERROR("unable to create amdgpu_smu_debug debugsfs
file\n");
+		return PTR_ERR(ent);
+	}
+
+
   	/* Register debugfs entries for amdgpu_ttm */
   	amdgpu_ttm_debugfs_init(adev);
   	amdgpu_debugfs_pm_init(adev);
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index 048ca1673863..b3969d7933d3 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -55,6 +55,14 @@

   #undef __SMU_DUMMY_MAP
   #define __SMU_DUMMY_MAP(type)	#type
+
+/*
+ * Used to enable SMU debug option. When enabled, it makes SMU errors
fatal.
+ * This will aid in debugging SMU firmware issues.
+ * (0 = disabled (default), 1 = enabled)  */ int amdgpu_smu_debug;
+
   static const char * const __smu_message_names[] = {
   	SMU_MESSAGE_TYPES
   };
@@ -272,6 +280,11 @@ int smu_cmn_send_msg_without_waiting(struct
smu_context *smu,
   	__smu_cmn_send_msg(smu, msg_index, param);
   	res = 0;
   Out:
+	if (unlikely(amdgpu_smu_debug == 1) && res) {
+		mutex_unlock(&smu->message_lock);
+		BUG();
+	}
+
   	return res;
   }

@@ -288,9 +301,17 @@ int smu_cmn_send_msg_without_waiting(struct
smu_context *smu,
   int smu_cmn_wait_for_response(struct smu_context *smu)
   {
   	u32 reg;
+	int res;

   	reg = __smu_cmn_poll_stat(smu);
-	return __smu_cmn_reg2errno(smu, reg);
+	res = __smu_cmn_reg2errno(smu, reg);
+
+	if (unlikely(amdgpu_smu_debug == 1) && res) {
+		mutex_unlock(&smu->message_lock);
+		BUG();
+	}
+
+	return res;
   }

   /**
@@ -328,6 +349,7 @@ int smu_cmn_send_smc_msg_with_param(struct
smu_context *smu,
   				    uint32_t param,
   				    uint32_t *read_arg)
   {
+	int retry_count = 0;
   	int res, index;
   	u32 reg;

@@ -349,15 +371,28 @@ int smu_cmn_send_smc_msg_with_param(struct
smu_context *smu,
   		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
   		goto Out;
   	}
+retry:
   	__smu_cmn_send_msg(smu, (uint16_t) index, param);
   	reg = __smu_cmn_poll_stat(smu);
   	res = __smu_cmn_reg2errno(smu, reg);
-	if (res != 0)
+	if (res != 0) {
   		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
+		if ((res == -ETIME) && (retry_count++ < 1)) {
+			usleep_range(500, 1000);
+			dev_err(smu->adev->dev,
+				"SMU: resend command: index:%d
param:0x%08X message:%s",
+				index, param, smu_get_message_name(smu,
msg));
+			goto retry;
+		}
+		goto Out;
+	}

Sorry, what I meant is to have an extended wait time in debug mode.
Something like below, not a 'full retry' as in sending the message again.


+#define MAX_DBG_WAIT_CNT 3
+
  /**
   * __smu_cmn_poll_stat -- poll for a status from the SMU
   * smu: a pointer to SMU context
@@ -115,17 +117,24 @@ static void smu_cmn_read_arg(struct smu_context
*smu,
  static u32 __smu_cmn_poll_stat(struct smu_context *smu)
  {
         struct amdgpu_device *adev = smu->adev;
-       int timeout = adev->usec_timeout * 20;
+       int timeout;
         u32 reg;
+       int extended_wait = smu_debug_mode ? MAX_DBG_WAIT_CNT : 0;

-       for ( ; timeout > 0; timeout--) {
-               reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
-               if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
-                       break;
+       do {
+               timeout = adev->usec_timeout * 20;
+               for (; timeout > 0; timeout--) {
+                       reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
+                       if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
+                               break;

-               udelay(1);
-       }
+                       udelay(1);
+               }
+       } while (extended_wait-- >= 0);

+       if (extended_wait != MAX_DBG_WAIT_CNT && reg != SMU_RESP_NONE)
+               dev_err(adev->dev,
+                       "SMU: Unexpected extended wait for response");
         return reg;
  }

Thanks,
Lijo

   	if (read_arg)
   		smu_cmn_read_arg(smu, read_arg);
   Out:
   	mutex_unlock(&smu->message_lock);
+
+	BUG_ON(unlikely(amdgpu_smu_debug == 1) && res);
+
   	return res;
   }





[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux