Re: [PATCH] drm/amdgpu: add ring timeout information in devcoredump

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Am 01.03.24 um 13:43 schrieb Sunil Khatri:
Add ring timeout related information in the amdgpu
devcoredump file for debugging purposes.

During the gpu recovery process the registered call
is triggered and add the debug information in data
file created by devcoredump framework under the
directory /sys/class/devcoredump/devcdx/

Signed-off-by: Sunil Khatri <sunil.khatri@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h       | 15 +++++++++++++++
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c   | 11 +++++++++++
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 12 +++++++++++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
  4 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 9246bca0a008..9f57c7795c47 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -816,6 +816,17 @@ struct amdgpu_reset_info {
  #endif
  };
+/*
+ *  IP and Queue information during timeout
+ */
+struct amdgpu_ring_timeout_info {
+	bool timeout;

What should that be good for?

+	char ring_name[32];
+	enum amdgpu_ring_type ip_type;

Those information should already be available in the core dump.

+	bool soft_recovery;

That doesn't make sense since we don't do a core dump in case of a soft recovery.

+};
+
+
  /*
   * Non-zero (true) if the GPU has VRAM. Zero (false) otherwise.
   */
@@ -1150,6 +1161,10 @@ struct amdgpu_device {
  	bool                            debug_largebar;
  	bool                            debug_disable_soft_recovery;
  	bool                            debug_use_vram_fw_buf;
+
+#ifdef CONFIG_DEV_COREDUMP
+	struct amdgpu_ring_timeout_info ring_timeout_info;
+#endif

Please never store core dump related information in the amdgpu_device structure.

Regards,
Christian.

  };
static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 71a5cf37b472..e36b7352b2de 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -51,8 +51,19 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
  	memset(&ti, 0, sizeof(struct amdgpu_task_info));
  	adev->job_hang = true;
+#ifdef CONFIG_DEV_COREDUMP
+	/* Update the ring timeout info for coredump*/
+	adev->ring_timeout_info.timeout = TRUE;
+	sprintf(adev->ring_timeout_info.ring_name, s_job->sched->name);
+	adev->ring_timeout_info.ip_type = ring->funcs->type;
+	adev->ring_timeout_info.soft_recovery = FALSE;
+#endif
+
  	if (amdgpu_gpu_recovery &&
  	    amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
+#ifdef CONFIG_DEV_COREDUMP
+		adev->ring_timeout_info.soft_recovery = TRUE;
+#endif
  		DRM_ERROR("ring %s timeout, but soft recovered\n",
  			  s_job->sched->name);
  		goto exit;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index 4baa300121d8..d4f892ed105f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -196,8 +196,16 @@ amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
  			   coredump->reset_task_info.process_name,
  			   coredump->reset_task_info.pid);
+ if (coredump->timeout_info.timeout) {
+		drm_printf(&p, "\nRing timed out details\n");
+		drm_printf(&p, "IP Type: %d Ring Name: %s Soft Recovery: %s\n",
+				coredump->timeout_info.ip_type,
+				coredump->timeout_info.ring_name,
+				coredump->timeout_info.soft_recovery ? "Successful":"Failed");
+	}
+
  	if (coredump->reset_vram_lost)
-		drm_printf(&p, "VRAM is lost due to GPU reset!\n");
+		drm_printf(&p, "\nVRAM is lost due to GPU reset!\n");
  	if (coredump->adev->reset_info.num_regs) {
  		drm_printf(&p, "AMDGPU register dumps:\nOffset:     Value:\n");
@@ -228,6 +236,7 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
  		return;
  	}
+ coredump->timeout_info = adev->ring_timeout_info;
  	coredump->reset_vram_lost = vram_lost;
if (reset_context->job && reset_context->job->vm)
@@ -236,6 +245,7 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
  	coredump->adev = adev;
ktime_get_ts64(&coredump->reset_time);
+	memset(&adev->ring_timeout_info, 0, sizeof(adev->ring_timeout_info));
dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
  		      amdgpu_devcoredump_read, amdgpu_devcoredump_free);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index 19899f6b9b2b..37172d6e1f94 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -97,6 +97,7 @@ struct amdgpu_coredump_info {
  	struct amdgpu_task_info         reset_task_info;
  	struct timespec64               reset_time;
  	bool                            reset_vram_lost;
+	struct amdgpu_ring_timeout_info timeout_info;
  };
  #endif




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux