Add ring timeout related information in the amdgpu devcoredump file for debugging purposes. During the gpu recovery process the registered call is triggered and add the debug information in data file created by devcoredump framework under the directory /sys/class/devcoredump/devcdx/ Signed-off-by: Sunil Khatri <sunil.khatri@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 15 +++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 11 +++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 12 +++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 + 4 files changed, 38 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 9246bca0a008..9f57c7795c47 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -816,6 +816,17 @@ struct amdgpu_reset_info { #endif }; +/* + * IP and Queue information during timeout + */ +struct amdgpu_ring_timeout_info { + bool timeout; + char ring_name[32]; + enum amdgpu_ring_type ip_type; + bool soft_recovery; +}; + + /* * Non-zero (true) if the GPU has VRAM. Zero (false) otherwise. */ @@ -1150,6 +1161,10 @@ struct amdgpu_device { bool debug_largebar; bool debug_disable_soft_recovery; bool debug_use_vram_fw_buf; + +#ifdef CONFIG_DEV_COREDUMP + struct amdgpu_ring_timeout_info ring_timeout_info; +#endif }; static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 71a5cf37b472..e36b7352b2de 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -51,8 +51,19 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) memset(&ti, 0, sizeof(struct amdgpu_task_info)); adev->job_hang = true; +#ifdef CONFIG_DEV_COREDUMP + /* Update the ring timeout info for coredump*/ + adev->ring_timeout_info.timeout = TRUE; + sprintf(adev->ring_timeout_info.ring_name, s_job->sched->name); + adev->ring_timeout_info.ip_type = ring->funcs->type; + adev->ring_timeout_info.soft_recovery = FALSE; +#endif + if (amdgpu_gpu_recovery && amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) { +#ifdef CONFIG_DEV_COREDUMP + adev->ring_timeout_info.soft_recovery = TRUE; +#endif DRM_ERROR("ring %s timeout, but soft recovered\n", s_job->sched->name); goto exit; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c index 4baa300121d8..d4f892ed105f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c @@ -196,8 +196,16 @@ amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count, coredump->reset_task_info.process_name, coredump->reset_task_info.pid); + if (coredump->timeout_info.timeout) { + drm_printf(&p, "\nRing timed out details\n"); + drm_printf(&p, "IP Type: %d Ring Name: %s Soft Recovery: %s\n", + coredump->timeout_info.ip_type, + coredump->timeout_info.ring_name, + coredump->timeout_info.soft_recovery ? "Successful":"Failed"); + } + if (coredump->reset_vram_lost) - drm_printf(&p, "VRAM is lost due to GPU reset!\n"); + drm_printf(&p, "\nVRAM is lost due to GPU reset!\n"); if (coredump->adev->reset_info.num_regs) { drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); @@ -228,6 +236,7 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, return; } + coredump->timeout_info = adev->ring_timeout_info; coredump->reset_vram_lost = vram_lost; if (reset_context->job && reset_context->job->vm) @@ -236,6 +245,7 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, coredump->adev = adev; ktime_get_ts64(&coredump->reset_time); + memset(&adev->ring_timeout_info, 0, sizeof(adev->ring_timeout_info)); dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT, amdgpu_devcoredump_read, amdgpu_devcoredump_free); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index 19899f6b9b2b..37172d6e1f94 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -97,6 +97,7 @@ struct amdgpu_coredump_info { struct amdgpu_task_info reset_task_info; struct timespec64 reset_time; bool reset_vram_lost; + struct amdgpu_ring_timeout_info timeout_info; }; #endif -- 2.34.1