On 3/5/2024 2:53 PM, Christian König wrote: > Am 01.03.24 um 13:43 schrieb Sunil Khatri: >> Add ring timeout related information in the amdgpu >> devcoredump file for debugging purposes. >> >> During the gpu recovery process the registered call >> is triggered and add the debug information in data >> file created by devcoredump framework under the >> directory /sys/class/devcoredump/devcdx/ >> >> Signed-off-by: Sunil Khatri <sunil.khatri@xxxxxxx> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 15 +++++++++++++++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 11 +++++++++++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 12 +++++++++++- >> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 + >> 4 files changed, 38 insertions(+), 1 deletion(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> index 9246bca0a008..9f57c7795c47 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> @@ -816,6 +816,17 @@ struct amdgpu_reset_info { >> #endif >> }; >> +/* >> + * IP and Queue information during timeout >> + */ >> +struct amdgpu_ring_timeout_info { >> + bool timeout; > > What should that be good for? > In case of page fault or gpu reset due to other reasons there is no > time out. In that case we are not adding any information and we are > using this flag while dumping information. > >> + char ring_name[32]; >> + enum amdgpu_ring_type ip_type; > > Those information should already be available in the core dump. Will update. > >> + bool soft_recovery; > > That doesn't make sense since we don't do a core dump in case of a > soft recovery. Noted, this can be removed. > >> +}; >> + >> + >> /* >> * Non-zero (true) if the GPU has VRAM. Zero (false) otherwise. >> */ >> @@ -1150,6 +1161,10 @@ struct amdgpu_device { >> bool debug_largebar; >> bool debug_disable_soft_recovery; >> bool debug_use_vram_fw_buf; >> + >> +#ifdef CONFIG_DEV_COREDUMP >> + struct amdgpu_ring_timeout_info ring_timeout_info; >> +#endif > > Please never store core dump related information in the amdgpu_device > structure. Let me see to it. Point taken Thanks Sunil > > Regards, > Christian. > >> }; >> static inline uint32_t amdgpu_ip_version(const struct >> amdgpu_device *adev, >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c >> index 71a5cf37b472..e36b7352b2de 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c >> @@ -51,8 +51,19 @@ static enum drm_gpu_sched_stat >> amdgpu_job_timedout(struct drm_sched_job *s_job) >> memset(&ti, 0, sizeof(struct amdgpu_task_info)); >> adev->job_hang = true; >> +#ifdef CONFIG_DEV_COREDUMP >> + /* Update the ring timeout info for coredump*/ >> + adev->ring_timeout_info.timeout = TRUE; >> + sprintf(adev->ring_timeout_info.ring_name, s_job->sched->name); >> + adev->ring_timeout_info.ip_type = ring->funcs->type; >> + adev->ring_timeout_info.soft_recovery = FALSE; >> +#endif >> + >> if (amdgpu_gpu_recovery && >> amdgpu_ring_soft_recovery(ring, job->vmid, >> s_job->s_fence->parent)) { >> +#ifdef CONFIG_DEV_COREDUMP >> + adev->ring_timeout_info.soft_recovery = TRUE; >> +#endif >> DRM_ERROR("ring %s timeout, but soft recovered\n", >> s_job->sched->name); >> goto exit; >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >> index 4baa300121d8..d4f892ed105f 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >> @@ -196,8 +196,16 @@ amdgpu_devcoredump_read(char *buffer, loff_t >> offset, size_t count, >> coredump->reset_task_info.process_name, >> coredump->reset_task_info.pid); >> + if (coredump->timeout_info.timeout) { >> + drm_printf(&p, "\nRing timed out details\n"); >> + drm_printf(&p, "IP Type: %d Ring Name: %s Soft Recovery: %s\n", >> + coredump->timeout_info.ip_type, >> + coredump->timeout_info.ring_name, >> + coredump->timeout_info.soft_recovery ? >> "Successful":"Failed"); >> + } >> + >> if (coredump->reset_vram_lost) >> - drm_printf(&p, "VRAM is lost due to GPU reset!\n"); >> + drm_printf(&p, "\nVRAM is lost due to GPU reset!\n"); >> if (coredump->adev->reset_info.num_regs) { >> drm_printf(&p, "AMDGPU register dumps:\nOffset: >> Value:\n"); >> @@ -228,6 +236,7 @@ void amdgpu_coredump(struct amdgpu_device >> *adev, bool vram_lost, >> return; >> } >> + coredump->timeout_info = adev->ring_timeout_info; >> coredump->reset_vram_lost = vram_lost; >> if (reset_context->job && reset_context->job->vm) >> @@ -236,6 +245,7 @@ void amdgpu_coredump(struct amdgpu_device *adev, >> bool vram_lost, >> coredump->adev = adev; >> ktime_get_ts64(&coredump->reset_time); >> + memset(&adev->ring_timeout_info, 0, >> sizeof(adev->ring_timeout_info)); >> dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT, >> amdgpu_devcoredump_read, amdgpu_devcoredump_free); >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >> index 19899f6b9b2b..37172d6e1f94 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >> @@ -97,6 +97,7 @@ struct amdgpu_coredump_info { >> struct amdgpu_task_info reset_task_info; >> struct timespec64 reset_time; >> bool reset_vram_lost; >> + struct amdgpu_ring_timeout_info timeout_info; >> }; >> #endif >