Am 24.01.22 um 17:45 schrieb Sharma, Shashank:
Hello Christian, Thank for your comments, please fine mine inline: On 1/24/2022 8:15 AM, Christian König wrote:Am 21.01.22 um 21:34 schrieb Sharma, Shashank:From 1c5c552eeddaffd9fb3e7d45ece1b2b28fccc575 Mon Sep 17 00:00:00 2001 From: Somalapuram Amaranath <Amaranath.Somalapuram@xxxxxxx> Date: Fri, 21 Jan 2022 14:19:10 +0530Subject: [PATCH 3/4] drm/amdgpu: add reset register trace dump function forgfx_v10_0 Implementation of register trace dump function on the AMD GPU resets Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h | 8 ++++ drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 53 ++++++++++++++++++++++- drivers/gpu/drm/amd/include/amd_shared.h | 1 + 3 files changed, 60 insertions(+), 2 deletions(-)diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.hindex d855cb53c7e0..c97b53b54333 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h @@ -537,6 +537,14 @@ TRACE_EVENT(amdgpu_ib_pipe_sync, __entry->seqno) ); +TRACE_EVENT(gfx_v10_0_reset_reg_dumps, + TP_PROTO(char *reg_dumps), + TP_ARGS(reg_dumps), + TP_STRUCT__entry(__string(dumps, reg_dumps)), + TP_fast_assign(__assign_str(dumps, reg_dumps);), + TP_printk("amdgpu register dump {%s}", __get_str(dumps)) +); + #undef AMDGPU_JOB_GET_TIMELINE_NAME #endifdiff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.cindex 16dbe593cba2..05974ed5416d 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -46,7 +46,7 @@ #include "v10_structs.h" #include "gfx_v10_0.h" #include "nbio_v2_3.h" - +#include "amdgpu_trace.h" /* * Navi10 has two graphic rings to share each graphic pipe. * 1. Primary ring @@ -188,6 +188,12 @@ #define RLCG_ERROR_REPORT_ENABLED(adev) \(amdgpu_sriov_reg_indirect_mmhub(adev) || amdgpu_sriov_reg_indirect_gc(adev))+#define N_REGS (17) +#define DUMP_REG(addr) do { \ + (dump)[i][0] = (addr); \ + (dump)[i++][1] = RREG32_SOC15_IP(GC, addr); \ + } while (0)Those macros need an AMDGPU_ prefix and maybe some better names.Agree, @Amar pls check this out.From the design POV I'm really wondering if it wouldn't be better if userspace defines the registers we want to dump in case of a crash.I am not so sure about it. This means we have to communicate with the userspace and get the list of registers it wishes to see, do I smell a new IOCTL or DRM property ? I also feel that we have better filtering tools for a trace event in userspace than in kernel.What do you think ?
Just a writeable debugfs file should probably do it. We have a list of known registers filled in during boot (just reg offsets) and userspace can read/write the file to update it.
Christian.
- ShashankRegards, Christian.+ MODULE_FIRMWARE("amdgpu/navi10_ce.bin"); MODULE_FIRMWARE("amdgpu/navi10_pfp.bin"); MODULE_FIRMWARE("amdgpu/navi10_me.bin"); @@ -7488,7 +7494,6 @@ static int gfx_v10_0_hw_init(void *handle) { int r; struct amdgpu_device *adev = (struct amdgpu_device *)handle; - if (!amdgpu_emu_mode) gfx_v10_0_init_golden_registers(adev); @@ -7602,6 +7607,49 @@ static int gfx_v10_0_hw_fini(void *handle) return 0; } +static int gfx_v10_0_reset_reg_dumps(void *handle) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)handle; + uint32_t i = 0; + uint32_t (*dump)[2], n_regs = 0; + char *reg_dumps; + + dump = kmalloc(N_REGS*2*sizeof(uint32_t), GFP_KERNEL); + reg_dumps = kmalloc(1024, GFP_KERNEL); + + if (dump == NULL || reg_dumps == NULL) + return -ENOMEM; + + DUMP_REG(mmGRBM_STATUS2); + DUMP_REG(mmGRBM_STATUS_SE0); + DUMP_REG(mmGRBM_STATUS_SE1); + DUMP_REG(mmGRBM_STATUS_SE2); + DUMP_REG(mmGRBM_STATUS_SE3); + DUMP_REG(mmSDMA0_STATUS_REG); + DUMP_REG(mmSDMA1_STATUS_REG); + DUMP_REG(mmCP_STAT); + DUMP_REG(mmCP_STALLED_STAT1); + DUMP_REG(mmCP_STALLED_STAT1); + DUMP_REG(mmCP_STALLED_STAT3); + DUMP_REG(mmCP_CPC_STATUS); + DUMP_REG(mmCP_CPC_BUSY_STAT); + DUMP_REG(mmCP_CPC_STALLED_STAT1); + DUMP_REG(mmCP_CPF_STATUS); + DUMP_REG(mmCP_CPF_BUSY_STAT); + DUMP_REG(mmCP_CPF_STALLED_STAT1); + + n_regs = i; + + for (i = 0; i < n_regs; i++)+ sprintf(reg_dumps + strlen(reg_dumps), "%08x: %08x, ", dump[i][0], dump[i][1]);+ + trace_gfx_v10_0_reset_reg_dumps(reg_dumps); + kfree(dump); + kfree(reg_dumps); + + return 0; +} + static int gfx_v10_0_suspend(void *handle) { return gfx_v10_0_hw_fini(handle);@@ -9367,6 +9415,7 @@ static const struct amd_ip_funcs gfx_v10_0_ip_funcs = {.set_clockgating_state = gfx_v10_0_set_clockgating_state, .set_powergating_state = gfx_v10_0_set_powergating_state, .get_clockgating_state = gfx_v10_0_get_clockgating_state, + .reset_reg_dumps = gfx_v10_0_reset_reg_dumps, }; static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {diff --git a/drivers/gpu/drm/amd/include/amd_shared.h b/drivers/gpu/drm/amd/include/amd_shared.hindex 257f280d3d53..76d3a55278df 100644 --- a/drivers/gpu/drm/amd/include/amd_shared.h +++ b/drivers/gpu/drm/amd/include/amd_shared.h @@ -296,6 +296,7 @@ struct amd_ip_funcs { enum amd_powergating_state state); void (*get_clockgating_state)(void *handle, u32 *flags);int (*enable_umd_pstate)(void *handle, enum amd_dpm_forced_level *level);+ int (*reset_reg_dumps)(void *handle); };