On Wed, Apr 17, 2024 at 12:24 PM Lazar, Lijo <Lijo.Lazar@xxxxxxx> wrote: > > [AMD Official Use Only - General] > > Yes, right now that API doesn't return anything. What I meant is to add that check as well as coredump API is essentially used in hang situations. > > Old times, access to registers while in GFXOFF resulted in system hang (basically it won't go beyond this point). If that happens, then the purpose of the patch - to get the context of a device hang - is lost. We may not even get a proper dmesg log. Maybe add a call to amdgpu_get_gfx_off_status(), but unfortunately, it's not implemented on every chip yet. Alex > > Thanks, > Lijo > -----Original Message----- > From: Khatri, Sunil <Sunil.Khatri@xxxxxxx> > Sent: Wednesday, April 17, 2024 9:42 PM > To: Lazar, Lijo <Lijo.Lazar@xxxxxxx>; Alex Deucher <alexdeucher@xxxxxxxxx>; Khatri, Sunil <Sunil.Khatri@xxxxxxx> > Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Koenig, Christian <Christian.Koenig@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Subject: Re: [PATCH v5 2/6] drm/amdgpu: add support of gfx10 register dump > > > On 4/17/2024 9:31 PM, Lazar, Lijo wrote: > > > > On 4/17/2024 9:21 PM, Alex Deucher wrote: > >> On Wed, Apr 17, 2024 at 5:38 AM Sunil Khatri <sunil.khatri@xxxxxxx> wrote: > >>> Adding gfx10 gc registers to be used for register dump via > >>> devcoredump during a gpu reset. > >>> > >>> Signed-off-by: Sunil Khatri <sunil.khatri@xxxxxxx> > >> Reviewed-by: Alex Deucher <alexander.deucher@xxxxxxx> > >> > >>> --- > >>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 8 ++ > >>> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 4 + > >>> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 130 +++++++++++++++++- > >>> drivers/gpu/drm/amd/amdgpu/soc15.h | 2 + > >>> .../include/asic_reg/gc/gc_10_1_0_offset.h | 12 ++ > >>> 5 files changed, 155 insertions(+), 1 deletion(-) > >>> > >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > >>> index e0d7f4ee7e16..cac0ca64367b 100644 > >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > >>> @@ -139,6 +139,14 @@ enum amdgpu_ss { > >>> AMDGPU_SS_DRV_UNLOAD > >>> }; > >>> > >>> +struct amdgpu_hwip_reg_entry { > >>> + u32 hwip; > >>> + u32 inst; > >>> + u32 seg; > >>> + u32 reg_offset; > >>> + const char *reg_name; > >>> +}; > >>> + > >>> struct amdgpu_watchdog_timer { > >>> bool timeout_fatal_disable; > >>> uint32_t period; /* maxCycles = (1 << period), the number > >>> of cycles before a timeout */ diff --git > >>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > >>> index 04a86dff71e6..64f197bbc866 100644 > >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > >>> @@ -433,6 +433,10 @@ struct amdgpu_gfx { > >>> uint32_t num_xcc_per_xcp; > >>> struct mutex partition_mutex; > >>> bool mcbp; /* mid command buffer preemption */ > >>> + > >>> + /* IP reg dump */ > >>> + uint32_t *ip_dump; > >>> + uint32_t reg_count; > >>> }; > >>> > >>> struct amdgpu_gfx_ras_reg_entry { > >>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > >>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > >>> index a0bc4196ff8b..4a54161f4837 100644 > >>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > >>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > >>> @@ -276,6 +276,99 @@ MODULE_FIRMWARE("amdgpu/gc_10_3_7_mec.bin"); > >>> MODULE_FIRMWARE("amdgpu/gc_10_3_7_mec2.bin"); > >>> MODULE_FIRMWARE("amdgpu/gc_10_3_7_rlc.bin"); > >>> > >>> +static const struct amdgpu_hwip_reg_entry gc_reg_list_10_1[] = { > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmGRBM_STATUS), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmGRBM_STATUS2), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmGRBM_STATUS3), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_STALLED_STAT1), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_STALLED_STAT2), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPC_STALLED_STAT1), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPF_STALLED_STAT1), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_BUSY_STAT), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPC_BUSY_STAT), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPF_BUSY_STAT), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPC_BUSY_STAT2), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPF_BUSY_STAT2), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPF_STATUS), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_GFX_ERROR), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_GFX_HPD_STATUS0), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB_BASE), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB_RPTR), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB_WPTR), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB0_BASE), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB0_RPTR), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB0_WPTR), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB1_BASE), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB1_RPTR), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB1_WPTR), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB2_BASE), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB2_WPTR), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB2_WPTR), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB1_CMD_BUFSZ), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB2_CMD_BUFSZ), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB1_CMD_BUFSZ), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB2_CMD_BUFSZ), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB1_BASE_LO), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB1_BASE_HI), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB1_BUFSZ), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB2_BASE_LO), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB2_BASE_HI), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB2_BUFSZ), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB1_BASE_LO), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB1_BASE_HI), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB1_BUFSZ), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB2_BASE_LO), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB2_BASE_HI), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB2_BUFSZ), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCPF_UTCL1_STATUS), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCPC_UTCL1_STATUS), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCPG_UTCL1_STATUS), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmGDS_PROTECTION_FAULT), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmGDS_VM_PROTECTION_FAULT), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmIA_UTCL1_STATUS), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmIA_UTCL1_STATUS_2), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmPA_CL_CNTL_STATUS), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmRLC_UTCL1_STATUS), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmRMI_UTCL1_STATUS), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmSQC_DCACHE_UTCL0_STATUS), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmSQC_ICACHE_UTCL0_STATUS), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmSQG_UTCL0_STATUS), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmTCP_UTCL0_STATUS), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmWD_UTCL1_STATUS), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmGCVM_L2_PROTECTION_FAULT_CNTL), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmGCVM_L2_PROTECTION_FAULT_STATUS), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_DEBUG), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_MEC_CNTL), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_MES_CNTL), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_INSTR_PNTR), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_MEC1_INSTR_PNTR), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_MEC2_INSTR_PNTR), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_MES_DEBUG_INTERRUPT_INSTR_PNTR), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_MES_INSTR_PNTR), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_ME_INSTR_PNTR), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_PFP_INSTR_PNTR), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPC_STATUS), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmRLC_STAT), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_COMMAND), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_MESSAGE), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_ARGUMENT_1), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_ARGUMENT_2), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_ARGUMENT_3), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_ARGUMENT_4), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmSMU_RLC_RESPONSE), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SAFE_MODE), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_SAFE_MODE), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmRLC_RLCS_GPM_STAT_2), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SPP_STATUS), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmRLC_RLCS_BOOTLOAD_STATUS), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmRLC_INT_STAT), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmRLC_GPM_GENERAL_6), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmRLC_GPM_DEBUG_INST_A), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmRLC_GPM_DEBUG_INST_B), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmRLC_GPM_DEBUG_INST_ADDR), > >>> + SOC15_REG_ENTRY_STR(GC, 0, mmRLC_LX6_CORE_PDEBUG_INST) }; > >>> + > >>> static const struct soc15_reg_golden golden_settings_gc_10_1[] = { > >>> SOC15_REG_GOLDEN_VALUE(GC, 0, mmCB_HW_CONTROL_4, 0xffffffff, 0x00400014), > >>> SOC15_REG_GOLDEN_VALUE(GC, 0, mmCGTT_CPF_CLK_CTRL, > >>> 0xfcff8fff, 0xf8000100), @@ -4490,6 +4583,22 @@ static int gfx_v10_0_compute_ring_init(struct amdgpu_device *adev, int ring_id, > >>> hw_prio, NULL); > >>> } > >>> > >>> +static void gfx_v10_0_alloc_dump_mem(struct amdgpu_device *adev) { > >>> + uint32_t reg_count = ARRAY_SIZE(gc_reg_list_10_1); > >>> + uint32_t *ptr; > >>> + > >>> + ptr = kcalloc(reg_count, sizeof(uint32_t), GFP_KERNEL); > >>> + if (ptr == NULL) { > >>> + DRM_ERROR("Failed to allocate memory for IP Dump\n"); > >>> + adev->gfx.ip_dump = NULL; > >>> + adev->gfx.reg_count = 0; > >>> + } else { > >>> + adev->gfx.ip_dump = ptr; > >>> + adev->gfx.reg_count = reg_count; > >>> + } > >>> +} > >>> + > >>> static int gfx_v10_0_sw_init(void *handle) > >>> { > >>> int i, j, k, r, ring_id = 0; @@ -4642,6 +4751,8 @@ static > >>> int gfx_v10_0_sw_init(void *handle) > >>> > >>> gfx_v10_0_gpu_early_init(adev); > >>> > >>> + gfx_v10_0_alloc_dump_mem(adev); > >>> + > >>> return 0; > >>> } > >>> > >>> @@ -4694,6 +4805,8 @@ static int gfx_v10_0_sw_fini(void *handle) > >>> > >>> gfx_v10_0_free_microcode(adev); > >>> > >>> + kfree(adev->gfx.ip_dump); > >>> + > >>> return 0; > >>> } > >>> > >>> @@ -9154,6 +9267,21 @@ static void gfx_v10_0_emit_mem_sync(struct amdgpu_ring *ring) > >>> amdgpu_ring_write(ring, gcr_cntl); /* GCR_CNTL */ > >>> } > >>> > >>> +static void gfx_v10_ip_dump(void *handle) { > >>> + struct amdgpu_device *adev = (struct amdgpu_device *)handle; > >>> + uint32_t i; > >>> + uint32_t reg_count = ARRAY_SIZE(gc_reg_list_10_1); > >>> + > >>> + if (!adev->gfx.ip_dump) > >>> + return; > >>> + > >>> + amdgpu_gfx_off_ctrl(adev, false); > >>> + for (i = 0; i < reg_count; i++) > >>> + adev->gfx.ip_dump[i] = > >>> + RREG32(SOC15_REG_ENTRY_OFFSET(gc_reg_list_10_1[i])); > > This may cause a bigger hang, if PMFW is already hung (in that case > > reset also won't work, but a dump is still useful). To be on the safer > > side, there should be some sort of return value for gfx_off which > > should be checked. > > amdgpu_gfx_off_ctrl doesnt return any value to confirm if it worked or not else would have checked return value. > I guess assumption is that it will work until there is a bigger hang as you mentioned and i guess in that case nothing will work and needs a reset only. > > Regards > Sunil Khatri > > > > > Thanks, > > Lijo > > > >>> + amdgpu_gfx_off_ctrl(adev, true); } > >>> + > >>> static const struct amd_ip_funcs gfx_v10_0_ip_funcs = { > >>> .name = "gfx_v10_0", > >>> .early_init = gfx_v10_0_early_init, @@ -9170,7 +9298,7 @@ > >>> static const struct amd_ip_funcs gfx_v10_0_ip_funcs = { > >>> .set_clockgating_state = gfx_v10_0_set_clockgating_state, > >>> .set_powergating_state = gfx_v10_0_set_powergating_state, > >>> .get_clockgating_state = gfx_v10_0_get_clockgating_state, > >>> - .dump_ip_state = NULL, > >>> + .dump_ip_state = gfx_v10_ip_dump, > >>> }; > >>> > >>> static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = { > >>> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.h > >>> b/drivers/gpu/drm/amd/amdgpu/soc15.h > >>> index 1444b7765e4b..282584a48be0 100644 > >>> --- a/drivers/gpu/drm/amd/amdgpu/soc15.h > >>> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.h > >>> @@ -88,6 +88,8 @@ struct soc15_ras_field_entry { > >>> }; > >>> > >>> #define SOC15_REG_ENTRY(ip, inst, reg) ip##_HWIP, inst, > >>> reg##_BASE_IDX, reg > >>> +#define SOC15_REG_ENTRY_STR(ip, inst, reg) \ > >>> + { ip##_HWIP, inst, reg##_BASE_IDX, reg, #reg } > >>> > >>> #define SOC15_REG_ENTRY_OFFSET(entry) > >>> (adev->reg_offset[entry.hwip][entry.inst][entry.seg] + > >>> entry.reg_offset) > >>> > >>> diff --git > >>> a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h > >>> b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h > >>> index 4908044f7409..4c8e7fdb6976 100644 > >>> --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h > >>> +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h > >>> @@ -4830,6 +4830,8 @@ > >>> #define mmCP_ECC_FIRSTOCCURRENCE_RING2_BASE_IDX 0 > >>> #define mmGB_EDC_MODE 0x1e1e > >>> #define mmGB_EDC_MODE_BASE_IDX 0 > >>> +#define mmCP_DEBUG 0x1e1f > >>> +#define mmCP_DEBUG_BASE_IDX 0 > >>> #define mmCP_FETCHER_SOURCE 0x1e22 > >>> #define mmCP_FETCHER_SOURCE_BASE_IDX 0 > >>> #define mmCP_PQ_WPTR_POLL_CNTL 0x1e23 > >>> @@ -7778,6 +7780,8 @@ > >>> #define mmCP_MES_DOORBELL_CONTROL5_BASE_IDX 1 > >>> #define mmCP_MES_DOORBELL_CONTROL6 0x2841 > >>> #define mmCP_MES_DOORBELL_CONTROL6_BASE_IDX 1 > >>> +#define mmCP_MES_DEBUG_INTERRUPT_INSTR_PNTR 0x2842 > >>> +#define mmCP_MES_DEBUG_INTERRUPT_INSTR_PNTR_BASE_IDX 1 > >>> #define mmCP_MES_GP0_LO 0x2843 > >>> #define mmCP_MES_GP0_LO_BASE_IDX 1 > >>> #define mmCP_MES_GP0_HI 0x2844 > >>> @@ -9332,10 +9336,16 @@ > >>> #define mmRLC_LB_CNTR_INIT_1_BASE_IDX 1 > >>> #define mmRLC_LB_CNTR_1 0x4c1c > >>> #define mmRLC_LB_CNTR_1_BASE_IDX 1 > >>> +#define mmRLC_GPM_DEBUG_INST_ADDR 0x4c1d > >>> +#define mmRLC_GPM_DEBUG_INST_ADDR_BASE_IDX 1 > >>> #define mmRLC_JUMP_TABLE_RESTORE 0x4c1e > >>> #define mmRLC_JUMP_TABLE_RESTORE_BASE_IDX 1 > >>> #define mmRLC_PG_DELAY_2 0x4c1f > >>> #define mmRLC_PG_DELAY_2_BASE_IDX 1 > >>> +#define mmRLC_GPM_DEBUG_INST_A 0x4c22 > >>> +#define mmRLC_GPM_DEBUG_INST_A_BASE_IDX 1 > >>> +#define mmRLC_GPM_DEBUG_INST_B 0x4c23 > >>> +#define mmRLC_GPM_DEBUG_INST_B_BASE_IDX 1 > >>> #define mmRLC_GPU_CLOCK_COUNT_LSB 0x4c24 > >>> #define mmRLC_GPU_CLOCK_COUNT_LSB_BASE_IDX 1 > >>> #define mmRLC_GPU_CLOCK_COUNT_MSB 0x4c25 > >>> @@ -9720,6 +9730,8 @@ > >>> #define mmRLC_SPM_THREAD_TRACE_CTRL_BASE_IDX 1 > >>> #define mmRLC_LB_CNTR_2 0x4de7 > >>> #define mmRLC_LB_CNTR_2_BASE_IDX 1 > >>> +#define mmRLC_LX6_CORE_PDEBUG_INST 0x4deb > >>> +#define mmRLC_LX6_CORE_PDEBUG_INST_BASE_IDX 1 > >>> #define mmRLC_CPAXI_DOORBELL_MON_CTRL 0x4df1 > >>> #define mmRLC_CPAXI_DOORBELL_MON_CTRL_BASE_IDX 1 > >>> #define mmRLC_CPAXI_DOORBELL_MON_STAT 0x4df2 > >>> -- > >>> 2.34.1 > >>>