On Fri, Apr 12, 2024 at 1:05 PM Khatri, Sunil <sukhatri@xxxxxxx> wrote: > > > On 4/12/2024 8:50 PM, Alex Deucher wrote: > > On Fri, Apr 12, 2024 at 10:00 AM Sunil Khatri <sunil.khatri@xxxxxxx> wrote: > >> Adding initial set of registers for ipdump during > >> devcoredump starting with gfx10 gc registers. > >> > >> ip dump is triggered when gpu reset happens via > >> devcoredump and the memory is allocated by each > >> ip and is freed once the dump is complete by > >> devcoredump. > >> > >> Signed-off-by: Sunil Khatri <sunil.khatri@xxxxxxx> > >> --- > >> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 16 +++ > >> .../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 22 +++ > > I would split this into two patches, one to add the core > > infrastructure in devcoredump and one to add gfx10 support. The core > > support could be squashed into patch 1 as well. > Sure > > > > > >> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 127 +++++++++++++++++- > >> .../include/asic_reg/gc/gc_10_1_0_offset.h | 12 ++ > >> 4 files changed, 176 insertions(+), 1 deletion(-) > >> > >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > >> index 65c17c59c152..e173ad86a241 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > >> @@ -139,6 +139,18 @@ enum amdgpu_ss { > >> AMDGPU_SS_DRV_UNLOAD > >> }; > >> > >> +struct hwip_reg_entry { > >> + u32 hwip; > >> + u32 inst; > >> + u32 seg; > >> + u32 reg_offset; > >> +}; > >> + > >> +struct reg_pair { > >> + u32 offset; > >> + u32 value; > >> +}; > >> + > >> struct amdgpu_watchdog_timer { > >> bool timeout_fatal_disable; > >> uint32_t period; /* maxCycles = (1 << period), the number of cycles before a timeout */ > >> @@ -1152,6 +1164,10 @@ struct amdgpu_device { > >> bool debug_largebar; > >> bool debug_disable_soft_recovery; > >> bool debug_use_vram_fw_buf; > >> + > >> + /* IP register dump */ > >> + struct reg_pair *ip_dump; > >> + uint32_t num_regs; > >> }; > >> > >> static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev, > >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c > >> index 1129e5e5fb42..2079f67c9fac 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c > >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c > >> @@ -261,6 +261,18 @@ amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count, > >> drm_printf(&p, "Faulty page starting at address: 0x%016llx\n", fault_info->addr); > >> drm_printf(&p, "Protection fault status register: 0x%x\n\n", fault_info->status); > >> > >> + /* Add IP dump for each ip */ > >> + if (coredump->adev->ip_dump != NULL) { > >> + struct reg_pair *pair; > >> + > >> + pair = (struct reg_pair *)coredump->adev->ip_dump; > >> + drm_printf(&p, "IP register dump\n"); > >> + drm_printf(&p, "Offset \t Value\n"); > >> + for (int i = 0; i < coredump->adev->num_regs; i++) > >> + drm_printf(&p, "0x%04x \t 0x%08x\n", pair[i].offset, pair[i].value); > >> + drm_printf(&p, "\n"); > >> + } > >> + > >> /* Add ring buffer information */ > >> drm_printf(&p, "Ring buffer information\n"); > >> for (int i = 0; i < coredump->adev->num_rings; i++) { > >> @@ -299,6 +311,11 @@ amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count, > >> > >> static void amdgpu_devcoredump_free(void *data) > >> { > >> + struct amdgpu_coredump_info *temp = data; > >> + > >> + kfree(temp->adev->ip_dump); > >> + temp->adev->ip_dump = NULL; > >> + temp->adev->num_regs = 0; > >> kfree(data); > >> } > >> > >> @@ -337,6 +354,11 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, > >> > >> coredump->adev = adev; > >> > >> + /* Trigger ip dump here to capture the value of registers */ > >> + for (int i = 0; i < adev->num_ip_blocks; i++) > >> + if (adev->ip_blocks[i].version->funcs->dump_ip_state) > >> + adev->ip_blocks[i].version->funcs->dump_ip_state((void *)adev); > >> + > > This seems too complicated. I think it would be easier to > This is how all other per IP functions are called. What do you suggest ? > > > >> ktime_get_ts64(&coredump->reset_time); > >> > >> dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT, > >> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > >> index a0bc4196ff8b..66e2915a8b4d 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > >> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > >> @@ -276,6 +276,99 @@ MODULE_FIRMWARE("amdgpu/gc_10_3_7_mec.bin"); > >> MODULE_FIRMWARE("amdgpu/gc_10_3_7_mec2.bin"); > >> MODULE_FIRMWARE("amdgpu/gc_10_3_7_rlc.bin"); > >> > >> +static const struct hwip_reg_entry gc_reg_list_10_1[] = { > >> + { SOC15_REG_ENTRY(GC, 0, mmGRBM_STATUS) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmGRBM_STATUS2) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmGRBM_STATUS3) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_STALLED_STAT1) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_STALLED_STAT2) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_CPC_STALLED_STAT1) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_CPF_STALLED_STAT1) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_BUSY_STAT) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_CPC_BUSY_STAT) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_CPF_BUSY_STAT) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_CPC_BUSY_STAT2) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_CPF_BUSY_STAT2) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_CPF_STATUS) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_GFX_ERROR) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_GFX_HPD_STATUS0) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_RB_BASE) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_RB_RPTR) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_RB_WPTR) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_RB0_BASE) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_RB0_RPTR) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_RB0_WPTR) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_RB1_BASE) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_RB1_RPTR) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_RB1_WPTR) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_RB2_BASE) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_RB2_WPTR) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_RB2_WPTR) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_CE_IB1_CMD_BUFSZ) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_CE_IB2_CMD_BUFSZ) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_IB1_CMD_BUFSZ) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_IB2_CMD_BUFSZ) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_CE_IB1_BASE_LO) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_CE_IB1_BASE_HI) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_CE_IB1_BUFSZ) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_CE_IB2_BASE_LO) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_CE_IB2_BASE_HI) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_CE_IB2_BUFSZ) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_IB1_BASE_LO) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_IB1_BASE_HI) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_IB1_BUFSZ) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_IB2_BASE_LO) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_IB2_BASE_HI) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_IB2_BUFSZ) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCPF_UTCL1_STATUS) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCPC_UTCL1_STATUS) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCPG_UTCL1_STATUS) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmGDS_PROTECTION_FAULT) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmGDS_VM_PROTECTION_FAULT) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmIA_UTCL1_STATUS) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmIA_UTCL1_STATUS_2) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmPA_CL_CNTL_STATUS) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmRLC_UTCL1_STATUS) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmRMI_UTCL1_STATUS) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmSQC_DCACHE_UTCL0_STATUS) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmSQC_ICACHE_UTCL0_STATUS) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmSQG_UTCL0_STATUS) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmTCP_UTCL0_STATUS) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmWD_UTCL1_STATUS) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmGCVM_L2_PROTECTION_FAULT_CNTL) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmGCVM_L2_PROTECTION_FAULT_STATUS) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_DEBUG) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_MEC_CNTL) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_MES_CNTL) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_CE_INSTR_PNTR) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_MEC1_INSTR_PNTR) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_MEC2_INSTR_PNTR) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_MES_DEBUG_INTERRUPT_INSTR_PNTR) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_MES_INSTR_PNTR) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_ME_INSTR_PNTR) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_PFP_INSTR_PNTR) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmCP_CPC_STATUS) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmRLC_STAT) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmRLC_SMU_COMMAND) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmRLC_SMU_MESSAGE) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmRLC_SMU_ARGUMENT_1) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmRLC_SMU_ARGUMENT_2) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmRLC_SMU_ARGUMENT_3) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmRLC_SMU_ARGUMENT_4) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmSMU_RLC_RESPONSE) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmRLC_SAFE_MODE) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmRLC_SMU_SAFE_MODE) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmRLC_RLCS_GPM_STAT_2) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmRLC_SPP_STATUS) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmRLC_RLCS_BOOTLOAD_STATUS) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmRLC_INT_STAT) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmRLC_GPM_GENERAL_6) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmRLC_GPM_DEBUG_INST_A) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmRLC_GPM_DEBUG_INST_B) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmRLC_GPM_DEBUG_INST_ADDR) }, > >> + { SOC15_REG_ENTRY(GC, 0, mmRLC_LX6_CORE_PDEBUG_INST) } > >> +}; > >> + > >> static const struct soc15_reg_golden golden_settings_gc_10_1[] = { > >> SOC15_REG_GOLDEN_VALUE(GC, 0, mmCB_HW_CONTROL_4, 0xffffffff, 0x00400014), > >> SOC15_REG_GOLDEN_VALUE(GC, 0, mmCGTT_CPF_CLK_CTRL, 0xfcff8fff, 0xf8000100), > >> @@ -9154,6 +9247,38 @@ static void gfx_v10_0_emit_mem_sync(struct amdgpu_ring *ring) > >> amdgpu_ring_write(ring, gcr_cntl); /* GCR_CNTL */ > >> } > >> > >> +static void gfx_v10_ip_dump(void *handle) > >> +{ > >> + struct amdgpu_device *adev = (struct amdgpu_device *)handle; > >> + uint32_t reg_count = ARRAY_SIZE(gc_reg_list_10_1); > >> + struct reg_pair *ptr, *gfx10_pair; > >> + uint32_t new_count; > >> + > >> + /* > >> + * Allocate memory based on the no of registers for each block. > >> + * Since adev->ip_dump is common pointer and we add more > >> + * memory to it as we move through different IP's of the ASIC. > >> + * > >> + * This memory needs to be freed after gpu reset once the ip > >> + * dump is done in the devcoredump. > >> + */ > >> + new_count = adev->num_regs + reg_count; > >> + ptr = krealloc_array(adev->ip_dump, new_count, sizeof(struct reg_pair), GFP_KERNEL); > >> + if (ptr) { > >> + adev->ip_dump = ptr; > >> + gfx10_pair = &ptr[adev->num_regs]; > >> + adev->num_regs = new_count; > >> + } else { > >> + DRM_ERROR("Failed to allocate memory for IP Dump\n"); > >> + return; > >> + } > >> + > >> + for (uint32_t i = 0; i < reg_count; i++) { > > Some compilers don't like variables defined within code. > > Sure > > > > > >> + gfx10_pair[i].offset = gc_reg_list_10_1[i].reg_offset; > >> + gfx10_pair[i].value = RREG32(SOC15_REG_ENTRY_OFFSET(gc_reg_list_10_1[i])); > >> + } > >> +} > > This seems too complicated. Just pass drm_printer to the ip_dump > > callbacks and let each IP dump their state directly. E.g., > > > > for (i = 0; i < adev->num_ip_blocks; i++) { > > if (adev->ip_blocks[i].version->funcs->dump_ip_state) { > > drm_printf(&p, "$s IP state\n", > > adev->ip_blocks[i].version->funcs->name); > > > > adev->ip_blocks[i].version->funcs->dump_ip_state((void *)adev, &p); > > } > > } > > I did try this but there is a problem in that approach. drm_printer is > created on the buffer which is provided by "amdgpu_devcoredump_read" and > "amdgpu_devcoredump_read" is only called when the user opens the file > > cat /sys/class/devcoredump/devcd1/data and by that time gpu is reset and > registers dont hold the value of interest. I tried this approach first > and got all FFFF or reset values of the registers. > We need to capture the values in "amdgpu_coredump" which is immediately > called after GPU reset and values are what we need for debugging. > > Crux is we need to capture data after gpu reset but read depends on when > user open the file which is too late and hence we need to capture all > the data in advance immediately after reset before recovery. Ah, ok. I would suggest pre-allocating the memory to store the dump state in each IP block in sw_init(). I guess we'd probably need two IP callbacks, dump_state() and print_state(). Then call the ip->dump_state() in the GPU recovery sequence before we try soft resets, then in devcore dump you can call the ip->print_state() callback to print the state to the devcore dump output. Alex > > Regards > > Sunil Khatri > > > > > Alex > > > >> + > >> static const struct amd_ip_funcs gfx_v10_0_ip_funcs = { > >> .name = "gfx_v10_0", > >> .early_init = gfx_v10_0_early_init, > >> @@ -9170,7 +9295,7 @@ static const struct amd_ip_funcs gfx_v10_0_ip_funcs = { > >> .set_clockgating_state = gfx_v10_0_set_clockgating_state, > >> .set_powergating_state = gfx_v10_0_set_powergating_state, > >> .get_clockgating_state = gfx_v10_0_get_clockgating_state, > >> - .dump_ip_state = NULL, > >> + .dump_ip_state = gfx_v10_ip_dump, > >> }; > >> > >> static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = { > >> diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h > >> index 4908044f7409..4c8e7fdb6976 100644 > >> --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h > >> +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h > >> @@ -4830,6 +4830,8 @@ > >> #define mmCP_ECC_FIRSTOCCURRENCE_RING2_BASE_IDX 0 > >> #define mmGB_EDC_MODE 0x1e1e > >> #define mmGB_EDC_MODE_BASE_IDX 0 > >> +#define mmCP_DEBUG 0x1e1f > >> +#define mmCP_DEBUG_BASE_IDX 0 > >> #define mmCP_FETCHER_SOURCE 0x1e22 > >> #define mmCP_FETCHER_SOURCE_BASE_IDX 0 > >> #define mmCP_PQ_WPTR_POLL_CNTL 0x1e23 > >> @@ -7778,6 +7780,8 @@ > >> #define mmCP_MES_DOORBELL_CONTROL5_BASE_IDX 1 > >> #define mmCP_MES_DOORBELL_CONTROL6 0x2841 > >> #define mmCP_MES_DOORBELL_CONTROL6_BASE_IDX 1 > >> +#define mmCP_MES_DEBUG_INTERRUPT_INSTR_PNTR 0x2842 > >> +#define mmCP_MES_DEBUG_INTERRUPT_INSTR_PNTR_BASE_IDX 1 > >> #define mmCP_MES_GP0_LO 0x2843 > >> #define mmCP_MES_GP0_LO_BASE_IDX 1 > >> #define mmCP_MES_GP0_HI 0x2844 > >> @@ -9332,10 +9336,16 @@ > >> #define mmRLC_LB_CNTR_INIT_1_BASE_IDX 1 > >> #define mmRLC_LB_CNTR_1 0x4c1c > >> #define mmRLC_LB_CNTR_1_BASE_IDX 1 > >> +#define mmRLC_GPM_DEBUG_INST_ADDR 0x4c1d > >> +#define mmRLC_GPM_DEBUG_INST_ADDR_BASE_IDX 1 > >> #define mmRLC_JUMP_TABLE_RESTORE 0x4c1e > >> #define mmRLC_JUMP_TABLE_RESTORE_BASE_IDX 1 > >> #define mmRLC_PG_DELAY_2 0x4c1f > >> #define mmRLC_PG_DELAY_2_BASE_IDX 1 > >> +#define mmRLC_GPM_DEBUG_INST_A 0x4c22 > >> +#define mmRLC_GPM_DEBUG_INST_A_BASE_IDX 1 > >> +#define mmRLC_GPM_DEBUG_INST_B 0x4c23 > >> +#define mmRLC_GPM_DEBUG_INST_B_BASE_IDX 1 > >> #define mmRLC_GPU_CLOCK_COUNT_LSB 0x4c24 > >> #define mmRLC_GPU_CLOCK_COUNT_LSB_BASE_IDX 1 > >> #define mmRLC_GPU_CLOCK_COUNT_MSB 0x4c25 > >> @@ -9720,6 +9730,8 @@ > >> #define mmRLC_SPM_THREAD_TRACE_CTRL_BASE_IDX 1 > >> #define mmRLC_LB_CNTR_2 0x4de7 > >> #define mmRLC_LB_CNTR_2_BASE_IDX 1 > >> +#define mmRLC_LX6_CORE_PDEBUG_INST 0x4deb > >> +#define mmRLC_LX6_CORE_PDEBUG_INST_BASE_IDX 1 > >> #define mmRLC_CPAXI_DOORBELL_MON_CTRL 0x4df1 > >> #define mmRLC_CPAXI_DOORBELL_MON_CTRL_BASE_IDX 1 > >> #define mmRLC_CPAXI_DOORBELL_MON_STAT 0x4df2 > >> -- > >> 2.34.1 > >>