RAS errors are typically exposed to user-space programs using
tracepoints, allowing tools like rasdaemon to decode and post-process
them. AMDGPU can follow the same approach, which offers the following
benefits:

1. It can proactively notify users of RAS events, eliminating the need
   to monitor /dev/kmsg.
2. It allows for further post-processing similar to AMD SMCA[1].

Add the amdgpu_mca_bank_dumps tracepoint and emit it at the end of
amdgpu_mca_smu_mca_bank_dump(). Each event records the RAS event ID,
the bank entry index, and the STATUS, ADDR, MISC0, IPID and SYND
registers of the dumped MCA bank entry.

[1]: https://github.com/mchehab/rasdaemon/commit/932118

Signed-off-by: Ruidong Tian <tianruidong@xxxxxxxxxxxxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c   |  3 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h | 31 +++++++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
index 3ca03b5e0f91..9daa95365457 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
@@ -23,6 +23,7 @@
 #include "amdgpu_ras.h"
 #include "amdgpu.h"
 #include "amdgpu_mca.h"
+#include "amdgpu_trace.h"
 
 #include "umc/umc_6_7_0_offset.h"
 #include "umc/umc_6_7_0_sh_mask.h"
@@ -287,6 +288,8 @@ static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, st
 		      idx, entry->regs[MCA_REG_IDX_IPID]);
 	RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].SYND=0x%016llx\n",
 		      idx, entry->regs[MCA_REG_IDX_SYND]);
+
+	trace_amdgpu_mca_bank_dumps(event_id, idx, entry);
 }
 
 static int amdgpu_mca_smu_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, uint32_t *count)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
index 383fce40d4dd..a0ba79394099 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
@@ -554,6 +554,37 @@ TRACE_EVENT(amdgpu_reset_reg_dumps,
 		      __entry->value)
 );
 
+TRACE_EVENT(amdgpu_mca_bank_dumps,
+	    TP_PROTO(uint64_t event_id, int idx, struct mca_bank_entry *e),
+	    TP_ARGS(event_id, idx, e),
+	    TP_STRUCT__entry(
+			     __field(uint64_t, event_id)
+			     __field(int, idx)
+			     __field(uint64_t, status)
+			     __field(uint64_t, addr)
+			     __field(uint64_t, misc0)
+			     __field(uint64_t, ipid)
+			     __field(uint64_t, synd)
+	    ),
+	    TP_fast_assign(
+			   __entry->event_id = event_id;
+			   __entry->idx = idx;
+			   __entry->status = e->regs[MCA_REG_IDX_STATUS];
+			   __entry->addr = e->regs[MCA_REG_IDX_ADDR];
+			   __entry->misc0 = e->regs[MCA_REG_IDX_MISC0];
+			   __entry->ipid = e->regs[MCA_REG_IDX_IPID];
+			   __entry->synd = e->regs[MCA_REG_IDX_SYND];
+	    ),
+	    TP_printk("amdgpu mca bank dump: event_id: %llu, idx: %d, STATUS: 0x%016llx, ADDR: 0x%016llx, MISC0: 0x%016llx, IPID: 0x%016llx, SYND: 0x%016llx",
+		      __entry->event_id,
+		      __entry->idx,
+		      __entry->status,
+		      __entry->addr,
+		      __entry->misc0,
+		      __entry->ipid,
+		      __entry->synd)
+);
+
 #undef AMDGPU_JOB_GET_TIMELINE_NAME
 #endif
 
-- 
2.33.1