From: Avadhut Naik <Avadhut.Naik@xxxxxxx> AMD's Scalable MCA systems viz. Genoa will include two new registers: MCA_SYND1 and MCA_SYND2. These registers will include supplemental error information in addition to the existing MCA_SYND register. The data within the registers is considered valid if MCA_STATUS[SyndV] is set. Add fields for these registers as vendor-specific error information in struct mce_hw_err. Save and print these registers wherever MCA_STATUS[SyndV]/MCA_SYND is currently used. Also, modify the mce_record tracepoint to export these new registers through __dynamic_array. While the sizeof() operator has been used to determine the size of this __dynamic_array, the same, if needed in the future can be substituted by caching the size of vendor-specific error information as part of struct mce_hw_err. Note: Checkpatch warnings/errors are ignored to maintain coding style. [Yazen: Drop Yazen's Co-developed-by tag and moved SoB tag.] [Yazen: Change %Lx to %llx in TP_printk().] Signed-off-by: Avadhut Naik <Avadhut.Naik@xxxxxxx> Signed-off-by: Yazen Ghannam <yazen.ghannam@xxxxxxx> --- arch/x86/include/asm/mce.h | 12 ++++++++++++ arch/x86/kernel/cpu/mce/core.c | 26 ++++++++++++++++++-------- drivers/edac/mce_amd.c | 10 +++++++--- include/trace/events/mce.h | 9 +++++++-- 4 files changed, 44 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 99eb72dd7d05..1bd3f1e41dbb 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -122,6 +122,9 @@ #define MSR_AMD64_SMCA_MC0_DESTAT 0xc0002008 #define MSR_AMD64_SMCA_MC0_DEADDR 0xc0002009 #define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a +/* Registers MISC2 to MISC4 are at offsets B to D. */ +#define MSR_AMD64_SMCA_MC0_SYND1 0xc000200e +#define MSR_AMD64_SMCA_MC0_SYND2 0xc000200f #define MSR_AMD64_SMCA_MCx_CTL(x) (MSR_AMD64_SMCA_MC0_CTL + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_STATUS(x) (MSR_AMD64_SMCA_MC0_STATUS + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_ADDR(x) (MSR_AMD64_SMCA_MC0_ADDR + 0x10*(x)) @@ -132,6 +135,8 @@ #define MSR_AMD64_SMCA_MCx_DESTAT(x) (MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x))) +#define MSR_AMD64_SMCA_MCx_SYND1(x) (MSR_AMD64_SMCA_MC0_SYND1 + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_SYND2(x) (MSR_AMD64_SMCA_MC0_SYND2 + 0x10*(x)) #define XEC(x, mask) (((x) >> 16) & mask) @@ -189,6 +194,13 @@ enum mce_notifier_prios { struct mce_hw_err { struct mce m; + + union vendor_info { + struct { + u64 synd1; + u64 synd2; + } amd; + } vi; }; struct notifier_block; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 8db8ed34b200..e153a21bdb1b 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -198,6 +198,10 @@ static void __print_mce(struct mce_hw_err *err) if (mce_flags.smca) { if (m->synd) pr_cont("SYND %llx ", m->synd); + if (err->vi.amd.synd1) + pr_cont("SYND1 %llx ", err->vi.amd.synd1); + if (err->vi.amd.synd2) + pr_cont("SYND2 %llx ", err->vi.amd.synd2); if (m->ipid) pr_cont("IPID %llx ", m->ipid); } @@ -633,8 +637,10 @@ static struct notifier_block mce_default_nb = { /* * Read ADDR and MISC registers. */ -static noinstr void mce_read_aux(struct mce *m, int i) +static noinstr void mce_read_aux(struct mce_hw_err *err, int i) { + struct mce *m = &err->m; + if (m->status & MCI_STATUS_MISCV) m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC)); @@ -656,8 +662,11 @@ static noinstr void mce_read_aux(struct mce *m, int i) if (mce_flags.smca) { m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i)); - if (m->status & MCI_STATUS_SYNDV) + if (m->status & MCI_STATUS_SYNDV) { m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i)); + err->vi.amd.synd1 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(i)); + err->vi.amd.synd2 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(i)); + } } } @@ -723,7 +732,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) /* If this entry is not valid, ignore it */ if (!(m->status & MCI_STATUS_VAL)) { if (smca_destat_is_valid(i)) { - mce_read_aux(m, i); + mce_read_aux(&err, i); goto clear_it; } @@ -773,7 +782,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (flags & MCP_DONTLOG) goto clear_it; - mce_read_aux(m, i); + mce_read_aux(&err, i); m->severity = mce_severity(m, NULL, NULL, false); /* @@ -915,9 +924,10 @@ static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_reg * Do a quick check if any of the events requires a panic. * This decides if we keep the events around or clear them. */ -static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, +static __always_inline int mce_no_way_out(struct mce_hw_err *err, char **msg, unsigned long *validp, struct pt_regs *regs) { + struct mce *m = &err->m; char *tmp = *msg; int i; @@ -935,7 +945,7 @@ static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned lo m->bank = i; if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) { - mce_read_aux(m, i); + mce_read_aux(err, i); *msg = tmp; return 1; } @@ -1333,7 +1343,7 @@ __mc_scan_banks(struct mce_hw_err *err, struct pt_regs *regs, struct mce *final, if (severity == MCE_NO_SEVERITY) continue; - mce_read_aux(m, i); + mce_read_aux(err, i); /* assuming valid severity level != 0 */ m->severity = severity; @@ -1534,7 +1544,7 @@ noinstr void do_machine_check(struct pt_regs *regs) final = this_cpu_ptr(&hw_errs_seen); final->m = *m; - no_way_out = mce_no_way_out(m, &msg, valid_banks, regs); + no_way_out = mce_no_way_out(&err, &msg, valid_banks, regs); barrier(); diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 701bc9556414..4d2929770620 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -1275,7 +1275,8 @@ static const char *decode_error_status(struct mce *m) static int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) { - struct mce *m = (struct mce *)data; + struct mce_hw_err *err = (struct mce_hw_err *)data; + struct mce *m = &err->m; unsigned int fam = x86_family(m->cpuid); int ecc; @@ -1333,8 +1334,11 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) if (boot_cpu_has(X86_FEATURE_SMCA)) { pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid); - if (m->status & MCI_STATUS_SYNDV) - pr_cont(", Syndrome: 0x%016llx", m->synd); + if (m->status & MCI_STATUS_SYNDV) { + pr_cont(", Syndrome: 0x%016llx\n", m->synd); + pr_emerg(HW_ERR "Syndrome1: 0x%016llx, Syndrome2: 0x%016llx", + err->vi.amd.synd1, err->vi.amd.synd2); + } pr_cont("\n"); diff --git a/include/trace/events/mce.h b/include/trace/events/mce.h index b093cb28f6dd..29d079961aac 100644 --- a/include/trace/events/mce.h +++ b/include/trace/events/mce.h @@ -33,6 +33,8 @@ TRACE_EVENT(mce_record, __field( u8, cs ) __field( u8, bank ) __field( u8, cpuvendor ) + __field( u8, len ) + __dynamic_array(u8, v_data, sizeof(err->vi)) ), TP_fast_assign( @@ -53,9 +55,11 @@ TRACE_EVENT(mce_record, __entry->cs = err->m.cs; __entry->bank = err->m.bank; __entry->cpuvendor = err->m.cpuvendor; + __entry->len = sizeof(err->vi); + memcpy(__get_dynamic_array(v_data), &err->vi, sizeof(err->vi)); ), - TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, IPID: %016Lx, ADDR/MISC/SYND: %016Lx/%016Lx/%016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PROCESSOR: %u:%x, TIME: %llu, SOCKET: %u, APIC: %x", + TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016llx, IPID: %016llx, ADDR/MISC/SYND: %016llx/%016llx/%016llx, RIP: %02x:<%016llx>, TSC: %llx, PROCESSOR: %u:%x, TIME: %llu, SOCKET: %u, APIC: %x, Vendor Data: %s", __entry->cpu, __entry->mcgcap, __entry->mcgstatus, __entry->bank, __entry->status, @@ -66,7 +70,8 @@ TRACE_EVENT(mce_record, __entry->cpuvendor, __entry->cpuid, __entry->walltime, __entry->socketid, - __entry->apicid) + __entry->apicid, + __print_array(__get_dynamic_array(v_data), __entry->len / 8, 8)) ); #endif /* _TRACE_MCE_H */ -- 2.34.1