Add support for decoding CXL Component Events Memory Module Event Record
as defined in CXL rev 3.0 section 8.2.9.2.1.3.

Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@xxxxxxx>
---
 drivers/firmware/efi/cper.c     |   8 +++
 drivers/firmware/efi/cper_cxl.c | 110 ++++++++++++++++++++++++++++++++
 drivers/firmware/efi/cper_cxl.h |  32 ++++++++++
 3 files changed, 150 insertions(+)

diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index 1d182487fa13..5b45bf513512 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -623,6 +623,14 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata
 			cper_print_dram(newpfx, dram);
 		else
 			goto err_section_too_small;
+	} else if (guid_equal(sec_type, &CPER_SEC_CXL_MM_MODULE)) {
+		struct cper_sec_comp_event *mm_module = acpi_hest_get_payload(gdata);
+
+		printk("%ssection_type: CXL Memory Module Event\n", newpfx);
+		if (gdata->error_data_length >= sizeof(*mm_module))
+			cper_print_mm_module(newpfx, mm_module);
+		else
+			goto err_section_too_small;
 	} else {
 		const void *err = acpi_hest_get_payload(gdata);
 
diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
index 3fba360b7dc6..5be10ca20c7c 100644
--- a/drivers/firmware/efi/cper_cxl.c
+++ b/drivers/firmware/efi/cper_cxl.c
@@ -39,6 +39,11 @@
 #define DRAM_VALID_COLUMN			BIT_ULL(6)
 #define DRAM_VALID_CORRECTION_MASK		BIT_ULL(7)
 
+#define DHI_AS_LIFE_USED(as)			((as) & GENMASK(1, 0))
+#define DHI_AS_DEV_TEMP(as)			(((as) & GENMASK(3, 2)) >> 2)
+#define DHI_AS_COR_VOL_ERR_CNT(as)		(((as) & GENMASK(4, 4)) >> 4)
+#define DHI_AS_COR_PER_ERR_CNT(as)		(((as) & GENMASK(5, 5)) >> 5)
+
 /* CXL RAS Capability Structure, CXL v3.0 sec 8.2.4.16 */
 struct cxl_ras_capability_regs {
 	u32 uncor_status;
@@ -119,6 +124,45 @@ static const char * const dram_mem_type_strs[] = {
 	"data path error",
 };
 
+static const char * const mm_module_event_type_strs[] = {
+	"health status change",
+	"media status change",
+	"life used change",
+	"temperature change",
+	"data path error",
+	"lsa error",
+};
+
+static const char * const dhi_health_status_strs[] = {
+	"maintenance needed",
+	"performance degraded",
+	"hardware replacement needed",
+};
+
+static const char * const dhi_media_status_strs[] = {
+	"normal",
+	"not ready",
+	"write persistency lost",
+	"all data lost",
+	"write persistency loss in the event of power loss",
+	"write persistency loss in the event of shutdown",
+	"write persistency loss imminent",
+	"all data loss in the event of power loss",
+	"all data loss in the event of shutdown",
+	"all data loss imminent",
+};
+
+static const char * const dhi_two_bit_status_strs[] = {
+	"normal",
+	"warning",
+	"critical",
+};
+
+static const char * const dhi_one_bit_status_strs[] = {
+	"normal",
+	"warning",
+};
+
 void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err)
 {
 	if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE)
@@ -409,3 +453,69 @@ void cper_print_dram(const char *pfx, const struct cper_sec_comp_event *event)
 				     dram->cor_mask, sizeof(dram->cor_mask), 0);
 	}
 }
+
+static void cper_print_mm_module_dhi(const char *pfx, const struct dev_health_info *dhi)
+{
+	pr_info("%s health status: 0x%02x\n", pfx, dhi->health_status);
+	cper_print_bits(pfx, dhi->health_status, dhi_health_status_strs,
+			ARRAY_SIZE(dhi_health_status_strs));
+
+	pr_info("%s media status: %d, %s\n", pfx, dhi->media_status,
+		dhi->media_status < ARRAY_SIZE(dhi_media_status_strs)
+		? dhi_media_status_strs[dhi->media_status] : "unknown");
+
+	pr_info("%s current life used: %lu, %s\n", pfx,
+		DHI_AS_LIFE_USED(dhi->add_status),
+		DHI_AS_LIFE_USED(dhi->add_status) < ARRAY_SIZE(dhi_two_bit_status_strs)
+		? dhi_two_bit_status_strs[DHI_AS_LIFE_USED(dhi->add_status)]
+		: "unknown");
+
+	pr_info("%s current device temperature: %lu, %s\n", pfx,
+		DHI_AS_DEV_TEMP(dhi->add_status),
+		DHI_AS_DEV_TEMP(dhi->add_status) < ARRAY_SIZE(dhi_two_bit_status_strs)
+		? dhi_two_bit_status_strs[DHI_AS_DEV_TEMP(dhi->add_status)]
+		: "unknown");
+
+	pr_info("%s current corrected volatile err count: %lu, %s\n", pfx,
+		DHI_AS_COR_VOL_ERR_CNT(dhi->add_status),
+		DHI_AS_COR_VOL_ERR_CNT(dhi->add_status) < ARRAY_SIZE(dhi_one_bit_status_strs)
+		? dhi_one_bit_status_strs[DHI_AS_COR_VOL_ERR_CNT(dhi->add_status)]
+		: "unknown");
+
+	pr_info("%s current corrected persistent err count: %lu, %s\n", pfx,
+		DHI_AS_COR_PER_ERR_CNT(dhi->add_status),
+		DHI_AS_COR_PER_ERR_CNT(dhi->add_status) < ARRAY_SIZE(dhi_one_bit_status_strs)
+		? dhi_one_bit_status_strs[DHI_AS_COR_PER_ERR_CNT(dhi->add_status)]
+		: "unknown");
+
+	pr_info("%s life used percent: 0x%02x\n", pfx, dhi->life_used);
+	pr_info("%s device temperature degree celsius: 0x%04x\n", pfx,
+		dhi->device_temp);
+	pr_info("%s dirty shutdown count: 0x%08x\n", pfx,
+		dhi->dirty_shutdown_cnt);
+	pr_info("%s total corrected volatile error count: 0x%08x\n", pfx,
+		dhi->cor_vol_err_cnt);
+	pr_info("%s total corrected persistent error count: 0x%08x\n", pfx,
+		dhi->cor_per_err_cnt);
+}
+
+void cper_print_mm_module(const char *pfx, const struct cper_sec_comp_event *event)
+{
+	struct cper_sec_mm_module *mm_module;
+
+	cper_print_comp_event(pfx, event);
+
+	if (!(event->valid_bits & COMP_EVENT_VALID_EVENT_LOG))
+		return;
+
+	mm_module = (struct cper_sec_mm_module *)(event + 1);
+
+	cper_print_event_record(pfx, &mm_module->record);
+
+	pr_info("%s device event type: %d, %s\n", pfx, mm_module->event_type,
+		mm_module->event_type < ARRAY_SIZE(mm_module_event_type_strs)
+		? mm_module_event_type_strs[mm_module->event_type]
+		: "unknown");
+
+	cper_print_mm_module_dhi(pfx, &mm_module->dhi);
+}
diff --git a/drivers/firmware/efi/cper_cxl.h b/drivers/firmware/efi/cper_cxl.h
index 967847b571cb..c37dd624a522 100644
--- a/drivers/firmware/efi/cper_cxl.h
+++ b/drivers/firmware/efi/cper_cxl.h
@@ -25,6 +25,11 @@
 	GUID_INIT(0x601DCBB3, 0x9C06, 0x4EAB, 0xB8, 0xAF, 0x4E, 0x9B,	\
 		  0xFB, 0x5C, 0x96, 0x24)
 
+/* CXL Memory Module Event Section */
+#define CPER_SEC_CXL_MM_MODULE						\
+	GUID_INIT(0xFE927475, 0xDD59, 0x4339, 0xA5, 0x86, 0x79, 0xBA,	\
+		  0xB1, 0x13, 0xB7, 0x74)
+
 #pragma pack(1)
 
 /* Compute Express Link Protocol Error Section, UEFI v2.10 sec N.2.13 */
@@ -147,10 +152,37 @@ struct cper_sec_dram {
 	u8 reserved[23];
 };
 
+/*
+ * CXL Memory Module Event
+ * Device Health Information - DHI
+ * CXL rev 3.0 sec 8.2.9.8.3.1; Table 8-100
+ */
+struct dev_health_info {
+	u8 health_status;
+	u8 media_status;
+	u8 add_status;
+	u8 life_used;
+	u16 device_temp;
+	u32 dirty_shutdown_cnt;
+	u32 cor_vol_err_cnt;
+	u32 cor_per_err_cnt;
+};
+
+/* CXL Memory Module Event Record
+ * CXL rev 3.0 sec 8.2.9.2.1.3; Table 8-45
+ */
+struct cper_sec_mm_module {
+	struct common_event_record record;
+	u8 event_type;
+	struct dev_health_info dhi;
+	u8 reserved[61];
+};
+
 #pragma pack()
 
 void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err);
 void cper_print_gen_media(const char *pfx, const struct cper_sec_comp_event *event);
 void cper_print_dram(const char *pfx, const struct cper_sec_comp_event *event);
+void cper_print_mm_module(const char *pfx, const struct cper_sec_comp_event *event);
 
 #endif //__CPER_CXL_
-- 
2.17.1