Smita Koralahalli wrote: > On 1/8/2024 8:58 AM, Jonathan Cameron wrote: > > On Wed, 20 Dec 2023 16:17:27 -0800 > > Ira Weiny <ira.weiny@xxxxxxxxx> wrote: > > > >> Series status/background > >> ======================== > >> > >> Smita has been a great help with this series. Thank you again! > >> > >> Smita's testing found that the GHES code ended up printing the events > >> twice. This version avoids the duplicate print by calling the callback > >> from the GHES code instead of the EFI code as suggested by Dan. > > > > I'm not sure this is working as intended. > > > > There is nothing gating the call in ghes_proc() of ghes_print_estatus() > > and now the EFI code handling that pretty printed things is missing we get > > the horrible kernel logging for an unknown block instead. > > > > So I think we need some minimal code in cper.c to match the guids then not > > log them (on basis we are arguing there is no need for new cper records). > > Otherwise we are in for some messy kernel logs > > > > Something like: > > > > {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 1 > > {1}[Hardware Error]: event severity: recoverable > > {1}[Hardware Error]: Error 0, type: recoverable > > {1}[Hardware Error]: section type: unknown, fbcd0a77-c260-417f-85a9-088b1621eba6 > > {1}[Hardware Error]: section length: 0x90 > > {1}[Hardware Error]: 00000000: 00000090 00000007 00000000 0d938086 ................ > > {1}[Hardware Error]: 00000010: 00100000 00000000 00040000 00000000 ................ > > {1}[Hardware Error]: 00000020: 00000000 00000000 00000000 00000000 ................ > > {1}[Hardware Error]: 00000030: 00000000 00000000 00000000 00000000 ................ > > {1}[Hardware Error]: 00000040: 00000000 00000000 00000000 00000000 ................ > > {1}[Hardware Error]: 00000050: 00000000 00000000 00000000 00000000 ................ > > {1}[Hardware Error]: 00000060: 00000000 00000000 00000000 00000000 ................ 
> > {1}[Hardware Error]: 00000070: 00000000 00000000 00000000 00000000 ................ > > {1}[Hardware Error]: 00000080: 00000000 00000000 00000000 00000000 ................ > > cxl_general_media: memdev=mem1 host=0000:10:00.0 serial=4 log=Informational : time=0 uuid=fbcd0a77-c260-417f-85a9-088b1621eba6 len=0 flags='' handle=0 related_handle=0 maint_op_class=0 : dpa=0 dpa_flags='' descriptor='' type='ECC Error' transaction_type='Unknown' channel=0 rank=0 device=0 comp_id=00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 validity_flags='' > > > > (I'm filling the record with 0s currently) > > Yeah, when I tested this, I thought it's okay for the hexdump to be there > in dmesg from EFI as the handling is done in trace events from GHES. > > If we need to handle from EFI, then it would be a good reason to move > the GUIDs out from GHES and place them in a common location for EFI/cper > to share, similar to protocol errors. Ah, yes, my expectation was more aligned with Jonathan's observation to do the processing in GHES code *and* skip the processing in the CPER code, something like: diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 56a5d2ef9e0a..e13e5fa4df4b 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -666,30 +666,6 @@ static cxl_cper_callback cper_callback; /* CXL Event record UUIDs are formatted as GUIDs and reported in section type */ -/* - * General Media Event Record - * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43 - */ -#define CPER_SEC_CXL_GEN_MEDIA_GUID \ - GUID_INIT(0xfbcd0a77, 0xc260, 0x417f, \ - 0x85, 0xa9, 0x08, 0x8b, 0x16, 0x21, 0xeb, 0xa6) - -/* - * DRAM Event Record - * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44 - */ -#define CPER_SEC_CXL_DRAM_GUID \ - GUID_INIT(0x601dcbb3, 0x9c06, 0x4eab, \ - 0xb8, 0xaf, 0x4e, 0x9b, 0xfb, 0x5c, 0x96, 0x24) - -/* - * Memory Module Event Record - * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45 - */ -#define CPER_SEC_CXL_MEM_MODULE_GUID \ - GUID_INIT(0xfe927475, 0xdd59, 0x4339, \ 
- 0xa5, 0x86, 0x79, 0xba, 0xb1, 0x13, 0xb7, 0x74) - static void cxl_cper_post_event(enum cxl_event_type event_type, struct cxl_cper_event_rec *rec) { diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c index 35c37f667781..0a4eed470750 100644 --- a/drivers/firmware/efi/cper.c +++ b/drivers/firmware/efi/cper.c @@ -24,6 +24,7 @@ #include <linux/bcd.h> #include <acpi/ghes.h> #include <ras/ras_event.h> +#include <linux/cxl-event.h> #include "cper_cxl.h" /* @@ -607,6 +608,15 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata cper_print_prot_err(newpfx, prot_err); else goto err_section_too_small; + } else if (guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID)) { + printk("%ssection_type: CXL General Media Error\n", newpfx); + /* see: cxl_cper_event_call() */ + } else if (guid_equal(sec_type, &CPER_SEC_CXL_DRAM_GUID)) { + printk("%ssection_type: CXL DRAM Error\n", newpfx); + /* see: cxl_cper_event_call() */ + } else if (guid_equal(sec_type, &CPER_SEC_CXL_MEM_MODULE_GUID)) { + printk("%ssection_type: CXL Memory Module Error\n", newpfx); + /* see: cxl_cper_event_call() */ } else { const void *err = acpi_hest_get_payload(gdata); diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h index 17eadee819b6..6d9a7df88d4a 100644 --- a/include/linux/cxl-event.h +++ b/include/linux/cxl-event.h @@ -1,12 +1,31 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_CXL_EVENT_H #define _LINUX_CXL_EVENT_H +#include <linux/uuid.h> /* - * CXL event records; CXL rev 3.0 - * - * Copyright(c) 2023 Intel Corporation. 
+ * General Media Event Record + * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43 + */ +#define CPER_SEC_CXL_GEN_MEDIA_GUID \ + GUID_INIT(0xfbcd0a77, 0xc260, 0x417f, \ + 0x85, 0xa9, 0x08, 0x8b, 0x16, 0x21, 0xeb, 0xa6) + +/* + * DRAM Event Record + * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44 + */ +#define CPER_SEC_CXL_DRAM_GUID \ + GUID_INIT(0x601dcbb3, 0x9c06, 0x4eab, \ + 0xb8, 0xaf, 0x4e, 0x9b, 0xfb, 0x5c, 0x96, 0x24) + +/* + * Memory Module Event Record + * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45 */ +#define CPER_SEC_CXL_MEM_MODULE_GUID \ + GUID_INIT(0xfe927475, 0xdd59, 0x4339, \ + 0xa5, 0x86, 0x79, 0xba, 0xb1, 0x13, 0xb7, 0x74) struct cxl_event_record_hdr { u8 length;