RE: [PATCH v5 2/2] RAS: Report ARM processor information to userspace

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Tested-by: Shiju Jose <shiju.jose@xxxxxxxxxx>

The CPU core isolation feature in rasdaemon depends on this kernel patch.

Thanks,
Shiju
>-----Original Message-----
>From: Daniel Ferguson <danielf@xxxxxxxxxxxxxxxxxxxxxx>
>Sent: 21 March 2024 22:56
>To: Rafael J. Wysocki <rafael@xxxxxxxxxx>; Len Brown <lenb@xxxxxxxxxx>;
>James Morse <james.morse@xxxxxxx>; Tony Luck <tony.luck@xxxxxxxxx>;
>Borislav Petkov <bp@xxxxxxxxx>
>Cc: linux-acpi@xxxxxxxxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx; linux-
>edac@xxxxxxxxxxxxxxx; Daniel Ferguson <danielf@xxxxxxxxxxxxxxxxxxxxxx>;
>luoshengwei <luoshengwei@xxxxxxxxxx>; Jason Tian
><jason@xxxxxxxxxxxxxxxxxxxxxx>
>Subject: [PATCH v5 2/2] RAS: Report ARM processor information to userspace
>
>From: Shengwei Luo <luoshengwei@xxxxxxxxxx>
>
>The original arm_event trace code only traces out the ARM processor error
>information data, which is not enough for the user to take appropriate
>action.
>
>According to the UEFI 2.9 specification, section N.2.4.4, the ARM processor
>error section includes several ARM processor error information structures,
>several ARM processor context information structures and several
>vendor-specific error information structures. In addition to these, the
>event has an error severity and a logical CPU index associated with it.
>Report all of this information to userspace via the perf interface, so that
>the user can perform CPU core isolation according to the error severity and
>the other information.
>
>Signed-off-by: Shengwei Luo <luoshengwei@xxxxxxxxxx>
>Signed-off-by: Jason Tian <jason@xxxxxxxxxxxxxxxxxxxxxx>
>Signed-off-by: Daniel Ferguson <danielf@xxxxxxxxxxxxxxxxxxxxxx>
>---
> drivers/acpi/apei/ghes.c |  3 +--
> drivers/ras/ras.c        | 46
>++++++++++++++++++++++++++++++++++++++++++++--
> include/linux/ras.h      | 15 ++++++++++++---
> include/ras/ras_event.h  | 48
>+++++++++++++++++++++++++++++++++++++++++++-----
> 4 files changed, 100 insertions(+), 12 deletions(-)
>
>diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index
>58014558b8e0..a93c80fe1bab 100644
>--- a/drivers/acpi/apei/ghes.c
>+++ b/drivers/acpi/apei/ghes.c
>@@ -535,9 +535,8 @@ static bool ghes_handle_arm_hw_error(struct
>acpi_hest_generic_data *gdata,
> 	int sec_sev, i;
> 	char *p;
>
>-	log_arm_hw_error(err);
>-
> 	sec_sev = ghes_severity(gdata->error_severity);
>+	log_arm_hw_error(err, sec_sev);
> 	if (sev != GHES_SEV_RECOVERABLE || sec_sev !=
>GHES_SEV_RECOVERABLE)
> 		return false;
>
>diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c index
>249dce21a738..3e2beed2db07 100644
>--- a/drivers/ras/ras.c
>+++ b/drivers/ras/ras.c
>@@ -53,9 +53,51 @@ void log_non_standard_event(const guid_t *sec_type,
>const guid_t *fru_id,  }
>
> #if defined(CONFIG_ARM) || defined(CONFIG_ARM64) -void
>log_arm_hw_error(struct cper_sec_proc_arm *err)
>+void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev)
> {
>-	trace_arm_event(err);
>+	u32 pei_len;
>+	u32 ctx_len = 0;
>+	s32 vsei_len;
>+	u8 *pei_err;
>+	u8 *ctx_err;
>+	u8 *ven_err_data;
>+	struct cper_arm_err_info *err_info;
>+	struct cper_arm_ctx_info *ctx_info;
>+	int n, sz;
>+	int cpu;
>+
>+	pei_len = sizeof(struct cper_arm_err_info) * err->err_info_num;
>+	pei_err = (u8 *)err + sizeof(struct cper_sec_proc_arm);
>+
>+	err_info = (struct cper_arm_err_info *)(err + 1);
>+	ctx_info = (struct cper_arm_ctx_info *)(err_info + err->err_info_num);
>+	ctx_err = (u8 *)ctx_info;
>+	for (n = 0; n < err->context_info_num; n++) {
>+		sz = sizeof(struct cper_arm_ctx_info) + ctx_info->size;
>+		ctx_info = (struct cper_arm_ctx_info *)((long)ctx_info + sz);
>+		ctx_len += sz;
>+	}
>+
>+	vsei_len = err->section_length - (sizeof(struct cper_sec_proc_arm) +
>+						pei_len + ctx_len);
>+	if (vsei_len < 0) {
>+		pr_warn(FW_BUG
>+			"section length: %d\n", err->section_length);
>+		pr_warn(FW_BUG
>+			"section length is too small\n");
>+		pr_warn(FW_BUG
>+			"firmware-generated error record is incorrect\n");
>+		vsei_len = 0;
>+	}
>+	ven_err_data = (u8 *)ctx_info;
>+
>+	cpu = GET_LOGICAL_INDEX(err->mpidr);
>+	/* when return value is invalid, set cpu index to -1 */
>+	if (cpu < 0)
>+		cpu = -1;
>+
>+	trace_arm_event(err, pei_err, pei_len, ctx_err, ctx_len,
>+			ven_err_data, (u32)vsei_len, sev, cpu);
> }
> #endif
>
>diff --git a/include/linux/ras.h b/include/linux/ras.h index
>811feb9d8160..2070e4ae0626 100644
>--- a/include/linux/ras.h
>+++ b/include/linux/ras.h
>@@ -25,7 +25,7 @@ void log_non_standard_event(const guid_t *sec_type,
> 			    const guid_t *fru_id, const char *fru_text,
> 			    const u8 sev, const u8 *err, const u32 len);  #if
>defined(CONFIG_ARM) || defined(CONFIG_ARM64) -void
>log_arm_hw_error(struct cper_sec_proc_arm *err);
>+void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev);
> #endif
> #else
> static inline void
>@@ -35,7 +35,7 @@ log_non_standard_event(const guid_t *sec_type,  { return;
>}  #if defined(CONFIG_ARM) || defined(CONFIG_ARM64)  static inline void -
>log_arm_hw_error(struct cper_sec_proc_arm *err) { return; }
>+log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev) { return;
>+}
> #endif
> #endif
>
>@@ -55,5 +55,14 @@ static inline void amd_retire_dram_row(struct atl_err
>*err) { }  static inline unsigned long
>amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL;
>}  #endif /* CONFIG_AMD_ATL */
>-
>+#if defined(CONFIG_ARM) || defined(CONFIG_ARM64) #include
>+<asm/smp_plat.h>
>+/*
>+ * Include ARM specific SMP header which provides a function mapping
>+mpidr to
>+ * cpu logical index.
>+ */
>+#define GET_LOGICAL_INDEX(mpidr) get_logical_index(mpidr &
>+MPIDR_HWID_BITMASK) #else #define GET_LOGICAL_INDEX(mpidr) -EINVAL
>+#endif /* CONFIG_ARM || CONFIG_ARM64 */
> #endif /* __RAS_H__ */
>diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index
>c011ea236e9b..a7d7b6e717b6 100644
>--- a/include/ras/ras_event.h
>+++ b/include/ras/ras_event.h
>@@ -168,11 +168,24 @@ TRACE_EVENT(mc_event,
>  * This event is generated when hardware detects an ARM processor error
>  * has occurred. UEFI 2.6 spec section N.2.4.4.
>  */
>+#define APEIL "ARM Processor Err Info data len"
>+#define APEID "ARM Processor Err Info raw data"
>+#define APECIL "ARM Processor Err Context Info data len"
>+#define APECID "ARM Processor Err Context Info raw data"
>+#define VSEIL "Vendor Specific Err Info data len"
>+#define VSEID "Vendor Specific Err Info raw data"
> TRACE_EVENT(arm_event,
>
>-	TP_PROTO(const struct cper_sec_proc_arm *proc),
>+	TP_PROTO(const struct cper_sec_proc_arm *proc, const u8 *pei_err,
>+			const u32 pei_len,
>+			const u8 *ctx_err,
>+			const u32 ctx_len,
>+			const u8 *oem,
>+			const u32 oem_len,
>+			u8 sev,
>+			int cpu),
>
>-	TP_ARGS(proc),
>+	TP_ARGS(proc, pei_err, pei_len, ctx_err, ctx_len, oem, oem_len, sev,
>+cpu),
>
> 	TP_STRUCT__entry(
> 		__field(u64, mpidr)
>@@ -180,6 +193,14 @@ TRACE_EVENT(arm_event,
> 		__field(u32, running_state)
> 		__field(u32, psci_state)
> 		__field(u8, affinity)
>+		__field(u32, pei_len)
>+		__dynamic_array(u8, buf, pei_len)
>+		__field(u32, ctx_len)
>+		__dynamic_array(u8, buf1, ctx_len)
>+		__field(u32, oem_len)
>+		__dynamic_array(u8, buf2, oem_len)
>+		__field(u8, sev)
>+		__field(int, cpu)
> 	),
>
> 	TP_fast_assign(
>@@ -199,12 +220,29 @@ TRACE_EVENT(arm_event,
> 			__entry->running_state = ~0;
> 			__entry->psci_state = ~0;
> 		}
>+		__entry->pei_len = pei_len;
>+		memcpy(__get_dynamic_array(buf), pei_err, pei_len);
>+		__entry->ctx_len = ctx_len;
>+		memcpy(__get_dynamic_array(buf1), ctx_err, ctx_len);
>+		__entry->oem_len = oem_len;
>+		memcpy(__get_dynamic_array(buf2), oem, oem_len);
>+		__entry->sev = sev;
>+		__entry->cpu = cpu;
> 	),
>
>-	TP_printk("affinity level: %d; MPIDR: %016llx; MIDR: %016llx; "
>-		  "running state: %d; PSCI state: %d",
>+	TP_printk("cpu: %d; error: %d; affinity level: %d; MPIDR: %016llx; MIDR:
>%016llx; "
>+		  "running state: %d; PSCI state: %d; "
>+		  "%s: %d; %s: %s; %s: %d; %s: %s; %s: %d; %s: %s",
>+		  __entry->cpu,
>+		  __entry->sev,
> 		  __entry->affinity, __entry->mpidr, __entry->midr,
>-		  __entry->running_state, __entry->psci_state)
>+		  __entry->running_state, __entry->psci_state,
>+		  APEIL, __entry->pei_len, APEID,
>+		  __print_hex(__get_dynamic_array(buf), __entry->pei_len),
>+		  APECIL, __entry->ctx_len, APECID,
>+		  __print_hex(__get_dynamic_array(buf1), __entry->ctx_len),
>+		  VSEIL, __entry->oem_len, VSEID,
>+		  __print_hex(__get_dynamic_array(buf2), __entry->oem_len))
> );
>
> /*
>
>--
>2.43.0
>





[Index of Archives]     [Linux IBM ACPI]     [Linux Power Management]     [Linux Kernel]     [Linux Laptop]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Samba]     [Video 4 Linux]     [Device Mapper]     [Linux Resources]
  Powered by Linux