RE: [PATCH v3 08/11] cxl/pci: add tracepoint events for CXL RAS

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Dave,

Please see a few comments inline below.

>-----Original Message-----
>From: Dave Jiang <dave.jiang@xxxxxxxxx>
>Sent: 18 November 2022 17:09
>To: linux-cxl@xxxxxxxxxxxxxxx; linux-pci@xxxxxxxxxxxxxxx
>Cc: dan.j.williams@xxxxxxxxx; ira.weiny@xxxxxxxxx; vishal.l.verma@xxxxxxxxx;
>alison.schofield@xxxxxxxxx; Jonathan Cameron
><jonathan.cameron@xxxxxxxxxx>; rostedt@xxxxxxxxxxx;
>terry.bowman@xxxxxxx; bhelgaas@xxxxxxxxxx
>Subject: [PATCH v3 08/11] cxl/pci: add tracepoint events for CXL RAS
>
>Add tracepoint events for recording the CXL uncorrectable and correctable
>errors. For uncorrectable errors, there is additional data of 512B from the
>header log register (CXL spec rev3 8.2.4.16.7). The trace event will intake a
>dynamic array that will dump the entire Header Log data. If multiple errors are
>set in the status register, then the 'first error' field (CXL spec rev3 v8.2.4.16.6)
>is read from the Error Capabilities and Control Register in order to determine
>the error.
>
>This implementation does not include CXL IDE Error details.
>
>Cc: Steven Rostedt <rostedt@xxxxxxxxxxx>
>Signed-off-by: Dave Jiang <dave.jiang@xxxxxxxxx>
>---
> drivers/cxl/pci.c          |    2 +
> include/trace/events/cxl.h |  110
>++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 112 insertions(+)
> create mode 100644 include/trace/events/cxl.h
>
>diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index
>9428f3e0d99b..0f36a5861a7b 100644
>--- a/drivers/cxl/pci.c
>+++ b/drivers/cxl/pci.c
>@@ -13,6 +13,8 @@
> #include "cxlmem.h"
> #include "cxlpci.h"
> #include "cxl.h"
>+#define CREATE_TRACE_POINTS
>+#include <trace/events/cxl.h>
>
> /**
>  * DOC: cxl pci
>diff --git a/include/trace/events/cxl.h b/include/trace/events/cxl.h new file
>mode 100644 index 000000000000..f8e95d977133
>--- /dev/null
>+++ b/include/trace/events/cxl.h
>@@ -0,0 +1,110 @@
>+/* SPDX-License-Identifier: GPL-2.0 */
>+#undef TRACE_SYSTEM
>+#define TRACE_SYSTEM cxl
>+
>+#if !defined(_CXL_EVENTS_H) || defined(TRACE_HEADER_MULTI_READ)
>#define
>+_CXL_EVENTS_H
>+
>+#include <linux/tracepoint.h>
>+
>+#define CXL_HEADERLOG_SIZE		SZ_512
>+#define CXL_HEADERLOG_SIZE_U32		SZ_512 / sizeof(u32)
>+
>+#define CXL_RAS_UC_CACHE_DATA_PARITY	BIT(0)
>+#define CXL_RAS_UC_CACHE_ADDR_PARITY	BIT(1)
>+#define CXL_RAS_UC_CACHE_BE_PARITY	BIT(2)
>+#define CXL_RAS_UC_CACHE_DATA_ECC	BIT(3)
>+#define CXL_RAS_UC_MEM_DATA_PARITY	BIT(4)
>+#define CXL_RAS_UC_MEM_ADDR_PARITY	BIT(5)
>+#define CXL_RAS_UC_MEM_BE_PARITY	BIT(6)
>+#define CXL_RAS_UC_MEM_DATA_ECC		BIT(7)
>+#define CXL_RAS_UC_REINIT_THRESH	BIT(8)
>+#define CXL_RAS_UC_RSVD_ENCODE		BIT(9)
>+#define CXL_RAS_UC_POISON		BIT(10)
>+#define CXL_RAS_UC_RECV_OVERFLOW	BIT(11)
>+#define CXL_RAS_UC_INTERNAL_ERR		BIT(14)
>+#define CXL_RAS_UC_IDE_TX_ERR		BIT(15)
>+#define CXL_RAS_UC_IDE_RX_ERR		BIT(16)
>+
>+#define show_uc_errs(status)	__print_flags(status, " | ",
>	  \
>+	{ CXL_RAS_UC_CACHE_DATA_PARITY, "Cache Data Parity Error" },
>	  \
>+	{ CXL_RAS_UC_CACHE_ADDR_PARITY, "Cache Address Parity Error" },
>	  \
>+	{ CXL_RAS_UC_CACHE_BE_PARITY, "Cache Byte Enable Parity Error" },
>\
>+	{ CXL_RAS_UC_CACHE_DATA_ECC, "Cache Data ECC Error" },
>	  \
>+	{ CXL_RAS_UC_MEM_DATA_PARITY, "Memory Data Parity Error" },
>	  \
>+	{ CXL_RAS_UC_MEM_ADDR_PARITY, "Memory Address Parity Error"
>},	  \
>+	{ CXL_RAS_UC_MEM_BE_PARITY, "Memory Byte Enable Parity Error"
>},  \
>+	{ CXL_RAS_UC_MEM_DATA_ECC, "Memory Data ECC Error" },
>	  \
>+	{ CXL_RAS_UC_REINIT_THRESH, "REINIT Threshold Hit" },
>	  \
>+	{ CXL_RAS_UC_RSVD_ENCODE, "Received Unrecognized Encoding" },
>	  \
>+	{ CXL_RAS_UC_POISON, "Received Poison From Peer" },
>	  \
>+	{ CXL_RAS_UC_RECV_OVERFLOW, "Receiver Overflow" },
>	  \
>+	{ CXL_RAS_UC_INTERNAL_ERR, "Component Specific Error" },	  \
>+	{ CXL_RAS_UC_IDE_TX_ERR, "IDE Tx Error" },			  \
>+	{ CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" }			  \
>+)
>+
>+TRACE_EVENT(cxl_aer_uncorrectable_error,
>+	TP_PROTO(const char *dev_name, u32 status, u32 fe, u32 *hl),
>+	TP_ARGS(dev_name, status, fe, hl),
>+	TP_STRUCT__entry(
>+		__string(dev_name, dev_name)
>+		__field(u32, status)
>+		__field(u32, first_error)
>+		__dynamic_array(u32, header_log,
>CXL_HEADERLOG_SIZE_U32)
>+	),
>+	TP_fast_assign(
>+		__assign_str(dev_name, dev_name);
>+		__entry->status = status;
>+		__entry->first_error = fe;
>+		/*
>+		 * Embed the 512B headerlog data for user app retrieval and
>+		 * parsing, but no need to print this in the trace buffer.
>+		 */
>+		memcpy(__get_dynamic_array(header_log), hl,
>CXL_HEADERLOG_SIZE);
>+	),
>+	TP_printk("%s: status: '%s' first_error: '%s'",
>+		  __get_str(dev_name),
>+		  show_uc_errs(__entry->status),
>+		  show_uc_errs(__entry->first_error)
>+	)
>+);
>+
>+#define CXL_RAS_CE_CACHE_DATA_ECC	BIT(0)
>+#define CXL_RAS_CE_MEM_DATA_ECC		BIT(1)
>+#define CXL_RAS_CE_CRC_THRESH		BIT(2)

I think bit 3, "Retry_Threshold: Retry Threshold Hit", of the Correctable
Error Status Register in the CXL 3.0 specification is missing here.
If so, please also correct the bit positions of the subsequent correctable errors.
  
>+#define CXL_RAS_CE_CACHE_POISON		BIT(3)
>+#define CXL_RAS_CE_MEM_POISON		BIT(4)
>+#define CXL_RAS_CE_PHYS_LAYER_ERR	BIT(5)
>+
>+#define show_ce_errs(status)	__print_flags(status, " | ",
>		\
>+	{ CXL_RAS_CE_CACHE_DATA_ECC, "Cache Data ECC Error" },
>		\
>+	{ CXL_RAS_CE_MEM_DATA_ECC, "Memory Data Ecc Error" },

Please change "Ecc" to "ECC".

>		\
>+	{ CXL_RAS_CE_CRC_THRESH, "CRC Threshold Hit" },
>		\
>+	{ CXL_RAS_CE_CACHE_POISON, "Received Cache Poison From Peer"
>},		\
>+	{ CXL_RAS_CE_MEM_POISON, "Received Memory Poison From Peer"
>},		\
>+	{ CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical
>Layer" }	\
>+)
>+
>+TRACE_EVENT(cxl_aer_correctable_error,
>+	TP_PROTO(const char *dev_name, u32 status),
>+	TP_ARGS(dev_name, status),
>+	TP_STRUCT__entry(
>+		__string(dev_name, dev_name)
>+		__field(u32, status)
>+	),
>+	TP_fast_assign(
>+		__assign_str(dev_name, dev_name);
>+		__entry->status = status;
>+	),
>+	TP_printk("%s: status: '%s'",
>+		  __get_str(dev_name), show_ce_errs(__entry->status)
>+	)
>+);
>+
>+#endif /* _CXL_EVENTS_H */
>+
>+/* This part must be outside protection */ #undef TRACE_INCLUDE_FILE
>+#define TRACE_INCLUDE_FILE cxl #include <trace/define_trace.h>
>

Thanks,
Shiju




[Index of Archives]     [DMA Engine]     [Linux Coverity]     [Linux USB]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]     [Greybus]

  Powered by Linux