Re: [PATCH 0/15] Enable CXL PCIe port protocol error handling and logging

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Fan,

On 10/17/2024 11:34 AM, Fan Ni wrote:
On Tue, Oct 08, 2024 at 05:16:42PM -0500, Terry Bowman wrote:
This is a continuation of the CXL port error handling RFC from earlier.[1]
The RFC resulted in the decision to add CXL PCIe port error handling to
the existing RCH downstream port handling. This patchset adds the CXL PCIe
port handling and logging.

The first 7 patches update the existing AER service driver to support CXL
PCIe port protocol error handling and reporting. This includes AER service
driver changes for adding correctable and uncorrectable error support, CXL
specific recovery handling, and addition of CXL driver callback handlers.

The following 8 patches address CXL driver support for CXL PCIe port
protocol errors. This includes the following changes to the CXL drivers:
mapping CXL port and downstream port RAS registers, interface updates for
common RCH and VH, adding port specific error handlers, and protocol error
logging.

[1] - https://lore.kernel.org/linux-cxl/20240617200411.1426554-1-terry.bowman@xxxxxxx/

Testing:

Below are test results for this patchset. This is using Qemu with a root
port (0c:00.0), upstream switch port (0d:00.0), and downstream switch port
(0e:00.0).

This was tested using aer-inject updated to support CE and UCE internal
error injection. CXL RAS was set using a test patch (not upstreamed).

Hi Terry,
Can you share the aer-inject repo for the testing or the test patch?

Fan

Sure, but it's easiest to attach the patch here.

Origin was https://github.com/jderrick/aer-inject.git
Base is 81701cbb30e35a1a76c3876f55692f91bdb9751b

Regards,
Terry
From ca9277866b506723f46f3acd7b264ffa80c37276 Mon Sep 17 00:00:00 2001
From: Terry Bowman <terry.bowman@xxxxxxx>
Date: Thu, 17 Oct 2024 12:12:58 -0500
Subject: [PATCH] aer-inject: Add internal error injection

Add corrected (CE) and uncorrected (UCE) AER internal error injection
support.

Signed-off-by: Terry Bowman <terry.bowman@xxxxxxx>
---
 aer.h   | 2 ++
 aer.lex | 2 ++
 aer.y   | 8 ++++----
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/aer.h b/aer.h
index a0ad152..e55a731 100644
--- a/aer.h
+++ b/aer.h
@@ -30,11 +30,13 @@ struct aer_error_inj
 #define  PCI_ERR_UNC_MALF_TLP	0x00040000	/* Malformed TLP */
 #define  PCI_ERR_UNC_ECRC	0x00080000	/* ECRC Error Status */
 #define  PCI_ERR_UNC_UNSUP	0x00100000	/* Unsupported Request */
+#define  PCI_ERR_UNC_INTERNAL   0x00400000      /* Internal error */
 #define  PCI_ERR_COR_RCVR	0x00000001	/* Receiver Error Status */
 #define  PCI_ERR_COR_BAD_TLP	0x00000040	/* Bad TLP Status */
 #define  PCI_ERR_COR_BAD_DLLP	0x00000080	/* Bad DLLP Status */
 #define  PCI_ERR_COR_REP_ROLL	0x00000100	/* REPLAY_NUM Rollover */
 #define  PCI_ERR_COR_REP_TIMER	0x00001000	/* Replay Timer Timeout */
+#define  PCI_ERR_COR_CINTERNAL	0x00004000	/* Internal error */
 
 extern void init_aer(struct aer_error_inj *err);
 extern void submit_aer(struct aer_error_inj *err);
diff --git a/aer.lex b/aer.lex
index 6121e4e..4fadd0e 100644
--- a/aer.lex
+++ b/aer.lex
@@ -82,11 +82,13 @@ static struct key {
 	KEYVAL(MALF_TLP, PCI_ERR_UNC_MALF_TLP),
 	KEYVAL(ECRC, PCI_ERR_UNC_ECRC),
 	KEYVAL(UNSUP, PCI_ERR_UNC_UNSUP),
+	KEYVAL(INTERNAL, PCI_ERR_UNC_INTERNAL),
 	KEYVAL(RCVR, PCI_ERR_COR_RCVR),
 	KEYVAL(BAD_TLP, PCI_ERR_COR_BAD_TLP),
 	KEYVAL(BAD_DLLP, PCI_ERR_COR_BAD_DLLP),
 	KEYVAL(REP_ROLL, PCI_ERR_COR_REP_ROLL),
 	KEYVAL(REP_TIMER, PCI_ERR_COR_REP_TIMER),
+	KEYVAL(CINTERNAL, PCI_ERR_COR_CINTERNAL),
 };
 
 static int cmp_key(const void *av, const void *bv)
diff --git a/aer.y b/aer.y
index e5ecc7d..500dc97 100644
--- a/aer.y
+++ b/aer.y
@@ -34,8 +34,8 @@ static void init(void);
 
 %token AER DOMAIN BUS DEV FN PCI_ID UNCOR_STATUS COR_STATUS HEADER_LOG
 %token <num> TRAIN DLP POISON_TLP FCP COMP_TIME COMP_ABORT UNX_COMP RX_OVER
-%token <num> MALF_TLP ECRC UNSUP
-%token <num> RCVR BAD_TLP BAD_DLLP REP_ROLL REP_TIMER
+%token <num> MALF_TLP ECRC UNSUP INTERNAL
+%token <num> RCVR BAD_TLP BAD_DLLP REP_ROLL REP_TIMER CINTERNAL
 %token <num> SYMBOL NUMBER
 %token <str> PCI_ID_STR
 
@@ -77,14 +77,14 @@ uncor_status_list: /* empty */			{ $$ = 0; }
 	;
 
 uncor_status: TRAIN | DLP | POISON_TLP | FCP | COMP_TIME | COMP_ABORT
-	| UNX_COMP | RX_OVER | MALF_TLP | ECRC | UNSUP | NUMBER
+	| UNX_COMP | RX_OVER | MALF_TLP | ECRC | UNSUP | INTERNAL | NUMBER
 	;
 
 cor_status_list: /* empty */			{ $$ = 0; }
 	| cor_status_list cor_status		{ $$ = $1 | $2; }
 	;
 
-cor_status: RCVR | BAD_TLP | BAD_DLLP | REP_ROLL | REP_TIMER | NUMBER
+cor_status: RCVR | BAD_TLP | BAD_DLLP | REP_ROLL | REP_TIMER | CINTERNAL | NUMBER
 	;
 
 %% 
-- 
2.34.1


[Index of Archives]     [DMA Engine]     [Linux Coverity]     [Linux USB]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]     [Greybus]

  Powered by Linux