Add following fields in aer_event to better understand Advisory Non-Fatal and other errors for external observation: - cor_status (Correctable Error Status) - cor_mask (Correctable Error Mask) - uncor_status (Uncorrectable Error Status) - uncor_severity (Uncorrectable Error Severity) - uncor_mask (Uncorrectable Error Mask) - aer_cap_ctrl (AER Capabilities and Control) - link_status (Link Status) - device_status (Device Status) - device_control_2 (Device Control 2) In addition to the raw register value, value of following fields are extracted and logged for better observability: - "First Error Pointer" and "Completion Timeout Prefix/Header Log Capable" from "AER Capabilities and Control" - "Completion Timeout Value" and "Completion Timeout Disable" from "Device Control 2" Signed-off-by: "Wang, Qingshun" <qingshun.wang@xxxxxxxxxxxxxxx> --- drivers/pci/pcie/aer.c | 17 +++++++++++-- include/ras/ras_event.h | 48 ++++++++++++++++++++++++++++++++--- include/uapi/linux/pci_regs.h | 1 + 3 files changed, 60 insertions(+), 6 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index eec3406f727a..2f5639f6c40f 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -757,6 +757,7 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) int layer, agent; int id = pci_dev_id(dev); const char *level; + struct aer_capability_regs aer_caps; if (info->severity == AER_CORRECTABLE) { status = info->cor_status; @@ -793,8 +794,18 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) if (info->id && info->error_dev_num > 1 && info->id == id) pci_err(dev, " Error of this Agent is reported first\n"); + aer_caps = (struct aer_capability_regs) { + .cor_status = info->cor_status, + .cor_mask = info->cor_mask, + .uncor_status = info->uncor_status, + .uncor_severity = info->uncor_severity, + .uncor_mask = info->uncor_mask, + .cap_control = info->aer_cap_ctrl + }; trace_aer_event(dev_name(&dev->dev), (status & ~mask), - info->severity, info->tlp_header_valid, &info->tlp); + info->severity, info->tlp_header_valid, &info->tlp, + &aer_caps, info->link_status, + info->device_status, info->device_control_2); } static void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info) @@ -870,7 +881,9 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, __print_tlp_header(dev, &aer->header_log); trace_aer_event(dev_name(&dev->dev), (status & ~mask), - aer_severity, tlp_header_valid, &aer->header_log); + aer_severity, tlp_header_valid, &aer->header_log, + aer, info.link_status, + info.device_status, info.device_control_2); } EXPORT_SYMBOL_NS_GPL(pci_print_aer, CXL); diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index cbd3ddd7c33d..a94997073d90 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -300,9 +300,14 @@ TRACE_EVENT(aer_event, const u32 status, const u8 severity, const u8 tlp_header_valid, - struct aer_header_log_regs *tlp), + struct aer_header_log_regs *tlp, + struct aer_capability_regs *aer_caps, + const u16 link_status, + const u16 device_status, + const u16 device_control_2), - TP_ARGS(dev_name, status, severity, tlp_header_valid, tlp), + TP_ARGS(dev_name, status, severity, tlp_header_valid, tlp, + aer_caps, link_status, device_status, device_control_2), TP_STRUCT__entry( __string( dev_name, dev_name ) @@ -310,6 +315,10 @@ TRACE_EVENT(aer_event, __field( u8, severity ) __field( u8, tlp_header_valid) __array( u32, tlp_header, 4 ) + __field_struct(struct aer_capability_regs, aer_caps) + __field( u16, link_status ) + __field( u16, device_status ) + __field( u16, device_control_2) ), TP_fast_assign( @@ -317,6 +326,10 @@ TRACE_EVENT(aer_event, __entry->status = status; __entry->severity = severity; __entry->tlp_header_valid = tlp_header_valid; + __entry->aer_caps = *aer_caps; + __entry->link_status = link_status; + __entry->device_status = device_status; + __entry->device_control_2 = device_control_2; if (tlp_header_valid) { __entry->tlp_header[0] = tlp->dw0; __entry->tlp_header[1] = tlp->dw1; @@ -325,7 +338,20 @@ TRACE_EVENT(aer_event, } ), - TP_printk("%s PCIe Bus Error: severity=%s, %s, TLP Header=%s\n", + TP_printk("%s PCIe Bus Error: severity=%s, %s, TLP Header=%s, " + "Correctable Error Status=0x%08x, " + "Correctable Error Mask=0x%08x, " + "Uncorrectable Error Status=0x%08x, " + "Uncorrectable Error Severity=0x%08x, " + "Uncorrectable Error Mask=0x%08x, " + "AER Capability and Control=0x%08x, " + "First Error Pointer=0x%x, " + "Completion Timeout Prefix/Header Log Capable=%s, " + "Link Status=0x%04x, " + "Device Status=0x%04x, " + "Device Control 2=0x%04x, " + "Completion Timeout Value=0x%x, " + "Completion Timeout Disable=%sn", __get_str(dev_name), __entry->severity == AER_CORRECTABLE ? "Corrected" : __entry->severity == AER_FATAL ? @@ -335,7 +361,21 @@ TRACE_EVENT(aer_event, __print_flags(__entry->status, "|", aer_uncorrectable_errors), __entry->tlp_header_valid ? __print_array(__entry->tlp_header, 4, 4) : - "Not available") + "Not available", + __entry->aer_caps.cor_status, + __entry->aer_caps.cor_mask, + __entry->aer_caps.uncor_status, + __entry->aer_caps.uncor_severity, + __entry->aer_caps.uncor_mask, + __entry->aer_caps.cap_control, + PCI_ERR_CAP_FEP(__entry->aer_caps.cap_control), + __entry->aer_caps.cap_control & PCI_ERR_CAP_CTO_LOGC ? "True" : "False", + __entry->link_status, + __entry->device_status, + __entry->device_control_2, + __entry->device_control_2 & PCI_EXP_DEVCTL2_COMP_TIMEOUT, + __entry->device_control_2 & PCI_EXP_DEVCTL2_COMP_TMOUT_DIS ? + "True" : "False") ); /* diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index a39193213ff2..54160ed2a8c9 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -787,6 +787,7 @@ #define PCI_ERR_CAP_ECRC_GENE 0x00000040 /* ECRC Generation Enable */ #define PCI_ERR_CAP_ECRC_CHKC 0x00000080 /* ECRC Check Capable */ #define PCI_ERR_CAP_ECRC_CHKE 0x00000100 /* ECRC Check Enable */ +#define PCI_ERR_CAP_CTO_LOGC 0x00001000 /* Completion Timeout Prefix/Header Log Capable */ #define PCI_ERR_HEADER_LOG 0x1c /* Header Log Register (16 bytes) */ #define PCI_ERR_ROOT_COMMAND 0x2c /* Root Error Command */ #define PCI_ERR_ROOT_CMD_COR_EN 0x00000001 /* Correctable Err Reporting Enable */ -- 2.42.0