On 2/11/25 12:24 PM, Terry Bowman wrote: > CXL RAS errors are currently logged using the associated CXL port's name > returned from dev_name(). They are typically named with 'port1', 'port2', > etc. to indicate the hierarchical location in the CXL topology. But, this > doesn't clearly indicate the CXL card or slot reporting the error. > > Update the logging to also log the corresponding PCIe device name. This will > give a PCIe SBDF or ACPI object name (in case of CXL HB). This will provide > details helping users understand which physical slot and card have the > error. > > Below is example output after making these changes. > > Correctable error example output: > cxl_port_aer_correctable_error: device=port1 (0000:0c:00.0) parent=root0 (pci0000:0c) status='Received Error From Physical Layer' > > Uncorrectable error example output: > cxl_port_aer_uncorrectable_error: device=port1 (0000:0c:00.0) parent=root0 (pci0000:0c) status: 'Memory Byte Enable Parity Error' first_error: 'Memory Byte Enable Parity Error' > > Signed-off-by: Terry Bowman <terry.bowman@xxxxxxx> I wonder if there's any benefit in identifying whether the PCIe device is a USP or a DSP... 
Reviewed-by: Dave Jiang <dave.jiang@xxxxxxxxx> > --- > drivers/cxl/core/pci.c | 39 +++++++++++++++++++------------------ > drivers/cxl/core/trace.h | 42 +++++++++++++++++++++++++--------------- > 2 files changed, 46 insertions(+), 35 deletions(-) > > diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c > index 9a3090dae46a..f154dcf6dfda 100644 > --- a/drivers/cxl/core/pci.c > +++ b/drivers/cxl/core/pci.c > @@ -652,14 +652,14 @@ void read_cdat_data(struct cxl_port *port) > } > EXPORT_SYMBOL_NS_GPL(read_cdat_data, "CXL"); > > -static void __cxl_handle_cor_ras(struct device *dev, > +static void __cxl_handle_cor_ras(struct device *cxl_dev, struct device *pcie_dev, > void __iomem *ras_base) > { > void __iomem *addr; > u32 status; > > if (!ras_base) { > - dev_warn_once(dev, "CXL RAS register block is not mapped"); > + dev_warn_once(cxl_dev, "CXL RAS register block is not mapped"); > return; > } > > @@ -669,15 +669,15 @@ static void __cxl_handle_cor_ras(struct device *dev, > return; > writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); > > - if (is_cxl_memdev(dev)) > - trace_cxl_aer_correctable_error(to_cxl_memdev(dev), status); > - else if (is_cxl_port(dev)) > - trace_cxl_port_aer_correctable_error(dev, status); > + if (is_cxl_memdev(cxl_dev)) > + trace_cxl_aer_correctable_error(to_cxl_memdev(cxl_dev), status); > + else if (is_cxl_port(cxl_dev)) > + trace_cxl_port_aer_correctable_error(cxl_dev, pcie_dev, status); > } > > static void cxl_handle_endpoint_cor_ras(struct cxl_dev_state *cxlds) > { > - return __cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlds->regs.ras); > + return __cxl_handle_cor_ras(&cxlds->cxlmd->dev, NULL, cxlds->regs.ras); > } > > /* CXL spec rev3.0 8.2.4.16.1 */ > @@ -701,7 +701,8 @@ static void header_log_copy(void __iomem *ras_base, u32 *log) > * Log the state of the RAS status registers and prepare them to log the > * next error status. Return 1 if reset needed. 
> */ > -static pci_ers_result_t __cxl_handle_ras(struct device *dev, void __iomem *ras_base) > +static pci_ers_result_t __cxl_handle_ras(struct device *cxl_dev, struct device *pcie_dev, > + void __iomem *ras_base) > { > u32 hl[CXL_HEADERLOG_SIZE_U32]; > void __iomem *addr; > @@ -709,7 +710,7 @@ static pci_ers_result_t __cxl_handle_ras(struct device *dev, void __iomem *ras_b > u32 fe; > > if (!ras_base) { > - dev_warn_once(dev, "CXL RAS register block is not mapped"); > + dev_warn_once(cxl_dev, "CXL RAS register block is not mapped"); > return PCI_ERS_RESULT_NONE; > } > > @@ -730,10 +731,10 @@ static pci_ers_result_t __cxl_handle_ras(struct device *dev, void __iomem *ras_b > } > > header_log_copy(ras_base, hl); > - if (is_cxl_memdev(dev)) > - trace_cxl_aer_uncorrectable_error(to_cxl_memdev(dev), status, fe, hl); > - else if (is_cxl_port(dev)) > - trace_cxl_port_aer_uncorrectable_error(dev, status, fe, hl); > + if (is_cxl_memdev(cxl_dev)) > + trace_cxl_aer_uncorrectable_error(to_cxl_memdev(cxl_dev), status, fe, hl); > + else if (is_cxl_port(cxl_dev)) > + trace_cxl_port_aer_uncorrectable_error(cxl_dev, pcie_dev, status, fe, hl); > > writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr); > > @@ -742,7 +743,7 @@ static pci_ers_result_t __cxl_handle_ras(struct device *dev, void __iomem *ras_b > > static bool cxl_handle_endpoint_ras(struct cxl_dev_state *cxlds) > { > - return __cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->regs.ras); > + return __cxl_handle_ras(&cxlds->cxlmd->dev, NULL, cxlds->regs.ras); > } > > #ifdef CONFIG_PCIEAER_CXL > @@ -814,7 +815,7 @@ static void __iomem *cxl_pci_port_ras(struct pci_dev *pdev, struct device **dev) > struct cxl_dport *dport = NULL; > > port = find_cxl_port(&pdev->dev, &dport); > - if (!port) { > + if (!port || !is_cxl_port(&port->dev)) { > pci_err(pdev, "Failed to find root/dport in CXL topology\n"); > return NULL; > } > @@ -848,7 +849,7 @@ static void cxl_port_cor_error_detected(struct pci_dev *pdev) > struct device *dev; > void 
__iomem *ras_base = cxl_pci_port_ras(pdev, &dev); > > - __cxl_handle_cor_ras(dev, ras_base); > + __cxl_handle_cor_ras(dev, &pdev->dev, ras_base); > } > > static pci_ers_result_t cxl_port_error_detected(struct pci_dev *pdev) > @@ -856,7 +857,7 @@ static pci_ers_result_t cxl_port_error_detected(struct pci_dev *pdev) > struct device *dev; > void __iomem *ras_base = cxl_pci_port_ras(pdev, &dev); > > - return __cxl_handle_ras(dev, ras_base); > + return __cxl_handle_ras(dev, &pdev->dev, ras_base); > } > > void cxl_uport_init_ras_reporting(struct cxl_port *port) > @@ -909,13 +910,13 @@ EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL"); > static void cxl_handle_rdport_cor_ras(struct cxl_dev_state *cxlds, > struct cxl_dport *dport) > { > - return __cxl_handle_cor_ras(&cxlds->cxlmd->dev, dport->regs.ras); > + return __cxl_handle_cor_ras(&cxlds->cxlmd->dev, NULL, dport->regs.ras); > } > > static bool cxl_handle_rdport_ras(struct cxl_dev_state *cxlds, > struct cxl_dport *dport) > { > - return __cxl_handle_ras(&cxlds->cxlmd->dev, dport->regs.ras); > + return __cxl_handle_ras(&cxlds->cxlmd->dev, NULL, dport->regs.ras); > } > > /* > diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h > index b536233ac210..a74803f4aa22 100644 > --- a/drivers/cxl/core/trace.h > +++ b/drivers/cxl/core/trace.h > @@ -49,18 +49,22 @@ > ) > > TRACE_EVENT(cxl_port_aer_uncorrectable_error, > - TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl), > - TP_ARGS(dev, status, fe, hl), > + TP_PROTO(struct device *cxl_dev, struct device *pcie_dev, u32 status, u32 fe, u32 *hl), > + TP_ARGS(cxl_dev, pcie_dev, status, fe, hl), > TP_STRUCT__entry( > - __string(devname, dev_name(dev)) > - __string(parent, dev_name(dev->parent)) > + __string(cxl_name, dev_name(cxl_dev)) > + __string(cxl_parent_name, dev_name(cxl_dev->parent)) > + __string(pcie_name, dev_name(pcie_dev)) > + __string(pcie_parent_name, dev_name(pcie_dev->parent)) > __field(u32, status) > __field(u32, first_error) > __array(u32, 
header_log, CXL_HEADERLOG_SIZE_U32) > ), > TP_fast_assign( > - __assign_str(devname); > - __assign_str(parent); > + __assign_str(cxl_name); > + __assign_str(cxl_parent_name); > + __assign_str(pcie_name); > + __assign_str(pcie_parent_name); > __entry->status = status; > __entry->first_error = fe; > /* > @@ -69,8 +73,9 @@ TRACE_EVENT(cxl_port_aer_uncorrectable_error, > */ > memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE); > ), > - TP_printk("device=%s parent=%s status: '%s' first_error: '%s'", > - __get_str(devname), __get_str(parent), > + TP_printk("device=%s (%s) parent=%s (%s) status: '%s' first_error: '%s'", > + __get_str(cxl_name), __get_str(pcie_name), > + __get_str(cxl_parent_name), __get_str(pcie_parent_name), > show_uc_errs(__entry->status), > show_uc_errs(__entry->first_error) > ) > @@ -125,20 +130,25 @@ TRACE_EVENT(cxl_aer_uncorrectable_error, > ) > > TRACE_EVENT(cxl_port_aer_correctable_error, > - TP_PROTO(struct device *dev, u32 status), > - TP_ARGS(dev, status), > + TP_PROTO(struct device *cxl_dev, struct device *pcie_dev, u32 status), > + TP_ARGS(cxl_dev, pcie_dev, status), > TP_STRUCT__entry( > - __string(devname, dev_name(dev)) > - __string(parent, dev_name(dev->parent)) > + __string(cxl_name, dev_name(cxl_dev)) > + __string(cxl_parent_name, dev_name(cxl_dev->parent)) > + __string(pcie_name, dev_name(pcie_dev)) > + __string(pcie_parent_name, dev_name(pcie_dev->parent)) > __field(u32, status) > ), > TP_fast_assign( > - __assign_str(devname); > - __assign_str(parent); > + __assign_str(cxl_name); > + __assign_str(cxl_parent_name); > + __assign_str(pcie_name); > + __assign_str(pcie_parent_name); > __entry->status = status; > ), > - TP_printk("device=%s parent=%s status='%s'", > - __get_str(devname), __get_str(parent), > + TP_printk("device=%s (%s) parent=%s (%s) status='%s'", > + __get_str(cxl_name), __get_str(pcie_name), > + __get_str(cxl_parent_name), __get_str(pcie_parent_name), > show_ce_errs(__entry->status) > ) > );