On 2/11/25 12:24 PM, Terry Bowman wrote: > Existing recovery procedure for PCIe uncorrectable errors (UCE) does not > apply to CXL devices. Recovery can not be used for CXL devices because of > potential corruption on what can be system memory. Also, current PCIe UCE > recovery, in the case of a Root Port (RP) or Downstream Switch Port (DSP), > does not begin at the RP/DSP but begins at the first downstream device. > This will miss handling CXL Protocol Errors in a CXL RP or DSP. A separate > CXL recovery is needed because of the different handling requirements > > Add a new function, cxl_do_recovery() using the following. > > Add cxl_walk_bridge() to iterate the detected error's sub-topology. > cxl_walk_bridge() is similar to pci_walk_bridge() but the CXL flavor > will begin iteration at the RP or DSP rather than beginning at the > first downstream device. > > pci_walk_bridge() is candidate to possibly reuse cxl_walk_bridge() but > needs further investigation. This will be left for future improvement > to make the CXL and PCI handling paths more common. > > Add cxl_report_error_detected() as an analog to report_error_detected(). > It will call pci_driver::cxl_err_handlers for each iterated downstream > device. The pci_driver::cxl_err_handler's UCE handler returns a boolean > indicating if there was a UCE error detected during handling. > > cxl_do_recovery() uses the status from cxl_report_error_detected() to > determine how to proceed. Non-fatal CXL UCE errors will be treated as > fatal. If a UCE was present during handling then cxl_do_recovery() > will kernel panic. > > Signed-off-by: Terry Bowman <terry.bowman@xxxxxxx> Reviewed-by: Dave Jiang <dave.jiang@xxxxxxxxx> > --- > drivers/pci/pci.h | 3 +++ > drivers/pci/pcie/aer.c | 4 +++ > drivers/pci/pcie/err.c | 58 ++++++++++++++++++++++++++++++++++++++++++ > include/linux/pci.h | 3 +++ > 4 files changed, 68 insertions(+) > > diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h > index 01e51db8d285..deb193b387af 100644 > --- a/drivers/pci/pci.h > +++ b/drivers/pci/pci.h > @@ -722,6 +722,9 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, > pci_channel_state_t state, > pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev)); > > +/* CXL error reporting and handling */ > +void cxl_do_recovery(struct pci_dev *dev); > + > bool pcie_wait_for_link(struct pci_dev *pdev, bool active); > int pcie_retrain_link(struct pci_dev *pdev, bool use_lt); > > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c > index 34ec0958afff..ee38db08d005 100644 > --- a/drivers/pci/pcie/aer.c > +++ b/drivers/pci/pcie/aer.c > @@ -1012,6 +1012,8 @@ static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data) > err_handler->error_detected(dev, pci_channel_io_normal); > else if (info->severity == AER_FATAL) > err_handler->error_detected(dev, pci_channel_io_frozen); > + > + cxl_do_recovery(dev); > } > out: > device_unlock(&dev->dev); > @@ -1041,6 +1043,8 @@ static void cxl_handle_error(struct pci_dev *dev, struct aer_err_info *info) > pdrv->cxl_err_handler->cor_error_detected(dev); > > pcie_clear_device_status(dev); > + } else { > + cxl_do_recovery(dev); > } > } > > diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c > index 31090770fffc..05f2d1ef4c36 100644 > --- a/drivers/pci/pcie/err.c > +++ b/drivers/pci/pcie/err.c > @@ -24,6 +24,9 @@ > static pci_ers_result_t merge_result(enum pci_ers_result orig, > enum pci_ers_result new) > { > + if (new == PCI_ERS_RESULT_PANIC) > + return PCI_ERS_RESULT_PANIC; > + > if (new == PCI_ERS_RESULT_NO_AER_DRIVER) > return PCI_ERS_RESULT_NO_AER_DRIVER; > > @@ -276,3 +279,58 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, > > return status; > } > + > +static void cxl_walk_bridge(struct pci_dev *bridge, > + int (*cb)(struct pci_dev *, void *), > + void *userdata) > +{ > + if (cb(bridge, userdata)) > + return; > + > + if (bridge->subordinate) > + pci_walk_bus(bridge->subordinate, cb, userdata); > +} > + > +static int cxl_report_error_detected(struct pci_dev *dev, void *data) > +{ > + const struct cxl_error_handlers *cxl_err_handler; > + pci_ers_result_t vote, *result = data; > + struct pci_driver *pdrv; > + > + device_lock(&dev->dev); > + pdrv = dev->driver; > + if (!pdrv || !pdrv->cxl_err_handler || > + !pdrv->cxl_err_handler->error_detected) > + goto out; > + > + cxl_err_handler = pdrv->cxl_err_handler; > + vote = cxl_err_handler->error_detected(dev); > + *result = merge_result(*result, vote); > +out: > + device_unlock(&dev->dev); > + return 0; > +} > + > +void cxl_do_recovery(struct pci_dev *dev) > +{ > + struct pci_host_bridge *host = pci_find_host_bridge(dev->bus); > + pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER; > + > + cxl_walk_bridge(dev, cxl_report_error_detected, &status); > + if (status == PCI_ERS_RESULT_PANIC) > + panic("CXL cachemem error."); > + > + /* > + * If we have native control of AER, clear error status in the device > + * that detected the error. If the platform retained control of AER, > + * it is responsible for clearing this status. In that case, the > + * signaling device may not even be visible to the OS. > + */ > + if (host->native_aer || pcie_ports_native) { > + pcie_clear_device_status(dev); > + pci_aer_clear_nonfatal_status(dev); > + pci_aer_clear_fatal_status(dev); > + } > + > + pci_info(dev, "CXL uncorrectable error.\n"); > +} > diff --git a/include/linux/pci.h b/include/linux/pci.h > index 82a0401c58d3..5b539b5bf0d1 100644 > --- a/include/linux/pci.h > +++ b/include/linux/pci.h > @@ -864,6 +864,9 @@ enum pci_ers_result { > > /* No AER capabilities registered for the driver */ > PCI_ERS_RESULT_NO_AER_DRIVER = (__force pci_ers_result_t) 6, > + > + /* System is unstable, panic */ > + PCI_ERS_RESULT_PANIC = (__force pci_ers_result_t) 7, > }; > > /* PCI bus error event callbacks */