On 10/25/24 2:02 PM, Terry Bowman wrote: > The AER service driver doesn't currently handle CXL protocol errors > reported by CXL root ports, CXL upstream switch ports, and CXL downstream > switch ports. Consequently, RAS protocol errors from CXL PCIe port devices > are not properly logged or handled. > > These errors are reported to the OS via the root port's AER correctable > and uncorrectable internal error fields [1]. While the AER driver supports > handling downstream port protocol errors in restricted CXL host (RCH) mode, > also known as CXL 1.1, it lacks the same functionality for CXL PCIe ports > operating in virtual hierarchy (VH) mode. > > To address this gap, update the AER driver to handle CXL PCIe port device > protocol correctable errors (CE). > > Make this update alongside the existing downstream port RCH error handling > logic, extending support to CXL PCIe ports in VH mode. > > is_internal_error() is currently only available when the CONFIG_PCIEAER_CXL > kernel config is enabled. Move is_internal_error()'s definition so that it is > always available regardless of whether the CONFIG_PCIEAER_CXL kernel config is > enabled or disabled. > > The uncorrectable error (UCE) handling will be added in a future patch. 
> > [1] CXL 3.1 Spec, 12.2.2 CXL Root Ports, Downstream Switch Ports, and > Upstream Switch Ports > > Signed-off-by: Terry Bowman <terry.bowman@xxxxxxx> With the commit log update from what Jonathan suggested, Reviewed-by: Dave Jiang <dave.jiang@xxxxxxxxx> > --- > drivers/pci/pcie/aer.c | 59 ++++++++++++++++++++++++++++-------------- > 1 file changed, 39 insertions(+), 20 deletions(-) > > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c > index 53e9a11f6c0f..1d3e5b929661 100644 > --- a/drivers/pci/pcie/aer.c > +++ b/drivers/pci/pcie/aer.c > @@ -941,8 +941,15 @@ static bool find_source_device(struct pci_dev *parent, > return true; > } > > -#ifdef CONFIG_PCIEAER_CXL > +static bool is_internal_error(struct aer_err_info *info) > +{ > + if (info->severity == AER_CORRECTABLE) > + return info->status & PCI_ERR_COR_INTERNAL; > > + return info->status & PCI_ERR_UNC_INTN; > +} > + > +#ifdef CONFIG_PCIEAER_CXL > /** > * pci_aer_unmask_internal_errors - unmask internal errors > * @dev: pointer to the pcie_dev data structure > @@ -994,14 +1001,6 @@ static bool cxl_error_is_native(struct pci_dev *dev) > return (pcie_ports_native || host->native_aer); > } > > -static bool is_internal_error(struct aer_err_info *info) > -{ > - if (info->severity == AER_CORRECTABLE) > - return info->status & PCI_ERR_COR_INTERNAL; > - > - return info->status & PCI_ERR_UNC_INTN; > -} > - > static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data) > { > struct aer_err_info *info = (struct aer_err_info *)data; > @@ -1033,14 +1032,23 @@ static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data) > > static void cxl_handle_error(struct pci_dev *dev, struct aer_err_info *info) > { > - /* > - * Internal errors of an RCEC indicate an AER error in an > - * RCH's downstream port. Check and handle them in the CXL.mem > - * device driver. 
> - */ > - if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC && > - is_internal_error(info)) > + if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC) > pcie_walk_rcec(dev, cxl_rch_handle_error_iter, info); > + > + if (info->severity == AER_CORRECTABLE) { > + struct pci_driver *pdrv = dev->driver; > + int aer = dev->aer_cap; > + > + if (aer) > + pci_write_config_dword(dev, aer + PCI_ERR_COR_STATUS, > + info->status); > + > + if (pdrv && pdrv->cxl_err_handler && > + pdrv->cxl_err_handler->cor_error_detected) > + pdrv->cxl_err_handler->cor_error_detected(dev); > + > + pcie_clear_device_status(dev); > + } > } > > static int handles_cxl_error_iter(struct pci_dev *dev, void *data) > @@ -1058,9 +1066,13 @@ static bool handles_cxl_errors(struct pci_dev *dev) > { > bool handles_cxl = false; > > - if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC && > - pcie_aer_is_native(dev)) > + if (!pcie_aer_is_native(dev)) > + return false; > + > + if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC) > pcie_walk_rcec(dev, handles_cxl_error_iter, &handles_cxl); > + else > + handles_cxl = pcie_is_cxl_port(dev); > > return handles_cxl; > } > @@ -1078,6 +1090,10 @@ static void cxl_enable_internal_errors(struct pci_dev *dev) > static inline void cxl_enable_internal_errors(struct pci_dev *dev) { } > static inline void cxl_handle_error(struct pci_dev *dev, > struct aer_err_info *info) { } > +static bool handles_cxl_errors(struct pci_dev *dev) > +{ > + return false; > +} > #endif > > /** > @@ -1115,8 +1131,11 @@ static void pci_aer_handle_error(struct pci_dev *dev, struct aer_err_info *info) > > static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info) > { > - cxl_handle_error(dev, info); > - pci_aer_handle_error(dev, info); > + if (is_internal_error(info) && handles_cxl_errors(dev)) > + cxl_handle_error(dev, info); > + else > + pci_aer_handle_error(dev, info); > + > pci_dev_put(dev); > } >