When processing an Advisory Non-Fatal Error, first check the Device Status register. If either the Fatal or the Non-Fatal Error Detected bit is set, leave it to the uncorrectable error handler to clear the UE status bit; that handler should be executed right after the CE handler in this case. Otherwise, filter out the uncorrectable errors that cannot trigger an Advisory Non-Fatal Error, then clear all the remaining status bits. Reviewed-by: "Tsaur, Erwin" <erwin.tsaur@xxxxxxxxx> Signed-off-by: "Wang, Qingshun" <qingshun.wang@xxxxxxxxxxxxxxx> --- drivers/pci/pcie/aer.c | 58 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 9311323a2391..86e7cfd71f23 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -107,6 +107,12 @@ struct aer_stats { PCI_ERR_ROOT_MULTI_COR_RCV | \ PCI_ERR_ROOT_MULTI_UNCOR_RCV) +#define AER_ERR_ANFE_UNC_MASK (PCI_ERR_UNC_POISON_TLP | \ + PCI_ERR_UNC_COMP_TIME | \ + PCI_ERR_UNC_COMP_ABORT | \ + PCI_ERR_UNC_UNX_COMP | \ + PCI_ERR_UNC_UNSUP) + static int pcie_aer_disable; static pci_ers_result_t aer_root_reset(struct pci_dev *dev); @@ -612,6 +618,29 @@ const struct attribute_group aer_stats_attr_group = { .is_visible = aer_stats_attrs_are_visible, }; +static int anfe_get_related_err(struct aer_err_info *info) +{ + /* + * Take the most conservative route here. If there are + * Non-Fatal/Fatal errors detected, do not assume any + * bit in uncor_status is set by ANFE. + */ + if (info->device_status & (PCI_EXP_DEVSTA_NFED | PCI_EXP_DEVSTA_FED)) + return 0; + /* + * An UNCOR error may cause Advisory Non-Fatal error if: + * a. The severity of the error is Non-Fatal. + * b. The error is one of the following: + * 1. Poisoned TLP + * 2. Completion Timeout + * 3. Completer Abort + * 4. Unexpected Completion + * 5.
Unsupported Request + */ + return info->uncor_status & ~info->uncor_mask + & AER_ERR_ANFE_UNC_MASK & ~info->severity; +} + static void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info) { @@ -678,6 +707,7 @@ static void __aer_print_error(struct pci_dev *dev, struct aer_err_info *info) { unsigned long status; + unsigned long anfe_status; const char **strings; const char *level, *errmsg; int i; @@ -700,6 +730,21 @@ static void __aer_print_error(struct pci_dev *dev, pci_printk(level, dev, " [%2d] %-22s%s\n", i, errmsg, info->first_error == i ? " (First)" : ""); } + + if (info->severity == AER_CORRECTABLE && (status & PCI_ERR_COR_ADV_NFAT)) { + anfe_status = anfe_get_related_err(info); + if (anfe_status) { + pci_printk(level, dev, "Uncorrectable errors that may cause Advisory Non-Fatal:"); + for_each_set_bit(i, &anfe_status, 32) { + errmsg = aer_uncorrectable_error_string[i]; + if (!errmsg) + errmsg = "Unknown Error Bit"; + + pci_printk(level, dev, " [%2d] %-22s\n", i, errmsg); + } + } + } + pci_dev_aer_stats_incr(dev, info); } @@ -1092,6 +1137,14 @@ static inline void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info) { } #endif +static void handle_advisory_nonfatal(struct pci_dev *dev, struct aer_err_info *info) +{ + int aer = dev->aer_cap; + + pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, + anfe_get_related_err(info)); +} + /** * pci_aer_handle_error - handle logging error into an event log * @dev: pointer to pci_dev data structure of error source device @@ -1108,9 +1161,12 @@ static void pci_aer_handle_error(struct pci_dev *dev, struct aer_err_info *info) * Correctable error does not need software intervention. * No need to go through error recovery process. 
*/ - if (aer) + if (aer) { pci_write_config_dword(dev, aer + PCI_ERR_COR_STATUS, info->cor_status); + if (info->cor_status & PCI_ERR_COR_ADV_NFAT) + handle_advisory_nonfatal(dev, info); + } if (pcie_aer_is_native(dev)) { struct pci_driver *pdrv = dev->driver; -- 2.42.0