Patch [1] ("PCI/ERR: Run error recovery callbacks for all affected devices") broke the non-fatal error handling logic introduced by patch [2]. For a non-fatal error the link is still reliable, so there is no need to reset the link, and handling the non-fatal error for all subordinates seems incorrect. Restore the non-fatal error processing logic. [1] PCI/ERR: Run error recovery callbacks for all affected devices #4.20 https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=bfcb79fca19d267712e425af1dd48812c40dec0c [2] PCI/AER: Report non-fatal errors only to the affected endpoint #4.15 https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=v5.0-rc2&id=86acc790717fb60fb51ea3095084e331d8711c74 Fixes: bfcb79fca19d ("PCI/ERR: Run error recovery callbacks for all affected devices") Reported-by: Xiaofei Tan <tanxiaofei@xxxxxxxxxx> Signed-off-by: Dongdong Liu <liudongdong3@xxxxxxxxxx> Cc: Keith Busch <keith.busch@xxxxxxxxx> Cc: Bjorn Helgaas <bhelgaas@xxxxxxxxxx> --- drivers/pci/pcie/err.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index 773197a..9de3880 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -187,7 +187,8 @@ void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state, u32 service) { pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER; - struct pci_bus *bus; + struct pci_bus *bus = dev->bus; + struct pci_dev *bridge = dev; /* * Error recovery runs on all subordinates of the first downstream port. 
@@ -195,23 +196,33 @@ void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state, */ if (!(pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT || pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM)) - dev = dev->bus->self; - bus = dev->subordinate; + bridge = bus->self; + + if (bridge) + bus = bridge->subordinate; pci_dbg(dev, "broadcast error_detected message\n"); if (state == pci_channel_io_frozen) pci_walk_bus(bus, report_frozen_detected, &status); - else - pci_walk_bus(bus, report_normal_detected, &status); + else { + if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) + report_normal_detected(dev, &status); + else + pci_walk_bus(bus, report_normal_detected, &status); + } if (state == pci_channel_io_frozen && - reset_link(dev, service) != PCI_ERS_RESULT_RECOVERED) + reset_link(bridge, service) != PCI_ERS_RESULT_RECOVERED) goto failed; if (status == PCI_ERS_RESULT_CAN_RECOVER) { status = PCI_ERS_RESULT_RECOVERED; pci_dbg(dev, "broadcast mmio_enabled message\n"); - pci_walk_bus(bus, report_mmio_enabled, &status); + if (state == pci_channel_io_normal && + dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) + report_mmio_enabled(dev, &status); + else + pci_walk_bus(bus, report_mmio_enabled, &status); } if (status == PCI_ERS_RESULT_NEED_RESET) { @@ -222,14 +233,22 @@ void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state, */ status = PCI_ERS_RESULT_RECOVERED; pci_dbg(dev, "broadcast slot_reset message\n"); - pci_walk_bus(bus, report_slot_reset, &status); + if (state == pci_channel_io_normal && + dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) + report_slot_reset(dev, &status); + else + pci_walk_bus(bus, report_slot_reset, &status); } if (status != PCI_ERS_RESULT_RECOVERED) goto failed; pci_dbg(dev, "broadcast resume message\n"); - pci_walk_bus(bus, report_resume, &status); + if (state == pci_channel_io_normal && + dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) + report_resume(dev, &status); + else + pci_walk_bus(bus, report_resume, &status); pci_aer_clear_device_status(dev); 
pci_cleanup_aer_uncorrect_error_status(dev); -- 1.9.1