This is a note to let you know that I've just added the patch titled scsi: qla2xxx: Wind down adapter after PCIe error to the 5.18-stable tree which can be found at: http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary The filename of the patch is: scsi-qla2xxx-wind-down-adapter-after-pcie-error.patch and it can be found in the queue-5.18 subdirectory. If you, or anyone else, feel it should not be added to the stable tree, please let <stable@xxxxxxxxxxxxxxx> know about it. From d3117c83ba316b3200d9f2fe900f2b9a5525a25c Mon Sep 17 00:00:00 2001 From: Quinn Tran <qutran@xxxxxxxxxxx> Date: Wed, 15 Jun 2022 22:35:00 -0700 Subject: scsi: qla2xxx: Wind down adapter after PCIe error From: Quinn Tran <qutran@xxxxxxxxxxx> commit d3117c83ba316b3200d9f2fe900f2b9a5525a25c upstream. Put adapter into a wind down state if OS does not make any attempt to recover the adapter after PCIe error. Link: https://lore.kernel.org/r/20220616053508.27186-4-njavali@xxxxxxxxxxx Cc: stable@xxxxxxxxxxxxxxx Signed-off-by: Quinn Tran <qutran@xxxxxxxxxxx> Signed-off-by: Nilesh Javali <njavali@xxxxxxxxxxx> Signed-off-by: Martin K. Petersen <martin.petersen@xxxxxxxxxx> Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx> --- drivers/scsi/qla2xxx/qla_bsg.c | 10 +++++++- drivers/scsi/qla2xxx/qla_def.h | 4 +++ drivers/scsi/qla2xxx/qla_init.c | 20 ++++++++++++++++ drivers/scsi/qla2xxx/qla_os.c | 48 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 81 insertions(+), 1 deletion(-) --- a/drivers/scsi/qla2xxx/qla_bsg.c +++ b/drivers/scsi/qla2xxx/qla_bsg.c @@ -2975,6 +2975,13 @@ qla24xx_bsg_timeout(struct bsg_job *bsg_ ql_log(ql_log_info, vha, 0x708b, "%s CMD timeout. 
bsg ptr %p.\n", __func__, bsg_job); + + if (qla2x00_isp_reg_stat(ha)) { + ql_log(ql_log_info, vha, 0x9007, + "PCI/Register disconnect.\n"); + qla_pci_set_eeh_busy(vha); + } + /* find the bsg job from the active list of commands */ spin_lock_irqsave(&ha->hardware_lock, flags); for (que = 0; que < ha->max_req_queues; que++) { @@ -2992,7 +2999,8 @@ qla24xx_bsg_timeout(struct bsg_job *bsg_ sp->u.bsg_job == bsg_job) { req->outstanding_cmds[cnt] = NULL; spin_unlock_irqrestore(&ha->hardware_lock, flags); - if (ha->isp_ops->abort_command(sp)) { + + if (!ha->flags.eeh_busy && ha->isp_ops->abort_command(sp)) { ql_log(ql_log_warn, vha, 0x7089, "mbx abort_command failed.\n"); bsg_reply->result = -EIO; --- a/drivers/scsi/qla2xxx/qla_def.h +++ b/drivers/scsi/qla2xxx/qla_def.h @@ -4048,6 +4048,9 @@ struct qla_hw_data { uint32_t n2n_fw_acc_sec:1; uint32_t plogi_template_valid:1; uint32_t port_isolated:1; + uint32_t eeh_flush:2; +#define EEH_FLUSH_RDY 1 +#define EEH_FLUSH_DONE 2 } flags; uint16_t max_exchg; @@ -4082,6 +4085,7 @@ struct qla_hw_data { uint32_t rsp_que_len; uint32_t req_que_off; uint32_t rsp_que_off; + unsigned long eeh_jif; /* Multi queue data structs */ device_reg_t *mqiobase; --- a/drivers/scsi/qla2xxx/qla_init.c +++ b/drivers/scsi/qla2xxx/qla_init.c @@ -47,6 +47,7 @@ qla2x00_sp_timeout(struct timer_list *t) { srb_t *sp = from_timer(sp, t, u.iocb_cmd.timer); struct srb_iocb *iocb; + scsi_qla_host_t *vha = sp->vha; WARN_ON(irqs_disabled()); iocb = &sp->u.iocb_cmd; @@ -54,6 +55,12 @@ qla2x00_sp_timeout(struct timer_list *t) /* ref: TMR */ kref_put(&sp->cmd_kref, qla2x00_sp_release); + + if (vha && qla2x00_isp_reg_stat(vha->hw)) { + ql_log(ql_log_info, vha, 0x9008, + "PCI/Register disconnect.\n"); + qla_pci_set_eeh_busy(vha); + } } void qla2x00_sp_free(srb_t *sp) @@ -9702,6 +9709,12 @@ int qla2xxx_disable_port(struct Scsi_Hos vha->hw->flags.port_isolated = 1; + if (qla2x00_isp_reg_stat(vha->hw)) { + ql_log(ql_log_info, vha, 0x9006, + "PCI/Register disconnect, 
exiting.\n"); + qla_pci_set_eeh_busy(vha); + return FAILED; + } if (qla2x00_chip_is_down(vha)) return 0; @@ -9717,6 +9730,13 @@ int qla2xxx_enable_port(struct Scsi_Host { scsi_qla_host_t *vha = shost_priv(host); + if (qla2x00_isp_reg_stat(vha->hw)) { + ql_log(ql_log_info, vha, 0x9001, + "PCI/Register disconnect, exiting.\n"); + qla_pci_set_eeh_busy(vha); + return FAILED; + } + vha->hw->flags.port_isolated = 0; /* Set the flag to 1, so that isp_abort can proceed */ vha->flags.online = 1; --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -333,6 +333,11 @@ MODULE_PARM_DESC(ql2xabts_wait_nvme, "To wait for ABTS response on I/O timeouts for NVMe. (default: 1)"); +u32 ql2xdelay_before_pci_error_handling = 5; +module_param(ql2xdelay_before_pci_error_handling, uint, 0644); +MODULE_PARM_DESC(ql2xdelay_before_pci_error_handling, + "Number of seconds delayed before qla begin PCI error self-handling (default: 5).\n"); + static void qla2x00_clear_drv_active(struct qla_hw_data *); static void qla2x00_free_device(scsi_qla_host_t *); static int qla2xxx_map_queues(struct Scsi_Host *shost); @@ -7258,6 +7263,44 @@ static void qla_heart_beat(struct scsi_q } } +static void qla_wind_down_chip(scsi_qla_host_t *vha) +{ + struct qla_hw_data *ha = vha->hw; + + if (!ha->flags.eeh_busy) + return; + if (ha->pci_error_state) + /* system is trying to recover */ + return; + + /* + * Current system is not handling PCIE error. At this point, this is + * best effort to wind down the adapter. 
+ */ + if (time_after_eq(jiffies, ha->eeh_jif + ql2xdelay_before_pci_error_handling * HZ) && + !ha->flags.eeh_flush) { + ql_log(ql_log_info, vha, 0x9009, + "PCI Error detected, attempting to reset hardware.\n"); + + ha->isp_ops->reset_chip(vha); + ha->isp_ops->disable_intrs(ha); + + ha->flags.eeh_flush = EEH_FLUSH_RDY; + ha->eeh_jif = jiffies; + + } else if (ha->flags.eeh_flush == EEH_FLUSH_RDY && + time_after_eq(jiffies, ha->eeh_jif + 5 * HZ)) { + pci_clear_master(ha->pdev); + + /* flush all command */ + qla2x00_abort_isp_cleanup(vha); + ha->flags.eeh_flush = EEH_FLUSH_DONE; + + ql_log(ql_log_info, vha, 0x900a, + "PCI Error handling complete, all IOs aborted.\n"); + } +} + /************************************************************************** * qla2x00_timer * @@ -7281,6 +7324,8 @@ qla2x00_timer(struct timer_list *t) fc_port_t *fcport = NULL; if (ha->flags.eeh_busy) { + qla_wind_down_chip(vha); + ql_dbg(ql_dbg_timer, vha, 0x6000, "EEH = %d, restarting timer.\n", ha->flags.eeh_busy); @@ -7861,6 +7906,9 @@ void qla_pci_set_eeh_busy(struct scsi_ql spin_lock_irqsave(&base_vha->work_lock, flags); if (!ha->flags.eeh_busy) { + ha->eeh_jif = jiffies; + ha->flags.eeh_flush = 0; + ha->flags.eeh_busy = 1; do_cleanup = true; } Patches currently in stable-queue which might be from qutran@xxxxxxxxxxx are queue-5.18/scsi-qla2xxx-edif-synchronize-npiv-deletion-with-aut.patch queue-5.18/scsi-qla2xxx-fix-erroneous-mailbox-timeout-after-pci-error-injection.patch queue-5.18/scsi-qla2xxx-edif-add-bsg-interface-to-read-doorbell.patch queue-5.18/scsi-qla2xxx-edif-fix-no-logout-on-delete-for-n2n.patch queue-5.18/scsi-qla2xxx-edif-reduce-disruption-due-to-multiple-.patch queue-5.18/scsi-qla2xxx-edif-reduce-initiator-initiator-thrashi.patch queue-5.18/scsi-qla2xxx-edif-tear-down-session-if-keys-have-bee.patch queue-5.18/scsi-qla2xxx-edif-fix-no-login-after-app-start.patch queue-5.18/scsi-qla2xxx-edif-fix-n2n-login-retry-for-secure-dev.patch 
queue-5.18/scsi-qla2xxx-edif-fix-potential-stuck-session-in-sa-.patch queue-5.18/scsi-qla2xxx-edif-fix-session-thrash.patch queue-5.18/scsi-qla2xxx-edif-fix-n2n-discovery-issue-with-secur.patch queue-5.18/scsi-qla2xxx-zero-undefined-mailbox-in-registers.patch queue-5.18/scsi-qla2xxx-fix-imbalance-vha-vref_count.patch queue-5.18/scsi-qla2xxx-edif-bsg-refactor.patch queue-5.18/scsi-qla2xxx-wind-down-adapter-after-pcie-error.patch queue-5.18/scsi-qla2xxx-edif-reduce-n2n-thrashing-at-app_start-.patch queue-5.18/scsi-qla2xxx-edif-send-logo-for-unexpected-ike-messa.patch queue-5.18/scsi-qla2xxx-edif-add-retry-for-els-passthrough.patch queue-5.18/scsi-qla2xxx-turn-off-multi-queue-for-8g-adapters.patch queue-5.18/scsi-qla2xxx-edif-wait-for-app-to-ack-on-sess-down.patch