Hi Daniel, Thanks for reviewing. Comments inline below. -matt > On Aug 7, 2015, at 12:12 AM, Daniel Axtens <dja@xxxxxxxxxx> wrote: > > Hi, > >> @@ -1857,9 +1884,18 @@ static int cxlflash_eh_device_reset_handler(struct scsi_cmnd *scp) >> get_unaligned_be32(&((u32 *)scp->cmnd)[2]), >> get_unaligned_be32(&((u32 *)scp->cmnd)[3])); >> >> - rcr = send_tmf(afu, scp, TMF_LUN_RESET); >> - if (unlikely(rcr)) >> - rc = FAILED; >> + switch (cfg->eeh_active) { >> + case EEH_STATE_NONE: >> + rcr = send_tmf(afu, scp, TMF_LUN_RESET); >> + if (unlikely(rcr)) >> + rc = FAILED; >> + break; >> + case EEH_STATE_ACTIVE: >> + wait_event(cfg->eeh_waitq, cfg->eeh_active != EEH_STATE_ACTIVE); >> + break; >> + case EEH_STATE_FAILED: >> + break; >> + } >> > > Looking at the context here, it looks like rc gets initalised to > SUCCESS. In that case, in the EEH failed case, you'll return SUCCESS. > I'm not particularly clear on the semantics here: does that make sense? > > Likewise, in the EEH active state, when you finish the wait_event, > should you check if the EEH state went to NONE or FAILED before you > break? > > There’s a similar case below in host_reset. Good catch, this is a bug we had previously identified. Look for a fix in v4. > >> pr_debug("%s: returning rc=%d\n", __func__, rc); >> return rc; >> @@ -1889,11 +1925,23 @@ static int cxlflash_eh_host_reset_handler(struct scsi_cmnd *scp) >> get_unaligned_be32(&((u32 *)scp->cmnd)[2]), >> get_unaligned_be32(&((u32 *)scp->cmnd)[3])); >> >> - rcr = afu_reset(cfg); >> - if (rcr == 0) >> - rc = SUCCESS; >> - else >> - rc = FAILED; >> + switch (cfg->eeh_active) { >> + case EEH_STATE_NONE: >> + cfg->eeh_active = EEH_STATE_FAILED; >> + rcr = afu_reset(cfg); >> + if (rcr == 0) >> + rc = SUCCESS; >> + else >> + rc = FAILED; >> + cfg->eeh_active = EEH_STATE_NONE; >> + wake_up_all(&cfg->eeh_waitq); >> + break; >> + case EEH_STATE_ACTIVE: >> + wait_event(cfg->eeh_waitq, cfg->eeh_active != EEH_STATE_ACTIVE); >> + break; >> + case EEH_STATE_FAILED: >> + break; >> + } >> >> pr_debug("%s: returning rc=%d\n", __func__, rc); >> return rc; >> @@ -2145,6 +2193,11 @@ static void cxlflash_worker_thread(struct work_struct *work) >> int port; >> ulong lock_flags; >> >> + /* Avoid MMIO if the device has failed */ >> + >> + if (cfg->eeh_active == EEH_STATE_FAILED) >> + return; >> + > Should this check be != EEH_STATE_NONE? Or is this called within the > error recovery process? Yes, we have already swapped the logic around as you pointed out. You’ll see it in v4. > >> spin_lock_irqsave(cfg->host->host_lock, lock_flags); >> >> if (cfg->lr_state == LINK_RESET_REQUIRED) { >> @@ -2226,6 +2279,8 @@ static int cxlflash_probe(struct pci_dev *pdev, >> >> cfg->init_state = INIT_STATE_NONE; >> cfg->dev = pdev; >> + >> + cfg->eeh_active = EEH_STATE_NONE; >> cfg->dev_id = (struct pci_device_id *)dev_id; >> >> >> @@ -2286,6 +2341,85 @@ out_remove: >> goto out; >> } >> >> +/** >> + * cxlflash_pci_error_detected() - called when a PCI error is detected >> + * @pdev: PCI device struct. >> + * @state: PCI channel state. >> + * >> + * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT >> + */ >> +static pci_ers_result_t cxlflash_pci_error_detected(struct pci_dev *pdev, >> + pci_channel_state_t state) >> +{ >> + struct cxlflash_cfg *cfg = pci_get_drvdata(pdev); >> + >> + pr_debug("%s: pdev=%p state=%u\n", __func__, pdev, state); >> + >> + switch (state) { >> + case pci_channel_io_frozen: >> + cfg->eeh_active = EEH_STATE_ACTIVE; >> + udelay(100); >> + >> + term_mc(cfg, UNDO_START); >> + stop_afu(cfg); >> + >> + return PCI_ERS_RESULT_CAN_RECOVER; > > I think that should PCI_ERS_RESULT_NEED_RESET. Agreed. Will change for v4. > > Apart from that, it’s looking pretty good from an EEH perspective. Thanks. -- To unsubscribe from this list: send the line "unsubscribe linux-scsi" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html