On Thu, Sep 19, 2013 at 4:44 PM, Sujit Reddy Thumma <sthumma@xxxxxxxxxxxxxx> wrote: > As of now SCSI initiated error handling is broken because, > the reset APIs don't try to bring back the device initialized and > ready for further transfers. > > In case of timeouts, the scsi error handler takes care of handling aborts > and resets. Improve the error handling in such scenario by resetting the > device and host and re-initializing them in proper manner. > > Signed-off-by: Sujit Reddy Thumma <sthumma@xxxxxxxxxxxxxx> > Reviewed-by: Yaniv Gardi <ygardi@xxxxxxxxxxxxxx> > Tested-by: Dolev Raviv <draviv@xxxxxxxxxxxxxx> > --- > drivers/scsi/ufs/ufshcd.c | 240 ++++++++++++++++++++++++++++++++++++---------- > drivers/scsi/ufs/ufshcd.h | 2 + > 2 files changed, 189 insertions(+), 53 deletions(-) > > diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c > index 52f66e4..5462310 100644 > --- a/drivers/scsi/ufs/ufshcd.c > +++ b/drivers/scsi/ufs/ufshcd.c > @@ -74,9 +74,14 @@ enum { > > /* UFSHCD states */ > enum { > - UFSHCD_STATE_OPERATIONAL, > UFSHCD_STATE_RESET, > UFSHCD_STATE_ERROR, > + UFSHCD_STATE_OPERATIONAL, > +}; > + > +/* UFSHCD error handling flags */ > +enum { > + UFSHCD_EH_IN_PROGRESS = (1 << 0), > }; > > /* Interrupt configuration options */ > @@ -86,6 +91,16 @@ enum { > UFSHCD_INT_CLEAR, > }; > > +#define ufshcd_set_eh_in_progress(h) \ > + (h->eh_flags |= UFSHCD_EH_IN_PROGRESS) > +#define ufshcd_eh_in_progress(h) \ > + (h->eh_flags & UFSHCD_EH_IN_PROGRESS) > +#define ufshcd_clear_eh_in_progress(h) \ > + (h->eh_flags &= ~UFSHCD_EH_IN_PROGRESS) > + > +static void ufshcd_tmc_handler(struct ufs_hba *hba); > +static void ufshcd_async_scan(void *data, async_cookie_t cookie); > + > /* > * ufshcd_wait_for_register - wait for register value to change > * @hba - per-adapter interface > @@ -856,10 +871,25 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd) > > tag = cmd->request->tag; > > - if (hba->ufshcd_state != UFSHCD_STATE_OPERATIONAL) { > + spin_lock_irqsave(hba->host->host_lock, flags); > + switch (hba->ufshcd_state) { > + case UFSHCD_STATE_OPERATIONAL: > + break; > + case UFSHCD_STATE_RESET: > err = SCSI_MLQUEUE_HOST_BUSY; > - goto out; > + goto out_unlock; > + case UFSHCD_STATE_ERROR: > + set_host_byte(cmd, DID_ERROR); > + cmd->scsi_done(cmd); > + goto out_unlock; > + default: > + dev_WARN_ONCE(hba->dev, 1, "%s: invalid state %d\n", > + __func__, hba->ufshcd_state); > + set_host_byte(cmd, DID_BAD_TARGET); > + cmd->scsi_done(cmd); > + goto out_unlock; > } > + spin_unlock_irqrestore(hba->host->host_lock, flags); > > /* acquire the tag to make sure device cmds don't use it */ > if (test_and_set_bit_lock(tag, &hba->lrb_in_use)) { > @@ -896,6 +926,7 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd) > /* issue command to the controller */ > spin_lock_irqsave(hba->host->host_lock, flags); > ufshcd_send_command(hba, tag); > +out_unlock: > spin_unlock_irqrestore(hba->host->host_lock, flags); > out: > return err; > @@ -1707,8 +1738,6 @@ static int ufshcd_make_hba_operational(struct ufs_hba *hba) > if (hba->ufshcd_state == UFSHCD_STATE_RESET) > scsi_unblock_requests(hba->host); > > - hba->ufshcd_state = UFSHCD_STATE_OPERATIONAL; > - > out: > return err; > } > @@ -2455,8 +2484,12 @@ static void ufshcd_err_handler(struct ufs_hba *hba) > } > return; > fatal_eh: > - hba->ufshcd_state = UFSHCD_STATE_ERROR; > - schedule_work(&hba->feh_workq); > + /* handle fatal errors only when link is functional */ > + if (hba->ufshcd_state == UFSHCD_STATE_OPERATIONAL) { > + /* block commands at driver layer until error is handled */ > + hba->ufshcd_state = UFSHCD_STATE_ERROR; > + schedule_work(&hba->feh_workq); > + } > } > > /** > @@ -2621,12 +2654,13 @@ static int ufshcd_issue_tm_cmd(struct ufs_hba *hba, int lun_id, int task_id, > } > > /** > - * ufshcd_device_reset - reset device and abort all the pending commands > + * ufshcd_eh_device_reset_handler - device reset handler registered to > + * scsi layer. > * @cmd: SCSI command pointer > * > * Returns SUCCESS/FAILED > */ > -static int ufshcd_device_reset(struct scsi_cmnd *cmd) > +static int ufshcd_eh_device_reset_handler(struct scsi_cmnd *cmd) > { > struct Scsi_Host *host; > struct ufs_hba *hba; > @@ -2635,6 +2669,7 @@ static int ufshcd_device_reset(struct scsi_cmnd *cmd) > int err; > u8 resp = 0xF; > struct ufshcd_lrb *lrbp; > + unsigned long flags; > > host = cmd->device->host; > hba = shost_priv(host); > @@ -2643,55 +2678,33 @@ static int ufshcd_device_reset(struct scsi_cmnd *cmd) > lrbp = &hba->lrb[tag]; > err = ufshcd_issue_tm_cmd(hba, lrbp->lun, 0, UFS_LOGICAL_RESET, &resp); > if (err || resp != UPIU_TASK_MANAGEMENT_FUNC_COMPL) { > - err = FAILED; > + if (!err) > + err = resp; > goto out; > - } else { > - err = SUCCESS; > } > > - for (pos = 0; pos < hba->nutrs; pos++) { > - if (test_bit(pos, &hba->outstanding_reqs) && > - (hba->lrb[tag].lun == hba->lrb[pos].lun)) { > - > - /* clear the respective UTRLCLR register bit */ > - ufshcd_utrl_clear(hba, pos); > - > - clear_bit(pos, &hba->outstanding_reqs); > - > - if (hba->lrb[pos].cmd) { > - scsi_dma_unmap(hba->lrb[pos].cmd); > - hba->lrb[pos].cmd->result = > - DID_ABORT << 16; > - hba->lrb[pos].cmd->scsi_done(cmd); > - hba->lrb[pos].cmd = NULL; > - clear_bit_unlock(pos, &hba->lrb_in_use); > - wake_up(&hba->dev_cmd.tag_wq); > - } > + /* clear the commands that were pending for corresponding LUN */ > + for_each_set_bit(pos, &hba->outstanding_reqs, hba->nutrs) { > + if (hba->lrb[pos].lun == lrbp->lun) { > + err = ufshcd_clear_cmd(hba, pos); > + if (err) > + break; > } > - } /* end of for */ > + } > + spin_lock_irqsave(host->host_lock, flags); > + ufshcd_transfer_req_compl(hba); > + spin_unlock_irqrestore(host->host_lock, flags); > out: > + if (!err) { > + err = SUCCESS; > + } else { > + dev_err(hba->dev, "%s: failed with err %d\n", __func__, err); > + err = FAILED; > + } > return err; > } > > /** > - * ufshcd_host_reset - Main reset function registered with scsi layer > - * @cmd: SCSI command pointer > - * > - * Returns SUCCESS/FAILED > - */ > -static int ufshcd_host_reset(struct scsi_cmnd *cmd) > -{ > - struct ufs_hba *hba; > - > - hba = shost_priv(cmd->device->host); > - > - if (hba->ufshcd_state == UFSHCD_STATE_RESET) > - return SUCCESS; > - > - return ufshcd_do_reset(hba); > -} > - > -/** > * ufshcd_abort - abort a specific command > * @cmd: SCSI command pointer > * > @@ -2789,6 +2802,122 @@ out: > } > > /** > + * ufshcd_host_reset_and_restore - reset and restore host controller > + * @hba: per-adapter instance > + * > + * Note that host controller reset may issue DME_RESET to > + * local and remote (device) Uni-Pro stack and the attributes > + * are reset to default state. > + * > + * Returns zero on success, non-zero on failure > + */ > +static int ufshcd_host_reset_and_restore(struct ufs_hba *hba) > +{ > + int err; > + async_cookie_t cookie; > + unsigned long flags; > + > + /* Reset the host controller */ > + spin_lock_irqsave(hba->host->host_lock, flags); > + ufshcd_hba_stop(hba); > + spin_unlock_irqrestore(hba->host->host_lock, flags); > + > + err = ufshcd_hba_enable(hba); > + if (err) > + goto out; > + > + /* Establish the link again and restore the device */ > + cookie = async_schedule(ufshcd_async_scan, hba); > + /* wait for async scan to be completed */ > + async_synchronize_cookie(++cookie); > + if (hba->ufshcd_state != UFSHCD_STATE_OPERATIONAL) > + err = -EIO; > +out: > + if (err) > + dev_err(hba->dev, "%s: Host init failed %d\n", __func__, err); > + > + return err; > +} > + > +/** > + * ufshcd_reset_and_restore - reset and re-initialize host/device > + * @hba: per-adapter instance > + * > + * Reset and recover device, host and re-establish link. This > + * is helpful to recover the communication in fatal error conditions. > + * > + * Returns zero on success, non-zero on failure > + */ > +static int ufshcd_reset_and_restore(struct ufs_hba *hba) > +{ > + int err = 0; > + unsigned long flags; > + > + err = ufshcd_host_reset_and_restore(hba); > + > + /* > + * After reset the door-bell might be cleared, complete > + * outstanding requests in s/w here. > + */ > + spin_lock_irqsave(hba->host->host_lock, flags); > + ufshcd_transfer_req_compl(hba); > + ufshcd_tmc_handler(hba); > + spin_unlock_irqrestore(hba->host->host_lock, flags); > + > + return err; > +} > + > +/** > + * ufshcd_eh_host_reset_handler - host reset handler registered to scsi layer > + * @cmd - SCSI command pointer > + * > + * Returns SUCCESS/FAILED > + */ > +static int ufshcd_eh_host_reset_handler(struct scsi_cmnd *cmd) > +{ > + int err; > + unsigned long flags; > + struct ufs_hba *hba; > + > + hba = shost_priv(cmd->device->host); > + > + /* > + * Check if there is any race with fatal error handling. > + * If so, wait for it to complete. Even though fatal error > + * handling does reset and restore in some cases, don't assume > + * anything out of it. We are just avoiding race here. > + */ > + do { > + spin_lock_irqsave(hba->host->host_lock, flags); > + if (!(work_pending(&hba->feh_workq) || > + hba->ufshcd_state == UFSHCD_STATE_RESET)) > + break; > + spin_unlock_irqrestore(hba->host->host_lock, flags); > + dev_dbg(hba->dev, "%s: reset in progress\n", __func__); > + flush_work(&hba->feh_workq); > + } while (1); > + > + hba->ufshcd_state = UFSHCD_STATE_RESET; > + ufshcd_set_eh_in_progress(hba); > + spin_unlock_irqrestore(hba->host->host_lock, flags); > + > + err = ufshcd_reset_and_restore(hba); > + > + spin_lock_irqsave(hba->host->host_lock, flags); > + if (!err) { > + err = SUCCESS; > + hba->ufshcd_state = UFSHCD_STATE_OPERATIONAL; > + } else { > + err = FAILED; > + hba->ufshcd_state = UFSHCD_STATE_ERROR; > + } > + ufshcd_clear_eh_in_progress(hba); > + spin_unlock_irqrestore(hba->host->host_lock, flags); > + > + return err; > +} > + > +/** > * ufshcd_async_scan - asynchronous execution for link startup > * @data: data pointer to pass to this function > * @cookie: cookie data > @@ -2813,8 +2942,13 @@ static void ufshcd_async_scan(void *data, async_cookie_t cookie) > goto out; > > ufshcd_force_reset_auto_bkops(hba); > - scsi_scan_host(hba->host); > - pm_runtime_put_sync(hba->dev); > + hba->ufshcd_state = UFSHCD_STATE_OPERATIONAL; > + > + /* If we are in error handling context no need to scan the host */ > + if (!ufshcd_eh_in_progress(hba)) { > + scsi_scan_host(hba->host); > + pm_runtime_put_sync(hba->dev); > + } > out: > return; > } > @@ -2827,8 +2961,8 @@ static struct scsi_host_template ufshcd_driver_template = { > .slave_alloc = ufshcd_slave_alloc, > .slave_destroy = ufshcd_slave_destroy, > .eh_abort_handler = ufshcd_abort, > - .eh_device_reset_handler = ufshcd_device_reset, > - .eh_host_reset_handler = ufshcd_host_reset, > + .eh_device_reset_handler = ufshcd_eh_device_reset_handler, > + .eh_host_reset_handler = ufshcd_eh_host_reset_handler, > .this_id = -1, > .sg_tablesize = SG_ALL, > .cmd_per_lun = UFSHCD_CMD_PER_LUN, > diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h > index 84d09d1..48c7d9b 100644 > --- a/drivers/scsi/ufs/ufshcd.h > +++ b/drivers/scsi/ufs/ufshcd.h > @@ -180,6 +180,7 @@ struct ufs_dev_cmd { > * @pwr_done: completion for power mode change > * @tm_condition: condition variable for task management > * @ufshcd_state: UFSHCD states > + * @eh_flags: Error handling flags > * @intr_mask: Interrupt Mask Bits > * @ee_ctrl_mask: Exception event control mask > * @feh_workq: Work queue for fatal controller error handling > @@ -227,6 +228,7 @@ struct ufs_hba { > struct completion *pwr_done; > > u32 ufshcd_state; > + u32 eh_flags; > u32 intr_mask; > u16 ee_ctrl_mask; > Acked-by: Vinayak Holikatti <vinholikatti@xxxxxxxxx> -- To unsubscribe from this list: send the line "unsubscribe linux-scsi" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html