Reviewed-by: Yaniv Gardi <ygardi@xxxxxxxxxxxxxx> QUALCOMM ISRAEL, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation = > -----Original Message----- = > From: linux-scsi-owner@xxxxxxxxxxxxxxx [mailto:linux-scsi- = > owner@xxxxxxxxxxxxxxx] On Behalf Of Dolev Raviv = > Sent: Monday, August 12, 2013 4:03 PM = > To: Sujit Reddy Thumma = > Cc: Vinayak Holikatti; Santosh Y; James E.J. Bottomley; linux- = > scsi@xxxxxxxxxxxxxxx; Sujit Reddy Thumma; linux-arm- = > msm@xxxxxxxxxxxxxxx = > Subject: Re: [PATCH V5 4/4] scsi: ufs: Improve UFS fatal error handling = > = > Tested-by: Dolev Raviv <draviv@xxxxxxxxxxxxxx> = > = > > Error handling in UFS driver is broken and resets the host controller = > > for fatal errors without re-initialization. Correct the fatal error = > > handling sequence according to UFS Host Controller Interface (HCI) = > > v1.1 specification. = > > = > > o Processed requests which are completed w/wo error are reported to = > > SCSI layer and any pending commands that are not started are aborted = > > in the controller and re-queued into scsi mid-layer queue. = > > = > > o Upon determining fatal error condition the host controller may hang = > > forever until a reset is applied. Block SCSI layer for sending new = > > requests and apply reset in a separate error handling work. = > > = > > o SCSI is informed about the expected Unit-Attention exception from the = > > device for the immediate command after a reset so that the SCSI layer = > > take necessary steps to establish communication with the device. = > > = > > Signed-off-by: Sujit Reddy Thumma <sthumma@xxxxxxxxxxxxxx> = > > --- = > > drivers/scsi/ufs/ufshcd.c | 229 = > > ++++++++++++++++++++++++++++----------------- = > > drivers/scsi/ufs/ufshcd.h | 10 ++- = > > 2 files changed, 149 insertions(+), 90 deletions(-) = > > = > > diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c = > > index c577e6e..4dca9b4 100644 = > > --- a/drivers/scsi/ufs/ufshcd.c = > > +++ b/drivers/scsi/ufs/ufshcd.c = > > @@ -79,6 +79,14 @@ enum { = > > UFSHCD_EH_IN_PROGRESS = (1 << 0), = > > }; = > > = > > +/* UFSHCD UIC layer error flags */ = > > +enum { = > > + UFSHCD_UIC_DL_PA_INIT_ERROR = (1 << 0), /* Data link layer = > error */ = > > + UFSHCD_UIC_NL_ERROR = (1 << 1), /* Network layer error */ = > > + UFSHCD_UIC_TL_ERROR = (1 << 2), /* Transport Layer error */ = > > + UFSHCD_UIC_DME_ERROR = (1 << 3), /* DME error */ }; = > > + = > > /* Interrupt configuration options */ enum { = > > UFSHCD_INT_DISABLE, = > > @@ -101,6 +109,8 @@ enum { = > > = > > static void ufshcd_tmc_handler(struct ufs_hba *hba); static void = > > ufshcd_async_scan(void *data, async_cookie_t cookie); = > > +static int ufshcd_reset_and_restore(struct ufs_hba *hba); static int = > > +ufshcd_clear_tm_cmd(struct ufs_hba *hba, int tag); = > > = > > /* = > > * ufshcd_wait_for_register - wait for register value to change @@ = > > -1523,9 +1533,6 @@ static int ufshcd_make_hba_operational(struct = > > ufs_hba *hba) = > > goto out; = > > } = > > = > > - if (hba->ufshcd_state == UFSHCD_STATE_RESET) = > > - scsi_unblock_requests(hba->host); = > > - = > > out: = > > return err; = > > } = > > @@ -1651,66 +1658,6 @@ static int ufshcd_verify_dev_init(struct = > > ufs_hba = > > *hba) = > > } = > > = > > /** = > > - * ufshcd_do_reset - reset the host controller = > > - * @hba: per adapter instance = > > - * = > > - * Returns SUCCESS/FAILED = > > - */ = > > -static int ufshcd_do_reset(struct ufs_hba *hba) -{ = > > - struct ufshcd_lrb *lrbp; = > > - unsigned long flags; = > > - int tag; = > > - = > > - /* block commands from midlayer */ = > > - scsi_block_requests(hba->host); = > > - = > > - spin_lock_irqsave(hba->host->host_lock, flags); = > > - hba->ufshcd_state = UFSHCD_STATE_RESET; = > > - = > > - /* send controller to reset state */ = > > - ufshcd_hba_stop(hba); = > > - spin_unlock_irqrestore(hba->host->host_lock, flags); = > > - = > > - /* abort outstanding commands */ = > > - for (tag = 0; tag < hba->nutrs; tag++) { = > > - if (test_bit(tag, &hba->outstanding_reqs)) { = > > - lrbp = &hba->lrb[tag]; = > > - if (lrbp->cmd) { = > > - scsi_dma_unmap(lrbp->cmd); = > > - lrbp->cmd->result = DID_RESET << 16; = > > - lrbp->cmd->scsi_done(lrbp->cmd); = > > - lrbp->cmd = NULL; = > > - clear_bit_unlock(tag, &hba->lrb_in_use); = > > - } = > > - } = > > - } = > > - = > > - /* complete device management command */ = > > - if (hba->dev_cmd.complete) = > > - complete(hba->dev_cmd.complete); = > > - = > > - /* clear outstanding request/task bit maps */ = > > - hba->outstanding_reqs = 0; = > > - hba->outstanding_tasks = 0; = > > - = > > - /* Host controller enable */ = > > - if (ufshcd_hba_enable(hba)) { = > > - dev_err(hba->dev, = > > - "Reset: Controller initialization failed\n"); = > > - return FAILED; = > > - } = > > - = > > - if (ufshcd_link_startup(hba)) { = > > - dev_err(hba->dev, = > > - "Reset: Link start-up failed\n"); = > > - return FAILED; = > > - } = > > - = > > - return SUCCESS; = > > -} = > > - = > > -/** = > > * ufshcd_slave_alloc - handle initial SCSI device configurations = > > * @sdev: pointer to SCSI device = > > * = > > @@ -1727,6 +1674,9 @@ static int ufshcd_slave_alloc(struct scsi_device = > > *sdev) = > > sdev->use_10_for_ms = 1; = > > scsi_set_tag_type(sdev, MSG_SIMPLE_TAG); = > > = > > + /* allow SCSI layer to restart the device in case of errors */ = > > + sdev->allow_restart = 1; = > > + = > > /* = > > * Inform SCSI Midlayer that the LUN queue depth is same as the = > > * controller queue depth. If a LUN queue depth is less than the = > @@ = > > -1930,6 +1880,9 @@ ufshcd_transfer_rsp_status(struct ufs_hba *hba, = > > struct ufshcd_lrb *lrbp) = > > case OCS_ABORTED: = > > result |= DID_ABORT << 16; = > > break; = > > + case OCS_INVALID_COMMAND_STATUS: = > > + result |= DID_REQUEUE << 16; = > > + break; = > > case OCS_INVALID_CMD_TABLE_ATTR: = > > case OCS_INVALID_PRDT_ATTR: = > > case OCS_MISMATCH_DATA_BUF_SIZE: = > > @@ -2241,45 +2194,145 @@ out: = > > } = > > = > > /** = > > - * ufshcd_fatal_err_handler - handle fatal errors = > > - * @hba: per adapter instance = > > + * ufshcd_err_handler - handle UFS errors that require s/w attention = > > + * @work: pointer to work structure = > > */ = > > -static void ufshcd_fatal_err_handler(struct work_struct *work) = > > +static void ufshcd_err_handler(struct work_struct *work) = > > { = > > struct ufs_hba *hba; = > > - hba = container_of(work, struct ufs_hba, feh_workq); = > > + unsigned long flags; = > > + u32 err_xfer = 0; = > > + u32 err_tm = 0; = > > + int err = 0; = > > + int tag; = > > + = > > + hba = container_of(work, struct ufs_hba, eh_work); = > > = > > pm_runtime_get_sync(hba->dev); = > > - /* check if reset is already in progress */ = > > - if (hba->ufshcd_state != UFSHCD_STATE_RESET) = > > - ufshcd_do_reset(hba); = > > + = > > + spin_lock_irqsave(hba->host->host_lock, flags); = > > + if (hba->ufshcd_state == UFSHCD_STATE_RESET) { = > > + spin_unlock_irqrestore(hba->host->host_lock, flags); = > > + goto out; = > > + } = > > + = > > + hba->ufshcd_state = UFSHCD_STATE_RESET; = > > + ufshcd_set_eh_in_progress(hba); = > > + = > > + /* Complete requests that have door-bell cleared by h/w */ = > > + ufshcd_transfer_req_compl(hba); = > > + ufshcd_tmc_handler(hba); = > > + spin_unlock_irqrestore(hba->host->host_lock, flags); = > > + = > > + /* Clear pending transfer requests */ = > > + for_each_set_bit(tag, &hba->outstanding_reqs, hba->nutrs) = > > + if (ufshcd_clear_cmd(hba, tag)) = > > + err_xfer |= 1 << tag; = > > + = > > + /* Clear pending task management requests */ = > > + for_each_set_bit(tag, &hba->outstanding_tasks, hba->nutmrs) = > > + if (ufshcd_clear_tm_cmd(hba, tag)) = > > + err_tm |= 1 << tag; = > > + = > > + /* Complete the requests that are cleared by s/w */ = > > + spin_lock_irqsave(hba->host->host_lock, flags); = > > + ufshcd_transfer_req_compl(hba); = > > + ufshcd_tmc_handler(hba); = > > + spin_unlock_irqrestore(hba->host->host_lock, flags); = > > + = > > + /* Fatal errors need reset */ = > > + if (err_xfer || err_tm || (hba->saved_err & INT_FATAL_ERRORS) || = > > + ((hba->saved_err & UIC_ERROR) && = > > + (hba->saved_uic_err & = > UFSHCD_UIC_DL_PA_INIT_ERROR))) { = > > + err = ufshcd_reset_and_restore(hba); = > > + if (err) { = > > + dev_err(hba->dev, "%s: reset and restore failed\n", = > > + __func__); = > > + hba->ufshcd_state = UFSHCD_STATE_ERROR; = > > + } = > > + /* = > > + * Inform scsi mid-layer that we did reset and allow to = > handle = > > + * Unit Attention properly. = > > + */ = > > + scsi_report_bus_reset(hba->host, 0); = > > + hba->saved_err = 0; = > > + hba->saved_uic_err = 0; = > > + } = > > + ufshcd_clear_eh_in_progress(hba); = > > + = > > +out: = > > + scsi_unblock_requests(hba->host); = > > pm_runtime_put_sync(hba->dev); = > > } = > > = > > /** = > > - * ufshcd_err_handler - Check for fatal errors = > > - * @work: pointer to a work queue structure = > > + * ufshcd_update_uic_error - check and set fatal UIC error flags. = > > + * @hba: per-adapter instance = > > */ = > > -static void ufshcd_err_handler(struct ufs_hba *hba) = > > +static void ufshcd_update_uic_error(struct ufs_hba *hba) = > > { = > > u32 reg; = > > = > > + /* PA_INIT_ERROR is fatal and needs UIC reset */ = > > + reg = ufshcd_readl(hba, = > REG_UIC_ERROR_CODE_DATA_LINK_LAYER); = > > + if (reg & UIC_DATA_LINK_LAYER_ERROR_PA_INIT) = > > + hba->uic_error |= UFSHCD_UIC_DL_PA_INIT_ERROR; = > > + = > > + /* UIC NL/TL/DME errors needs software retry */ = > > + reg = ufshcd_readl(hba, = > REG_UIC_ERROR_CODE_NETWORK_LAYER); = > > + if (reg) = > > + hba->uic_error |= UFSHCD_UIC_NL_ERROR; = > > + = > > + reg = ufshcd_readl(hba, = > REG_UIC_ERROR_CODE_TRANSPORT_LAYER); = > > + if (reg) = > > + hba->uic_error |= UFSHCD_UIC_TL_ERROR; = > > + = > > + reg = ufshcd_readl(hba, REG_UIC_ERROR_CODE_DME); = > > + if (reg) = > > + hba->uic_error |= UFSHCD_UIC_DME_ERROR; = > > + = > > + dev_dbg(hba->dev, "%s: UIC error flags = 0x%08x\n", = > > + __func__, hba->uic_error); = > > +} = > > + = > > +/** = > > + * ufshcd_check_errors - Check for errors that need s/w attention = > > + * @hba: per-adapter instance = > > + */ = > > +static void ufshcd_check_errors(struct ufs_hba *hba) { = > > + bool queue_eh_work = false; = > > + = > > if (hba->errors & INT_FATAL_ERRORS) = > > - goto fatal_eh; = > > + queue_eh_work = true; = > > = > > if (hba->errors & UIC_ERROR) { = > > - reg = ufshcd_readl(hba, = > REG_UIC_ERROR_CODE_DATA_LINK_LAYER); = > > - if (reg & UIC_DATA_LINK_LAYER_ERROR_PA_INIT) = > > - goto fatal_eh; = > > + hba->uic_error = 0; = > > + ufshcd_update_uic_error(hba); = > > + if (hba->uic_error) = > > + queue_eh_work = true; = > > } = > > - return; = > > -fatal_eh: = > > - /* handle fatal errors only when link is functional */ = > > - if (hba->ufshcd_state == UFSHCD_STATE_OPERATIONAL) { = > > - /* block commands at driver layer until error is handled */ = > > - hba->ufshcd_state = UFSHCD_STATE_ERROR; = > > - schedule_work(&hba->feh_workq); = > > + = > > + if (queue_eh_work) { = > > + /* handle fatal errors only when link is functional */ = > > + if (hba->ufshcd_state == UFSHCD_STATE_OPERATIONAL) { = > > + /* block commands from scsi mid-layer */ = > > + scsi_block_requests(hba->host); = > > + = > > + /* transfer error masks to sticky bits */ = > > + hba->saved_err |= hba->errors; = > > + hba->saved_uic_err |= hba->uic_error; = > > + = > > + hba->ufshcd_state = UFSHCD_STATE_ERROR; = > > + schedule_work(&hba->eh_work); = > > + } = > > } = > > + /* = > > + * if (!queue_eh_work) - = > > + * Other errors are either non-fatal where host recovers = > > + * itself without s/w intervention or errors that will be = > > + * handled by the SCSI core layer. = > > + */ = > > } = > > = > > /** = > > @@ -2304,7 +2357,7 @@ static void ufshcd_sl_intr(struct ufs_hba *hba, = > > u32 = > > intr_status) = > > { = > > hba->errors = UFSHCD_ERROR_MASK & intr_status; = > > if (hba->errors) = > > - ufshcd_err_handler(hba); = > > + ufshcd_check_errors(hba); = > > = > > if (intr_status & UIC_COMMAND_COMPL) = > > ufshcd_uic_cmd_compl(hba); = > > @@ -2679,12 +2732,12 @@ static int = > ufshcd_eh_host_reset_handler(struct = > > scsi_cmnd *cmd) = > > */ = > > do { = > > spin_lock_irqsave(hba->host->host_lock, flags); = > > - if (!(work_pending(&hba->feh_workq) || = > > + if (!(work_pending(&hba->eh_work) || = > > hba->ufshcd_state == = > UFSHCD_STATE_RESET)) = > > break; = > > spin_unlock_irqrestore(hba->host->host_lock, flags); = > > dev_dbg(hba->dev, "%s: reset in progress\n", __func__); = > > - flush_work_sync(&hba->feh_workq); = > > + flush_work_sync(&hba->eh_work); = > > } while (1); = > > = > > hba->ufshcd_state = UFSHCD_STATE_RESET; @@ -2918,7 +2971,7 = > @@ int = > > ufshcd_init(struct device *dev, struct ufs_hba **hba_handle, = > > init_waitqueue_head(&hba->tm_tag_wq); = > > = > > /* Initialize work queues */ = > > - INIT_WORK(&hba->feh_workq, ufshcd_fatal_err_handler); = > > + INIT_WORK(&hba->eh_work, ufshcd_err_handler); = > > INIT_WORK(&hba->eeh_work, ufshcd_exception_event_handler); = > > = > > /* Initialize UIC command mutex */ = > > diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h = > > index 1e0585c..8f5624e 100644 = > > --- a/drivers/scsi/ufs/ufshcd.h = > > +++ b/drivers/scsi/ufs/ufshcd.h = > > @@ -182,9 +182,12 @@ struct ufs_dev_cmd { = > > * @eh_flags: Error handling flags = > > * @intr_mask: Interrupt Mask Bits = > > * @ee_ctrl_mask: Exception event control mask = > > - * @feh_workq: Work queue for fatal controller error handling = > > + * @eh_work: Worker to handle UFS errors that require s/w attention = > > * @eeh_work: Worker to handle exception events = > > * @errors: HBA errors = > > + * @uic_error: UFS interconnect layer error status = > > + * @saved_err: sticky error mask = > > + * @saved_uic_err: sticky UIC error mask = > > * @dev_cmd: ufs device management command information = > > * @auto_bkops_enabled: to track whether bkops is enabled in device = > > */ = > > @@ -230,11 +233,14 @@ struct ufs_hba { = > > u16 ee_ctrl_mask; = > > = > > /* Work Queues */ = > > - struct work_struct feh_workq; = > > + struct work_struct eh_work; = > > struct work_struct eeh_work; = > > = > > /* HBA Errors */ = > > u32 errors; = > > + u32 uic_error; = > > + u32 saved_err; = > > + u32 saved_uic_err; = > > = > > /* Device management request data */ = > > struct ufs_dev_cmd dev_cmd; = > > -- = > > QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a = > > member of Code Aurora Forum, hosted by The Linux Foundation. = > > = > > -- = > > To unsubscribe from this list: send the line "unsubscribe linux-scsi" = > > in the body of a message to majordomo@xxxxxxxxxxxxxxx More = > majordomo = > > info at http://vger.kernel.org/majordomo-info.html = > > = > = > = > -- = > QUALCOMM ISRAEL, on behalf of Qualcomm Innovation Center, Inc. is a = > member of Code Aurora Forum, hosted by The Linux Foundation = > = > -- = > To unsubscribe from this list: send the line "unsubscribe linux-scsi" in the = > body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info = > at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-scsi" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html