Re: [PATCH V3 3/4] scsi: ufs: Fix device and host reset methods

Sujit Reddy Thumma <sthumma@xxxxxxxxxxxxxx> · Tue, 23 Jul 2013 21:10:56 +0530

On 7/23/2013 1:57 PM, Seungwon Jeon wrote:
> On Sat, July 20, 2013, Sujit Reddy Thumma wrote:
>> On 7/19/2013 7:27 PM, Seungwon Jeon wrote:
>>> On Tue, July 09, 2013, Sujit Reddy Thumma wrote:
>>>> As of now SCSI initiated error handling is broken because,
>>>> the reset APIs don't try to bring back the device initialized and
>>>> ready for further transfers.
>>>>
>>>> In case of timeouts, the scsi error handler takes care of handling aborts
>>>> and resets. Improve the error handling in such scenario by resetting the
>>>> device and host and re-initializing them in proper manner.
>>>>
>>>> Signed-off-by: Sujit Reddy Thumma <sthumma@xxxxxxxxxxxxxx>
>>>> ---
>>>>    drivers/scsi/ufs/ufshcd.c |  467 +++++++++++++++++++++++++++++++++++++++------
>>>>    drivers/scsi/ufs/ufshcd.h |    2 +
>>>>    2 files changed, 411 insertions(+), 58 deletions(-)
>>>>
>>>> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
>>>> index 51ce096..b4c9910 100644
>>>> --- a/drivers/scsi/ufs/ufshcd.c
>>>> +++ b/drivers/scsi/ufs/ufshcd.c
>>>> @@ -69,9 +69,15 @@ enum {
>>>>
>>>>    /* UFSHCD states */
>>>>    enum {
>>>> -	UFSHCD_STATE_OPERATIONAL,
>>>>    	UFSHCD_STATE_RESET,
>>>>    	UFSHCD_STATE_ERROR,
>>>> +	UFSHCD_STATE_OPERATIONAL,
>>>> +};
>>>> +
>>>> +/* UFSHCD error handling flags */
>>>> +enum {
>>>> +	UFSHCD_EH_HOST_RESET_PENDING = (1 << 0),
>>>> +	UFSHCD_EH_DEVICE_RESET_PENDING = (1 << 1),
>>>>    };
>>>>
>>>>    /* Interrupt configuration options */
>>>> @@ -87,6 +93,22 @@ enum {
>>>>    	INT_AGGR_CONFIG,
>>>>    };
>>>>
>>>> +#define ufshcd_set_device_reset_pending(h) \
>>>> +	(h->eh_flags |= UFSHCD_EH_DEVICE_RESET_PENDING)
>>>> +#define ufshcd_set_host_reset_pending(h) \
>>>> +	(h->eh_flags |= UFSHCD_EH_HOST_RESET_PENDING)
>>>> +#define ufshcd_device_reset_pending(h) \
>>>> +	(h->eh_flags & UFSHCD_EH_DEVICE_RESET_PENDING)
>>>> +#define ufshcd_host_reset_pending(h) \
>>>> +	(h->eh_flags & UFSHCD_EH_HOST_RESET_PENDING)
>>>> +#define ufshcd_clear_device_reset_pending(h) \
>>>> +	(h->eh_flags &= ~UFSHCD_EH_DEVICE_RESET_PENDING)
>>>> +#define ufshcd_clear_host_reset_pending(h) \
>>>> +	(h->eh_flags &= ~UFSHCD_EH_HOST_RESET_PENDING)
>>>> +
>>>> +static void ufshcd_tmc_handler(struct ufs_hba *hba);
>>>> +static void ufshcd_async_scan(void *data, async_cookie_t cookie);
>>>> +
>>>>    /*
>>>>     * ufshcd_wait_for_register - wait for register value to change
>>>>     * @hba - per-adapter interface
>>>> @@ -851,9 +873,22 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
>>>>
>>>>    	tag = cmd->request->tag;
>>>>
>>>> -	if (hba->ufshcd_state != UFSHCD_STATE_OPERATIONAL) {
>>>> +	switch (hba->ufshcd_state) {
>>> Lock is no needed for ufshcd_state?
> Please check?

Yes, it is needed. Thanks for catching this.

> 
>>>
>>>> +	case UFSHCD_STATE_OPERATIONAL:
>>>> +		break;
>>>> +	case UFSHCD_STATE_RESET:
>>>>    		err = SCSI_MLQUEUE_HOST_BUSY;
>>>>    		goto out;
>>>> +	case UFSHCD_STATE_ERROR:
>>>> +		set_host_byte(cmd, DID_ERROR);
>>>> +		cmd->scsi_done(cmd);
>>>> +		goto out;
>>>> +	default:
>>>> +		dev_WARN_ONCE(hba->dev, 1, "%s: invalid state %d\n",
>>>> +				__func__, hba->ufshcd_state);
>>>> +		set_host_byte(cmd, DID_BAD_TARGET);
>>>> +		cmd->scsi_done(cmd);
>>>> +		goto out;
>>>>    	}
>>>>
>>>>    	/* acquire the tag to make sure device cmds don't use it */
>>>> @@ -1573,8 +1608,6 @@ static int ufshcd_make_hba_operational(struct ufs_hba *hba)
>>>>    	if (hba->ufshcd_state == UFSHCD_STATE_RESET)
>>>>    		scsi_unblock_requests(hba->host);
>>>>
>>>> -	hba->ufshcd_state = UFSHCD_STATE_OPERATIONAL;
>>>> -
>>>>    out:
>>>>    	return err;
>>>>    }
>>>> @@ -2273,6 +2306,106 @@ out:
>>>>    }
>>>>
>>>>    /**
>>>> + * ufshcd_utrl_is_rsr_enabled - check if run-stop register is enabled
>>>> + * @hba: per-adapter instance
>>>> + */
>>>> +static bool ufshcd_utrl_is_rsr_enabled(struct ufs_hba *hba)
>>>> +{
>>>> +	return ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_LIST_RUN_STOP) & 0x1;
>>>> +}
>>>> +
>>>> +/**
>>>> + * ufshcd_utmrl_is_rsr_enabled - check if run-stop register is enabled
>>>> + * @hba: per-adapter instance
>>>> + */
>>>> +static bool ufshcd_utmrl_is_rsr_enabled(struct ufs_hba *hba)
>>>> +{
>>>> +	return ufshcd_readl(hba, REG_UTP_TASK_REQ_LIST_RUN_STOP) & 0x1;
>>>> +}
>>>> +
>>>> +/**
>>>> + * ufshcd_complete_pending_tasks - complete outstanding tasks
>>>> + * @hba: per adapter instance
>>>> + *
>>>> + * Abort in-progress task management commands and wakeup
>>>> + * waiting threads.
>>>> + *
>>>> + * Returns non-zero error value when failed to clear all the commands.
>>>> + */
>>>> +static int ufshcd_complete_pending_tasks(struct ufs_hba *hba)
>>>> +{
>>>> +	u32 reg;
>>>> +	int err = 0;
>>>> +	unsigned long flags;
>>>> +
>>>> +	if (!hba->outstanding_tasks)
>>>> +		goto out;
>>>> +
>>>> +	/* Clear UTMRL only when run-stop is enabled */
>>>> +	if (ufshcd_utmrl_is_rsr_enabled(hba))
>>>> +		ufshcd_writel(hba, ~hba->outstanding_tasks,
>>>> +				REG_UTP_TASK_REQ_LIST_CLEAR);
>>>> +
>>>> +	/* poll for max. 1 sec to clear door bell register by h/w */
>>>> +	reg = ufshcd_wait_for_register(hba,
>>>> +			REG_UTP_TASK_REQ_DOOR_BELL,
>>>> +			hba->outstanding_tasks, 0, 1000, 1000);
>>>> +	if (reg & hba->outstanding_tasks)
>>>> +		err = -ETIMEDOUT;
>>>> +
>>>> +	spin_lock_irqsave(hba->host->host_lock, flags);
>>>> +	/* complete commands that were cleared out */
>>>> +	ufshcd_tmc_handler(hba);
>>>> +	spin_unlock_irqrestore(hba->host->host_lock, flags);
>>>> +out:
>>>> +	if (err)
>>>> +		dev_err(hba->dev, "%s: failed, still pending = 0x%.8x\n",
>>>> +				__func__, reg);
>>>> +	return err;
>>>> +}
>>>> +
>>>> +/**
>>>> + * ufshcd_complete_pending_reqs - complete outstanding requests
>>>> + * @hba: per adapter instance
>>>> + *
>>>> + * Abort in-progress transfer request commands and return them to SCSI.
>>>> + *
>>>> + * Returns non-zero error value when failed to clear all the commands.
>>>> + */
>>>> +static int ufshcd_complete_pending_reqs(struct ufs_hba *hba)
>>>> +{
>>>> +	u32 reg;
>>>> +	int err = 0;
>>>> +	unsigned long flags;
>>>> +
>>>> +	/* check if we completed all of them */
>>>> +	if (!hba->outstanding_reqs)
>>>> +		goto out;
>>>> +
>>>> +	/* Clear UTRL only when run-stop is enabled */
>>>> +	if (ufshcd_utrl_is_rsr_enabled(hba))
>>>> +		ufshcd_writel(hba, ~hba->outstanding_reqs,
>>>> +				REG_UTP_TRANSFER_REQ_LIST_CLEAR);
>>>> +
>>>> +	/* poll for max. 1 sec to clear door bell register by h/w */
>>>> +	reg = ufshcd_wait_for_register(hba,
>>>> +			REG_UTP_TRANSFER_REQ_DOOR_BELL,
>>>> +			hba->outstanding_reqs, 0, 1000, 1000);
>>>> +	if (reg & hba->outstanding_reqs)
>>>> +		err = -ETIMEDOUT;
>>>> +
>>>> +	spin_lock_irqsave(hba->host->host_lock, flags);
>>>> +	/* complete commands that were cleared out */
>>>> +	ufshcd_transfer_req_compl(hba);
>>>> +	spin_unlock_irqrestore(hba->host->host_lock, flags);
>>>> +out:
>>>> +	if (err)
>>>> +		dev_err(hba->dev, "%s: failed, still pending = 0x%.8x\n",
>>>> +				__func__, reg);
>>>> +	return err;
>>>> +}
>>>> +
>>>> +/**
>>>>     * ufshcd_fatal_err_handler - handle fatal errors
>>>>     * @hba: per adapter instance
>>>>     */
>>>> @@ -2306,8 +2439,12 @@ static void ufshcd_err_handler(struct ufs_hba *hba)
>>>>    	}
>>>>    	return;
>>>>    fatal_eh:
>>>> -	hba->ufshcd_state = UFSHCD_STATE_ERROR;
>>>> -	schedule_work(&hba->feh_workq);
>>>> +	/* handle fatal errors only when link is functional */
>>>> +	if (hba->ufshcd_state == UFSHCD_STATE_OPERATIONAL) {
>>>> +		/* block commands at driver layer until error is handled */
>>>> +		hba->ufshcd_state = UFSHCD_STATE_ERROR;
>>> Locking omitted for ufshcd_state?
>> This is called in interrupt context with spin_lock held.
> Right, I missed it.
> 
>>
>>>
>>>> +		schedule_work(&hba->feh_workq);
>>>> +	}
>>>>    }
>>>>
>>>>    /**
>>>> @@ -2475,75 +2612,155 @@ static int ufshcd_issue_tm_cmd(struct ufs_hba *hba, int lun_id, int
>> task_id,
>>>>    }
>>>>
>>>>    /**
>>>> - * ufshcd_device_reset - reset device and abort all the pending commands
>>>> - * @cmd: SCSI command pointer
>>>> + * ufshcd_dme_end_point_reset - Notify device Unipro to perform reset
>>>> + * @hba: per adapter instance
>>>>     *
>>>> - * Returns SUCCESS/FAILED
>>>> + * UIC_CMD_DME_END_PT_RST resets the UFS device completely, the UFS flags,
>>>> + * attributes and descriptors are reset to default state. Callers are
>>>> + * expected to initialize the whole device again after this.
>>>> + *
>>>> + * Returns zero on success, non-zero on failure
>>>>     */
>>>> -static int ufshcd_device_reset(struct scsi_cmnd *cmd)
>>>> +static int ufshcd_dme_end_point_reset(struct ufs_hba *hba)
>>>>    {
>>>> -	struct Scsi_Host *host;
>>>> -	struct ufs_hba *hba;
>>>> -	unsigned int tag;
>>>> -	u32 pos;
>>>> -	int err;
>>>> -	u8 resp;
>>>> -	struct ufshcd_lrb *lrbp;
>>>> +	struct uic_command uic_cmd = {0};
>>>> +	int ret;
>>>>
>>>> -	host = cmd->device->host;
>>>> -	hba = shost_priv(host);
>>>> -	tag = cmd->request->tag;
>>>> +	uic_cmd.command = UIC_CMD_DME_END_PT_RST;
>>>>
>>>> -	lrbp = &hba->lrb[tag];
>>>> -	err = ufshcd_issue_tm_cmd(hba, lrbp->lun, lrbp->task_tag,
>>>> -			UFS_LOGICAL_RESET, &resp);
>>>> -	if (err || resp != UPIU_TASK_MANAGEMENT_FUNC_COMPL) {
>>>> -		err = FAILED;
>>>> +	ret = ufshcd_send_uic_cmd(hba, &uic_cmd);
>>>> +	if (ret)
>>>> +		dev_err(hba->dev, "%s: error code %d\n", __func__, ret);
>>>> +
>>>> +	return ret;
>>>> +}
>>>> +
>>>> +/**
>>>> + * ufshcd_dme_reset - Local UniPro reset
>>>> + * @hba: per adapter instance
>>>> + *
>>>> + * Returns zero on success, non-zero on failure
>>>> + */
>>>> +static int ufshcd_dme_reset(struct ufs_hba *hba)
>>>> +{
>>>> +	struct uic_command uic_cmd = {0};
>>>> +	int ret;
>>>> +
>>>> +	uic_cmd.command = UIC_CMD_DME_RESET;
>>>> +
>>>> +	ret = ufshcd_send_uic_cmd(hba, &uic_cmd);
>>>> +	if (ret)
>>>> +		dev_err(hba->dev, "%s: error code %d\n", __func__, ret);
>>>> +
>>>> +	return ret;
>>>> +
>>>> +}
>>>> +
>>>> +/**
>>>> + * ufshcd_dme_enable - Local UniPro DME Enable
>>>> + * @hba: per adapter instance
>>>> + *
>>>> + * Returns zero on success, non-zero on failure
>>>> + */
>>>> +static int ufshcd_dme_enable(struct ufs_hba *hba)
>>>> +{
>>>> +	struct uic_command uic_cmd = {0};
>>>> +	int ret;
>>>> +	uic_cmd.command = UIC_CMD_DME_ENABLE;
>>>> +
>>>> +	ret = ufshcd_send_uic_cmd(hba, &uic_cmd);
>>>> +	if (ret)
>>>> +		dev_err(hba->dev, "%s: error code %d\n", __func__, ret);
>>>> +
>>>> +	return ret;
>>>> +
>>>> +}
>>>> +
>>>> +/**
>>>> + * ufshcd_device_reset_and_restore - reset and restore device
>>>> + * @hba: per-adapter instance
>>>> + *
>>>> + * Note that the device reset issues DME_END_POINT_RESET which
>>>> + * may reset entire device and restore device attributes to
>>>> + * default state.
>>>> + *
>>>> + * Returns zero on success, non-zero on failure
>>>> + */
>>>> +static int ufshcd_device_reset_and_restore(struct ufs_hba *hba)
>>>> +{
>>>> +	int err = 0;
>>>> +	u32 reg;
>>>> +
>>>> +	err = ufshcd_dme_end_point_reset(hba);
>>>> +	if (err)
>>>> +		goto out;
>>>> +
>>>> +	/* restore communication with the device */
>>>> +	err = ufshcd_dme_reset(hba);
>>>> +	if (err)
>>>>    		goto out;
>>>> -	} else {
>>>> -		err = SUCCESS;
>>>> -	}
>>>>
>>>> -	for (pos = 0; pos < hba->nutrs; pos++) {
>>>> -		if (test_bit(pos, &hba->outstanding_reqs) &&
>>>> -		    (hba->lrb[tag].lun == hba->lrb[pos].lun)) {
>>>> +	err = ufshcd_dme_enable(hba);
>>>> +	if (err)
>>>> +		goto out;
>>>>
>>>> -			/* clear the respective UTRLCLR register bit */
>>>> -			ufshcd_utrl_clear(hba, pos);
>>>> +	err = ufshcd_dme_link_startup(hba);
>>> UFS_LOGICAL_RESET is no more used?
>>
>> Yes, I don't see any use for this as of now (given that we are using
>> dme_end_point_reset, refer to figure. 7.4 of UFS 1.1 spec). Also, the
>> UFS spec. error handling section doesn't mention anything about
>> LOGICAL_RESET. If you know a valid use case where we need to have LUN
>> reset, please let me know I will bring it back.
> As refered the scsi-mid layer and other host's implementation,
> eh_device_reset_handler(= ufshcd_eh_device_reset_handler) may
> have a role of LOGICAL_RESET for specific lun.

I am still not convinced why we need LOGICAL_RESET. Just because other
SCSI host drivers have it do we really need it for UFS?

> I found that ENDPOINT_RESET is recommended with IS.DFES in spec.

Here in this case, a command hang (scsi timeout) is considered as Device
Fatal Error. If there are some LUN failures the response would still be
transferred but with Unit-Attention condition with sense data. However,
if the command itself hangs, there is something seriously wrong with the
device or the communication. So we first try to reset the device and
then the host. Unlike most of other SCSI HBAs, UFS is point-to-point
(host <--> device) link and if something goes wrong and caused a hang,
mostly would be a serious error and logical unit reset wouldn't help
much.

> 
> Let me add some comments additionally.
> Both 'ufshcd_eh_device_reset_handler' and 'ufshcd_host_reset_and_restore' do almost same things.
> At a glance, it's confused about their role and It is mixed.
> 'ufshcd_reset_and_restore' is eventually called, which is actual part of reset functionality; Once device reset is failed, then
> host reset is tried.
> Actually, that is being handled for each level of error recovery in scsi mid-layer. Please chekc 'drivers/scsi/scsi_error.c'.
> [scsi_eh_ready_devs, scsi_abort_eh_cmnd]
> In this stage, each reset functionality could be separated obviously.

Yes, in that case we are optimistically doing the host reset twice,
just a hope that it recovers before SCSI layer choke and mark the
device as OFFLINE. If you think that this shouldn't be the case and
have a valid reason for not doing so, I will return appropriate error
in the case device reset fails.

> 
>>
>>> ufshcd_device_reset_and_restore have a role of device reset.
>>> Both ufshcd_dme_reset and ufshcd_dme_enable are valid for local one, not for remote.
>>> Should we do those for host including link-startup here?
>>
>> Yes, it is needed. After DME_ENDPOINT_RESET the remote link goes into link down state.
> I want to know more related description. I didn't find it. Could you point that?

Please refer to "Table 121 DME_SAP restrictions" of MIPI Uni-Pro spec.
The spec. doesn't mention about this explicitly but here is the logic
that is derived from the spec.
1) The DME_LINKSTARTUP can be sent only when the link is in down state,
in all other states DME_LINKSTARTUP is ignored.
2) So if we are sending DME_ENDPOINT_RESET then that must ensure that
remote link is in down state, and hence it can receive linkstartup and
establish the communication.

> 
>> To initialize the link, the host needs to send
>> DME_LINKSTARTUP, but according to Uni-Pro spec. the link-startup can
>> only be sent when the local uni-pro is in link-down state. So first
> If it's right you mentioned above, uni-pro state is already in link-down after DME_ENDPOINT_RESET.
> Then, DME_RESET isn't needed.

You are getting confused here -

- State1: before sending DME_ENDPOINT_RESET
	Local Unipro (host) - Link-UP
	Remote Unipro (device) - Link-Up

- State2: after sending DME_ENDPOINT_RESET
	Local Unipro (host) - Link-UP
	Remote Unipro (device) - Link-Down

- State3: After sending DME_RESET+DME_ENABLE
	Local Unipro (host) - Link-Down
	Remote Unipro (device) - Link-Down

- State4: After sending DME_LINKSTARTUP
	Local Unipro(host) - Link-up
	Remote Unipro (device) - Link-up

The local unipro ignores the DME_LINKSTARTUP if we send it before
DME_RESET.

> 
>> we need to get the local unipro from link-up to disabled to link-down
>> using the DME_RESET and DME_ENABLE commands and then issue
>> DME_LINKSTARTUP to re-initialize the link.
> 'ufshcd_hba_enable' can be used instead of both if these are really needed.
> This will do dme_reset and dme_enable.
> 

The only reason for this is that in some implementations the HCE reset
also resets UTP layer in addition to Uni-Pro layer. There is no need
of UTP layer reset for device reset. So explicit DME_RESET and
DME_ENABLE is used. For those implementations which don't do UTP layer
reset then the advantage is instead of wasting CPU cycles in polling for
HCE=1 we depend on UIC interrupts.

-- 
Regards,
Sujit
--
To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html