On 03/10/2021 09:47, Avri Altman wrote: >> Callers of ufshcd_err_handler() expect it to return in an operational >> state. However, the code does not check the state before exiting. >> >> Add a check for the state and perform retries until either success or the >> maximum number of retries is reached. >> >> Signed-off-by: Adrian Hunter <adrian.hunter@xxxxxxxxx> >> --- >> drivers/scsi/ufs/ufshcd.c | 30 +++++++++++++++++++++++++----- >> 1 file changed, 25 insertions(+), 5 deletions(-) >> >> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c >> index 16492779d3a6..33f55ecf43de 100644 >> --- a/drivers/scsi/ufs/ufshcd.c >> +++ b/drivers/scsi/ufs/ufshcd.c >> @@ -64,6 +64,9 @@ >> /* maximum number of reset retries before giving up */ >> #define MAX_HOST_RESET_RETRIES 5 >> >> +/* Maximum number of error handler retries before giving up */ >> +#define MAX_ERR_HANDLER_RETRIES 5 >> + >> /* Expose the flag value from utp_upiu_query.value */ >> #define MASK_QUERY_UPIU_FLAG_LOC 0xFF >> >> @@ -6070,12 +6073,14 @@ static bool >> ufshcd_is_pwr_mode_restore_needed(struct ufs_hba *hba) >> static void ufshcd_err_handler(struct Scsi_Host *host) >> { >> struct ufs_hba *hba = shost_priv(host); >> + int retries = MAX_ERR_HANDLER_RETRIES; >> unsigned long flags; >> - bool err_xfer = false; >> - bool err_tm = false; >> - int err = 0, pmc_err; >> - int tag; >> - bool needs_reset = false, needs_restore = false; >> + bool needs_restore; >> + bool needs_reset; >> + bool err_xfer; >> + bool err_tm; >> + int pmc_err; >> + int tag; >> >> down(&hba->host_sem); >> spin_lock_irqsave(hba->host->host_lock, flags); >> @@ -6093,6 +6098,12 @@ static void ufshcd_err_handler(struct Scsi_Host >> *host) >> /* Complete requests that have door-bell cleared by h/w */ >> ufshcd_complete_requests(hba); >> spin_lock_irqsave(hba->host->host_lock, flags); >> +again: >> + needs_restore = false; >> + needs_reset = false; >> + err_xfer = false; >> + err_tm = false; >> + >> if (hba->ufshcd_state != 
UFSHCD_STATE_ERROR) >> hba->ufshcd_state = UFSHCD_STATE_RESET; >> /* >> @@ -6213,6 +6224,8 @@ static void ufshcd_err_handler(struct Scsi_Host >> *host) >> do_reset: >> /* Fatal errors need reset */ >> if (needs_reset) { >> + int err; >> + >> hba->force_reset = false; >> spin_unlock_irqrestore(hba->host->host_lock, flags); >> err = ufshcd_reset_and_restore(hba); >> @@ -6232,6 +6245,13 @@ static void ufshcd_err_handler(struct Scsi_Host >> *host) >> dev_err_ratelimited(hba->dev, "%s: exit: saved_err 0x%x >> saved_uic_err 0x%x", >> __func__, hba->saved_err, hba->saved_uic_err); >> } >> + /* Exit in an operational state or dead */ >> + if (hba->ufshcd_state != UFSHCD_STATE_OPERATIONAL && >> + hba->ufshcd_state != UFSHCD_STATE_ERROR) { >> + if (--retries) >> + goto again; > Why do you need to retry here as well? Thanks for looking at this. It shouldn't hurt to retry bringing the device back to life. The alternative is UFSHCD_STATE_ERROR, which means dead. > ufshcd_reset_and_restore() already exits only if operational or dead? ufshcd_reset_and_restore() isn't the only path. There are also ufshcd_quirk_dl_nac_errors() and ufshcd_config_pwr_mode() and in the future perhaps others. This seems like the right place to ensure that the error handler guarantees operational (or dead) status. > > Thanks, > Avri > >> + hba->ufshcd_state = UFSHCD_STATE_ERROR; >> + } >> ufshcd_clear_eh_in_progress(hba); >> spin_unlock_irqrestore(hba->host->host_lock, flags); >> ufshcd_err_handling_unprepare(hba); >> -- >> 2.25.1 >