On 27/09/22 21:43, Bart Van Assche wrote: > The following deadlock has been observed on multiple test setups: > * ufshcd_wl_suspend() is waiting for blk_execute_rq() to complete while it > holds host_sem. > * ufshcd_eh_host_reset_handler() invokes ufshcd_err_handler() and the > latter function tries to obtain host_sem. > This is a deadlock because blk_execute_rq() can't execute SCSI commands > while the host is in the SHOST_RECOVERY state and because the error > handler cannot make progress either. > > Fix this deadlock as follows: > * Fail attempts to suspend the system while the SCSI error handler is in > progress. > * If the system is suspending and a START STOP UNIT command times out, > handle the SCSI command timeout from inside the context of the SCSI > timeout handler instead of activating the SCSI error handler. > * Reduce the START STOP UNIT command timeout to one second since on > Android devices a kernel panic is triggered if an attempt to suspend > the system takes more than 20 seconds. One second should be enough for > the START STOP UNIT command since this command completes in less than > a millisecond for the UFS devices I have access to. > > The runtime power management code is not affected by this deadlock since > hba->host_sem is not touched by the runtime power management functions > in the UFS driver. > > Signed-off-by: Bart Van Assche <bvanassche@xxxxxxx> > --- > drivers/ufs/core/ufshcd.c | 51 ++++++++++++++++++++++++++++++++++++++- > 1 file changed, 50 insertions(+), 1 deletion(-) > > diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c > index 5507d93a4bba..010a5d1b984b 100644 > --- a/drivers/ufs/core/ufshcd.c > +++ b/drivers/ufs/core/ufshcd.c > @@ -8295,6 +8295,54 @@ static void ufshcd_async_scan(void *data, async_cookie_t cookie) > } > } > > +static enum scsi_timeout_action ufshcd_eh_timed_out(struct scsi_cmnd *scmd) > +{ > + struct ufs_hba *hba = shost_priv(scmd->device->host); > + bool reset_controller = false; > + int tag, ret; > + > + if (!hba->system_suspending) { > + /* Activate the error handler in the SCSI core. */ > + return SCSI_EH_NOT_HANDLED; > + } > + > + /* > + * Handle errors directly to prevent a deadlock between > + * ufshcd_set_dev_pwr_mode() and ufshcd_err_handler(). > + */ > + for_each_set_bit(tag, &hba->outstanding_reqs, hba->nutrs) { > + ret = ufshcd_try_to_abort_task(hba, tag); > + dev_info(hba->dev, "Aborting tag %d / CDB %#02x %s\n", tag, > + hba->lrb[tag].cmd ? hba->lrb[tag].cmd->cmnd[0] : -1, > + ret == 0 ? "succeeded" : "failed"); > + if (ret != 0) { > + reset_controller = true; > + break; > + } > + } > + for_each_set_bit(tag, &hba->outstanding_tasks, hba->nutmrs) { > + ret = ufshcd_clear_tm_cmd(hba, tag); > + dev_info(hba->dev, "Aborting TMF %d %s\n", tag, > + ret == 0 ? "succeeded" : "failed"); > + if (ret != 0) { > + reset_controller = true; > + break; > + } > + } > + if (reset_controller) { > + dev_info(hba->dev, "Resetting controller\n"); > + ufshcd_reset_and_restore(hba); > + if (ufshcd_clear_cmds(hba, 0xffffffff)) > + dev_err(hba->dev, > + "Clearing outstanding commands failed\n"); > + } > + ufshcd_complete_requests(hba); > + dev_info(hba->dev, "%s() finished; outstanding_tasks = %#lx.\n", > + __func__, hba->outstanding_tasks); Would it be possible to reuse the existing error handler rather than creating a mini version? > + > + return hba->outstanding_tasks ? SCSI_EH_RESET_TIMER : SCSI_EH_DONE; > +} > + > static const struct attribute_group *ufshcd_driver_groups[] = { > &ufs_sysfs_unit_descriptor_group, > &ufs_sysfs_lun_attributes_group, > @@ -8329,6 +8377,7 @@ static struct scsi_host_template ufshcd_driver_template = { > .eh_abort_handler = ufshcd_abort, > .eh_device_reset_handler = ufshcd_eh_device_reset_handler, > .eh_host_reset_handler = ufshcd_eh_host_reset_handler, > + .eh_timed_out = ufshcd_eh_timed_out, > .this_id = -1, > .sg_tablesize = SG_ALL, > .cmd_per_lun = UFSHCD_CMD_PER_LUN, > @@ -8783,7 +8832,7 @@ static int ufshcd_set_dev_pwr_mode(struct ufs_hba *hba, > */ > for (retries = 3; retries > 0; --retries) { > ret = scsi_execute(sdp, cmd, DMA_NONE, NULL, 0, NULL, &sshdr, > - START_STOP_TIMEOUT, 0, 0, RQF_PM, NULL); > + 1 * HZ, 0, REQ_FAILFAST_DEV, RQF_PM, NULL); > if (ret < 0) > break; > if (host_byte(ret) == 0 && scsi_status_is_good(ret))