On 23/09/22 23:11, Bart Van Assche wrote: > The following deadlock has been observed on multiple test setups: > * ufshcd_wl_suspend() is waiting for blk_execute_rq() to complete while it > holds host_sem. > * ufshcd_eh_host_reset_handler() invokes ufshcd_err_handler() and the > latter function tries to obtain host_sem. > This is a deadlock because blk_execute_rq() can't execute SCSI commands > while the host is in the SHOST_RECOVERY state and because the error > handler cannot make progress either. > > Fix this deadlock by calling the UFS error handler directly during system > suspend instead of relying on the error handling mechanism in the SCSI > core. > > Signed-off-by: Bart Van Assche <bvanassche@xxxxxxx> > --- > drivers/ufs/core/ufshcd.c | 43 ++++++++++++++++++++++++++++++++++++--- > 1 file changed, 40 insertions(+), 3 deletions(-) > > diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c > index abeb120b12eb..996641fc1176 100644 > --- a/drivers/ufs/core/ufshcd.c > +++ b/drivers/ufs/core/ufshcd.c > @@ -6405,9 +6405,19 @@ static void ufshcd_err_handler(struct work_struct *work) > { > struct ufs_hba *hba = container_of(work, struct ufs_hba, eh_work); > > - down(&hba->host_sem); > - ufshcd_recover_link(hba); > - up(&hba->host_sem); > + /* > + * Serialize suspend/resume and error handling because a deadlock > + * occurs if the error handler runs concurrently with > + * ufshcd_set_dev_pwr_mode(). > + */ > + if (mutex_trylock(&system_transition_mutex)) { This is effectively disabling the UFS driver's error handler work. It would be better to add the ability to do that explicitly within the UFS driver and avoid using system_transition_mutex. > + down(&hba->host_sem); > + ufshcd_recover_link(hba); > + up(&hba->host_sem); > + mutex_unlock(&system_transition_mutex); > + } else { > + pr_info("%s: suspend or resume is ongoing\n", __func__); > + } > } > > /** > @@ -8298,6 +8308,25 @@ static void ufshcd_async_scan(void *data, async_cookie_t cookie) > } > } > > +static enum scsi_timeout_action ufshcd_eh_timed_out(struct scsi_cmnd *scmd) > +{ > + struct ufs_hba *hba = shost_priv(scmd->device->host); > + > + if (!hba->system_suspending) { Is a PM notifier needed - couldn't hba->system_suspending have been set in ufshcd_wl_suspend() ? Doesn't resume have the same issue ? > + /* Apply the SCSI core error handling strategy. */ > + return SCSI_EH_NOT_HANDLED; > + } > + > + /* > + * Fail START STOP UNIT commands without activating the SCSI error > + * handler to prevent a deadlock between ufshcd_set_dev_pwr_mode() and > + * ufshcd_err_handler(). > + */ > + set_host_byte(scmd, DID_TIME_OUT); > + scsi_done(scmd); > + return SCSI_EH_DONE; > +} > + > static const struct attribute_group *ufshcd_driver_groups[] = { > &ufs_sysfs_unit_descriptor_group, > &ufs_sysfs_lun_attributes_group, > @@ -8332,6 +8361,7 @@ static struct scsi_host_template ufshcd_driver_template = { > .eh_abort_handler = ufshcd_abort, > .eh_device_reset_handler = ufshcd_eh_device_reset_handler, > .eh_host_reset_handler = ufshcd_eh_host_reset_handler, > + .eh_timed_out = ufshcd_eh_timed_out, > .this_id = -1, > .sg_tablesize = SG_ALL, > .cmd_per_lun = UFSHCD_CMD_PER_LUN, > @@ -8791,6 +8821,13 @@ static int ufshcd_set_dev_pwr_mode(struct ufs_hba *hba, > break; > if (host_byte(ret) == 0 && scsi_status_is_good(ret)) > break; > + /* > + * Calling the error handler directly when suspending or > + * resuming the system since the callers of this function hold > + * hba->host_sem in that case. Runtime PM doesn't hold host_sem > + */ > + if (host_byte(ret) != 0 && hba->system_suspending) > + ufshcd_recover_link(hba); > } > if (ret) { > sdev_printk(KERN_WARNING, sdp,