[PATCH 8/8] scsi: ufs: Fix deadlock between power management and error handler

Bart Van Assche <bvanassche@xxxxxxx> · Fri, 23 Sep 2022 13:11:38 -0700

The following deadlock has been observed on multiple test setups:
* ufshcd_wl_suspend() is waiting for blk_execute_rq() to complete while it
  holds host_sem.
* ufshcd_eh_host_reset_handler() invokes ufshcd_err_handler() and the
  latter function tries to obtain host_sem.
This is a deadlock because blk_execute_rq() can't execute SCSI commands
while the host is in the SHOST_RECOVERY state and because the error
handler cannot make progress either.

Fix this deadlock by calling the UFS error handler directly during system
suspend instead of relying on the error handling mechanism in the SCSI
core.

Signed-off-by: Bart Van Assche <bvanassche@xxxxxxx>
---
 drivers/ufs/core/ufshcd.c | 43 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 40 insertions(+), 3 deletions(-)

diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index abeb120b12eb..996641fc1176 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -6405,9 +6405,19 @@ static void ufshcd_err_handler(struct work_struct *work)
 {
 	struct ufs_hba *hba = container_of(work, struct ufs_hba, eh_work);
 
-	down(&hba->host_sem);
-	ufshcd_recover_link(hba);
-	up(&hba->host_sem);
+	/*
+	 * Serialize suspend/resume and error handling because a deadlock
+	 * occurs if the error handler runs concurrently with
+	 * ufshcd_set_dev_pwr_mode().
+	 */
+	if (mutex_trylock(&system_transition_mutex)) {
+		down(&hba->host_sem);
+		ufshcd_recover_link(hba);
+		up(&hba->host_sem);
+		mutex_unlock(&system_transition_mutex);
+	} else {
+		pr_info("%s: suspend or resume is ongoing\n", __func__);
+	}
 }
 
 /**
@@ -8298,6 +8308,25 @@ static void ufshcd_async_scan(void *data, async_cookie_t cookie)
 	}
 }
 
+static enum scsi_timeout_action ufshcd_eh_timed_out(struct scsi_cmnd *scmd)
+{
+	struct ufs_hba *hba = shost_priv(scmd->device->host);
+
+	if (!hba->system_suspending) {
+		/* Apply the SCSI core error handling strategy. */
+		return SCSI_EH_NOT_HANDLED;
+	}
+
+	/*
+	 * Fail START STOP UNIT commands without activating the SCSI error
+	 * handler to prevent a deadlock between ufshcd_set_dev_pwr_mode() and
+	 * ufshcd_err_handler().
+	 */
+	set_host_byte(scmd, DID_TIME_OUT);
+	scsi_done(scmd);
+	return SCSI_EH_DONE;
+}
+
 static const struct attribute_group *ufshcd_driver_groups[] = {
 	&ufs_sysfs_unit_descriptor_group,
 	&ufs_sysfs_lun_attributes_group,
@@ -8332,6 +8361,7 @@ static struct scsi_host_template ufshcd_driver_template = {
 	.eh_abort_handler	= ufshcd_abort,
 	.eh_device_reset_handler = ufshcd_eh_device_reset_handler,
 	.eh_host_reset_handler   = ufshcd_eh_host_reset_handler,
+	.eh_timed_out		= ufshcd_eh_timed_out,
 	.this_id		= -1,
 	.sg_tablesize		= SG_ALL,
 	.cmd_per_lun		= UFSHCD_CMD_PER_LUN,
@@ -8791,6 +8821,13 @@ static int ufshcd_set_dev_pwr_mode(struct ufs_hba *hba,
 			break;
 		if (host_byte(ret) == 0 && scsi_status_is_good(ret))
 			break;
+		/*
+		 * Calling the error handler directly when suspending or
+		 * resuming the system since the callers of this function hold
+		 * hba->host_sem in that case.
+		 */
+		if (host_byte(ret) != 0 && hba->system_suspending)
+			ufshcd_recover_link(hba);
 	}
 	if (ret) {
 		sdev_printk(KERN_WARNING, sdp,