Re: [PATCH v3 3/4] sd: Make synchronize cache upon shutdown asynchronous

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, 2017-04-20 at 15:13 -0700, James Bottomley wrote:
> On Thu, 2017-04-20 at 21:59 +0000, Bart Van Assche wrote:
> > This approach cannot work. A scsi_target_block() call by the 
> > transport layer can happen concurrently with the 
> > __scsi_remove_device() call and hence can occur at any time between 
> > the scsi_start_queue() call by __scsi_remove_device() and the 
> > sd_shutdown() call, resulting in a deadlock.
> 
> How is that possible?  Once the device goes into the CANCEL state, it
> no longer can be found by starget_for_each_device() because
> scsi_device_get() returns NULL ... unless you also have a patch
> altering that?

No changes were made in the SCSI core other than the attached two patches.
I'm not sure about the root cause, but every time I simulated a transport
layer failure before trying to remove the ib_srp kernel module, I ran into
a deadlock (see also the call traces below). Inspecting the lsscsi
output and /sys/kernel/debug/block showed me that both queues involved in
the deadlock were stopped.

sysrq: SysRq : Show Blocked State
  task                        PC stack   pid father
kworker/11:3    D    0  2910      2 0x00000000
Workqueue: srp_remove srp_remove_work [ib_srp]
Call Trace:
 __schedule+0x3df/0xc10
 schedule+0x3d/0x90
 schedule_timeout+0x234/0x4b0
 io_schedule_timeout+0x1e/0x50
 wait_for_completion_io_timeout+0x118/0x180
 blk_execute_rq+0x8e/0xc0
 scsi_execute+0xe7/0x200
 sd_sync_cache+0x8a/0x170
 sd_shutdown+0x5f/0xe0
 sd_remove+0x63/0xc0
 device_release_driver_internal+0x13f/0x1e0
 device_release_driver+0x12/0x20
 bus_remove_device+0x114/0x190
 device_del+0x205/0x320
 __scsi_remove_device+0x132/0x140
 scsi_forget_host+0x60/0x70
 scsi_remove_host+0x71/0x110
 srp_remove_work+0x90/0x220 [ib_srp]
 process_one_work+0x20b/0x6a0
 worker_thread+0x4e/0x4a0
 kthread+0x113/0x150
 ret_from_fork+0x2e/0x40
kworker/4:3     D    0  2913      2 0x00000000
Workqueue: srp_remove srp_remove_work [ib_srp]
Call Trace:
 __schedule+0x3df/0xc10
 schedule+0x3d/0x90
 schedule_timeout+0x234/0x4b0
 io_schedule_timeout+0x1e/0x50
 wait_for_completion_io_timeout+0x118/0x180
 blk_execute_rq+0x8e/0xc0
 scsi_execute+0xe7/0x200
 sd_sync_cache+0x8a/0x170
 sd_shutdown+0x5f/0xe0
 sd_remove+0x63/0xc0
 device_release_driver_internal+0x13f/0x1e0
 device_release_driver+0x12/0x20
 bus_remove_device+0x114/0x190
 device_del+0x205/0x320
 __scsi_remove_device+0x132/0x140
 scsi_forget_host+0x60/0x70
 scsi_remove_host+0x71/0x110
 srp_remove_work+0x90/0x220 [ib_srp]
 process_one_work+0x20b/0x6a0
 worker_thread+0x4e/0x4a0
 kthread+0x113/0x150
 ret_from_fork+0x2e/0x40
modprobe        D    0  2916   2218 0x00000000
Call Trace:
 __schedule+0x3df/0xc10
 schedule+0x3d/0x90
 schedule_timeout+0x273/0x4b0
 wait_for_completion+0x108/0x170
 flush_workqueue+0x207/0x720
 srp_remove_one+0xbe/0x110 [ib_srp]
 ib_unregister_client+0x18f/0x200 [ib_core]
 srp_cleanup_module+0x10/0x618 [ib_srp]
 SyS_delete_module+0x198/0x1f0
 entry_SYSCALL_64_fastpath+0x18/0xad

Bart.
From c395ce2aaf6d8a644311f4c55dfa6aa560a93240 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@xxxxxxxxxxx>
Date: Tue, 28 Mar 2017 14:00:17 -0700
Subject: [PATCH 1/2] Introduce scsi_start_queue()

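Move the code that restarts a stopped request queue out of
scsi_internal_device_unblock() into a new helper, scsi_start_queue().
This patch does not change any functionality.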

Signed-off-by: Bart Van Assche <bart.vanassche@xxxxxxxxxxx>
Cc: Israel Rukshin <israelr@xxxxxxxxxxxx>
Cc: Max Gurtovoy <maxg@xxxxxxxxxxxx>
Cc: Hannes Reinecke <hare@xxxxxxx>
Cc: Benjamin Block <bblock@xxxxxxxxxxxxxxxxxx>
---
 drivers/scsi/scsi_lib.c  | 25 +++++++++++++++----------
 drivers/scsi/scsi_priv.h |  1 +
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index eecc005099b2..ffa6e61299a9 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -2987,6 +2987,20 @@ scsi_internal_device_block(struct scsi_device *sdev, bool wait)
 }
 EXPORT_SYMBOL_GPL(scsi_internal_device_block);
  
+void scsi_start_queue(struct scsi_device *sdev)
+{
+	struct request_queue *q = sdev->request_queue;
+	unsigned long flags;
+
+	if (q->mq_ops) {
+		blk_mq_start_stopped_hw_queues(q, false);
+	} else {
+		spin_lock_irqsave(q->queue_lock, flags);
+		blk_start_queue(q);
+		spin_unlock_irqrestore(q->queue_lock, flags);
+	}
+}
+
 /**
  * scsi_internal_device_unblock - resume a device after a block request
  * @sdev:	device to resume
@@ -3007,9 +3021,6 @@ int
 scsi_internal_device_unblock(struct scsi_device *sdev,
 			     enum scsi_device_state new_state)
 {
-	struct request_queue *q = sdev->request_queue; 
-	unsigned long flags;
-
 	/*
 	 * Try to transition the scsi device to SDEV_RUNNING or one of the
 	 * offlined states and goose the device queue if successful.
@@ -3027,13 +3038,7 @@ scsi_internal_device_unblock(struct scsi_device *sdev,
 		 sdev->sdev_state != SDEV_OFFLINE)
 		return -EINVAL;
 
-	if (q->mq_ops) {
-		blk_mq_start_stopped_hw_queues(q, false);
-	} else {
-		spin_lock_irqsave(q->queue_lock, flags);
-		blk_start_queue(q);
-		spin_unlock_irqrestore(q->queue_lock, flags);
-	}
+	scsi_start_queue(sdev);
 
 	return 0;
 }
diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
index f11bd102d6d5..c7629e31a75b 100644
--- a/drivers/scsi/scsi_priv.h
+++ b/drivers/scsi/scsi_priv.h
@@ -89,6 +89,7 @@ extern void scsi_run_host_queues(struct Scsi_Host *shost);
 extern void scsi_requeue_run_queue(struct work_struct *work);
 extern struct request_queue *scsi_alloc_queue(struct scsi_device *sdev);
 extern struct request_queue *scsi_mq_alloc_queue(struct scsi_device *sdev);
+extern void scsi_start_queue(struct scsi_device *sdev);
 extern int scsi_mq_setup_tags(struct Scsi_Host *shost);
 extern void scsi_mq_destroy_tags(struct Scsi_Host *shost);
 extern int scsi_init_queue(void);
-- 
2.12.2

From fa69092d22f4f58ede7d37e68148a4aa4615d2ab Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@xxxxxxxxxxx>
Date: Tue, 18 Apr 2017 10:11:02 -0700
Subject: [PATCH 2/2] Make __scsi_remove_device go straight from BLOCKED to DEL

If a device is blocked, make __scsi_remove_device() cause it to
transition to the DEL state. This means that all the commands
issued in .shutdown() will fail in the mid-layer, allowing the
removal to proceed instead of hanging on a stopped queue.

This patch is a slightly modified version of a patch from James
Bottomley.

Signed-off-by: Bart Van Assche <bart.vanassche@xxxxxxxxxxx>
Cc: James Bottomley <James.Bottomley@xxxxxxxxxxxxxxxxxxxxx>
Cc: Israel Rukshin <israelr@xxxxxxxxxxxx>
Cc: Max Gurtovoy <maxg@xxxxxxxxxxxx>
Cc: Hannes Reinecke <hare@xxxxxxx>
Cc: Benjamin Block <bblock@xxxxxxxxxxxxxxxxxx>
---
 drivers/scsi/scsi_lib.c   |  2 +-
 drivers/scsi/scsi_sysfs.c | 19 ++++++++++++++++---
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index ffa6e61299a9..376cd1da102c 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -2611,7 +2611,6 @@ scsi_device_set_state(struct scsi_device *sdev, enum scsi_device_state state)
 		case SDEV_QUIESCE:
 		case SDEV_OFFLINE:
 		case SDEV_TRANSPORT_OFFLINE:
-		case SDEV_BLOCK:
 			break;
 		default:
 			goto illegal;
@@ -2625,6 +2624,7 @@ scsi_device_set_state(struct scsi_device *sdev, enum scsi_device_state state)
 		case SDEV_OFFLINE:
 		case SDEV_TRANSPORT_OFFLINE:
 		case SDEV_CANCEL:
+		case SDEV_BLOCK:
 		case SDEV_CREATED_BLOCK:
 			break;
 		default:
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index 82dfe07b1d47..5b03e58bdd67 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -1282,16 +1282,29 @@ void __scsi_remove_device(struct scsi_device *sdev)
 		return;
 
 	if (sdev->is_visible) {
-		if (scsi_device_set_state(sdev, SDEV_CANCEL) != 0)
-			return;
+		/*
+		 * If blocked, we go straight to DEL and restart the queue so
+		 * any commands issued during driver shutdown (like sync
+		 * cache) are errored immediately.
+		 */
+		if (scsi_device_set_state(sdev, SDEV_CANCEL) != 0) {
+			if (scsi_device_set_state(sdev, SDEV_DEL) != 0)
+				return;
+
+			scsi_start_queue(sdev);
+			sdev_printk(KERN_DEBUG, sdev,
+				    "Changed state from BLOCKED to DEL\n");
+		}
 
 		bsg_unregister_queue(sdev->request_queue);
 		device_unregister(&sdev->sdev_dev);
 		transport_remove_device(dev);
 		scsi_dh_remove_device(sdev);
 		device_del(dev);
-	} else
+	} else {
 		put_device(&sdev->sdev_dev);
+		scsi_start_queue(sdev);
+	}
 
 	/*
 	 * Stop accepting new requests and wait until all queuecommand() and
-- 
2.12.2


[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Index of Archives]     [SCSI Target Devel]     [Linux SCSI Target Infrastructure]     [Kernel Newbies]     [IDE]     [Security]     [Git]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux ATA RAID]     [Linux IIO]     [Samba]     [Device Mapper]

  Powered by Linux