We hit a stuck sync thread, with the following stack:

[<0>] raise_barrier+0x90/0x180
[<0>] raid10_sync_request+0x715/0x1d80
[<0>] md_do_sync+0x983/0xfa0
[<0>] md_thread+0x11c/0x160
[<0>] kthread+0x111/0x130
[<0>] ret_from_fork+0x35/0x40
[<0>] 0xffffffffffffffff

At the same time, an mdadm thread (mdadm --manage /dev/md2 --add /dev/sda) is stuck trying to stop the sync thread:

[<0>] kthread_stop+0x42/0xf0
[<0>] md_unregister_thread+0x3a/0x70
[<0>] md_reap_sync_thread+0x15/0x160
[<0>] action_store+0x142/0x2a0
[<0>] md_attr_store+0x6c/0xb0
[<0>] kernfs_fop_write+0x102/0x180
[<0>] __vfs_write+0x33/0x170
[<0>] vfs_write+0xad/0x1a0
[<0>] SyS_write+0x52/0xc0
[<0>] do_syscall_64+0x6e/0x190
[<0>] entry_SYSCALL_64_after_hwframe+0x3d/0xa2
[<0>] 0xffffffffffffffff

Debug tools show that the sync thread is waiting in raise_barrier() until
raid10d() ends all of the normal IO bios queued on bio_end_io_list
(introduced by commit 95af587e95aac). However, raid10d() cannot end those
bios while the MD_CHANGE_PENDING bit is set: the bit is only cleared in
md_check_recovery(), which must take mddev->reconfig_mutex, and that lock
is held by mdadm in action_store().

Thus, there is a circular wait: mdadm waits for the sync thread to stop,
the sync thread waits for raid10d() to end the bios, and raid10d() waits
for mdadm to release mddev->reconfig_mutex so that it can end the bios.

Fix this by also checking MD_RECOVERY_INTR while waiting in
raise_barrier(), so that the sync thread can exit while mdadm is stopping
it. In addition, add a new function, raid10_quiesce_barrier(), for
raid10_quiesce() to preserve the original semantics.
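For reference, the other two legs of the cycle look roughly like this.
This is a simplified sketch from the md code of this era, not a verbatim
quote; note that in current trees MD_CHANGE_PENDING is spelled
MD_SB_CHANGE_PENDING and lives in mddev->sb_flags:

	/* raid10d(): bios parked on bio_end_io_list can only be
	 * completed while the change-pending bit is clear; the bit is
	 * cleared in md_check_recovery(), which needs
	 * mddev->reconfig_mutex.
	 */
	if (!list_empty_careful(&conf->bio_end_io_list) &&
	    !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
		/* splice the list and raid10_end_bio_io() each r10_bio */
	}

	/* action_store() ("idle"/"frozen"): runs with reconfig_mutex
	 * held via mddev_lock(), and sets MD_RECOVERY_INTR before
	 * reaping the sync thread; this is why waiting on that bit in
	 * raise_barrier() breaks the cycle.
	 */
	if (mddev->sync_thread) {
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		md_reap_sync_thread(mddev);	/* kthread_stop() inside */
	}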
Fixes: 95af587e95aac ("md/raid10: ensure device failure recorded before write request returns.")
Signed-off-by: Yufen Yu <yuyufen@xxxxxxxxxx>
---
 drivers/md/raid10.c | 49 +++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 43 insertions(+), 6 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3c60774c8430..17aa4fa8a654 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -960,7 +960,7 @@ static void flush_pending_writes(struct r10conf *conf)
  *    lower_barrier when the particular background IO completes.
  */
-static void raise_barrier(struct r10conf *conf, int force)
+static int raise_barrier(struct r10conf *conf, int force)
 {
 	BUG_ON(force && !conf->barrier);
 	spin_lock_irq(&conf->resync_lock);
@@ -974,10 +974,20 @@ static void raise_barrier(struct r10conf *conf, int force)
 
 	/* Now wait for all pending IO to complete */
 	wait_event_lock_irq(conf->wait_barrier,
-			    !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
+			    (!atomic_read(&conf->nr_pending) &&
+			     conf->barrier < RESYNC_DEPTH) ||
+			    test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery),
 			    conf->resync_lock);
 
+	if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
+		conf->barrier--;
+		spin_unlock_irq(&conf->resync_lock);
+		wake_up(&conf->wait_barrier);
+		return -EINTR;
+	}
+
 	spin_unlock_irq(&conf->resync_lock);
+	return 0;
 }
 
 static void lower_barrier(struct r10conf *conf)
@@ -3130,9 +3140,11 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 				atomic_inc(&mreplace->nr_pending);
 			rcu_read_unlock();
 
+			if (raise_barrier(conf, rb2 != NULL))
+				continue;
+
 			r10_bio = raid10_alloc_init_r10buf(conf);
 			r10_bio->state = 0;
-			raise_barrier(conf, rb2 != NULL);
 			atomic_set(&r10_bio->remaining, 0);
 
 			r10_bio->master_bio = (struct bio*)rb2;
@@ -3349,12 +3361,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 		}
 		if (sync_blocks < max_sync)
 			max_sync = sync_blocks;
+
+		if (raise_barrier(conf, 0))
+			return 0;
+
 		r10_bio = raid10_alloc_init_r10buf(conf);
 		r10_bio->state = 0;
 
 		r10_bio->mddev = mddev;
 		atomic_set(&r10_bio->remaining, 0);
-		raise_barrier(conf, 0);
 		conf->next_resync = sector_nr;
 
 		r10_bio->master_bio = NULL;
@@ -3976,12 +3991,32 @@ static void raid10_free(struct mddev *mddev, void *priv)
 	kfree(conf);
 }
 
+static void raid10_quiesce_barrier(struct r10conf *conf)
+{
+	spin_lock_irq(&conf->resync_lock);
+
+	/* Wait until no block IO is waiting */
+	wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
+			    conf->resync_lock);
+
+	/* block any new IO from starting */
+	conf->barrier++;
+
+	/* Now wait for all pending IO to complete */
+	wait_event_lock_irq(conf->wait_barrier,
+			    !atomic_read(&conf->nr_pending) &&
+			    conf->barrier < RESYNC_DEPTH,
+			    conf->resync_lock);
+
+	spin_unlock_irq(&conf->resync_lock);
+}
+
 static void raid10_quiesce(struct mddev *mddev, int quiesce)
 {
 	struct r10conf *conf = mddev->private;
 
 	if (quiesce)
-		raise_barrier(conf, 0);
+		raid10_quiesce_barrier(conf);
 	else
 		lower_barrier(conf);
 }
@@ -4527,9 +4562,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 
 read_more:
 	/* Now schedule reads for blocks from sector_nr to last */
+	if (raise_barrier(conf, sectors_done != 0))
+		return 0;
+
 	r10_bio = raid10_alloc_init_r10buf(conf);
 	r10_bio->state = 0;
-	raise_barrier(conf, sectors_done != 0);
 	atomic_set(&r10_bio->remaining, 0);
 	r10_bio->mddev = mddev;
 	r10_bio->sector = sector_nr;
-- 
2.13.6