We hit a stuck sync thread, with the following stack:

[<0>] raise_barrier+0x90/0x180
[<0>] raid10_sync_request+0x715/0x1d80
[<0>] md_do_sync+0x983/0xfa0
[<0>] md_thread+0x11c/0x160
[<0>] kthread+0x111/0x130
[<0>] ret_from_fork+0x35/0x40
[<0>] 0xffffffffffffffff

At the same time, an mdadm thread (mdadm --manage /dev/md2 --add /dev/sda) is stuck trying to stop the sync thread:

[<0>] kthread_stop+0x42/0xf0
[<0>] md_unregister_thread+0x3a/0x70
[<0>] md_reap_sync_thread+0x15/0x160
[<0>] action_store+0x142/0x2a0
[<0>] md_attr_store+0x6c/0xb0
[<0>] kernfs_fop_write+0x102/0x180
[<0>] __vfs_write+0x33/0x170
[<0>] vfs_write+0xad/0x1a0
[<0>] SyS_write+0x52/0xc0
[<0>] do_syscall_64+0x6e/0x190
[<0>] entry_SYSCALL_64_after_hwframe+0x3d/0xa2
[<0>] 0xffffffffffffffff

Debug tools show that the sync thread is waiting in raise_barrier() until
raid10d() ends all of the normal IO bios queued on bio_end_io_list
(introduced by commit 95af587e95aac). However, raid10d() cannot end those
bios while the MD_CHANGE_PENDING bit is set: the bit is only cleared in
md_check_recovery(), which must take mddev->reconfig_mutex, and that lock
is held by mdadm in action_store().

Thus, there is a circular wait: mdadm waits for the sync thread to stop,
the sync thread waits for raid10d() to end the bios, and raid10d() waits
for mdadm to release mddev->reconfig_mutex so that it can end the bios.

Fix this by also checking MD_RECOVERY_INTR while waiting in
raise_barrier(), so that the sync thread can exit while mdadm is stopping
it. In addition, add a new function, raid10_quiesce_barrier(), for
raid10_quiesce() to preserve the original semantics.
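For reference, the other two legs of the cycle look roughly like this.
This is a simplified sketch from the md code of this era, not a verbatim
quote; note that in current trees MD_CHANGE_PENDING is spelled
MD_SB_CHANGE_PENDING and lives in mddev->sb_flags:

	/* raid10d(): bios parked on bio_end_io_list can only be
	 * completed while the change-pending bit is clear; the bit is
	 * cleared in md_check_recovery(), which needs
	 * mddev->reconfig_mutex.
	 */
	if (!list_empty_careful(&conf->bio_end_io_list) &&
	    !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
		/* splice the list and raid10_end_bio_io() each r10_bio */
	}

	/* action_store() ("idle"/"frozen"): runs with reconfig_mutex
	 * held via mddev_lock(), and sets MD_RECOVERY_INTR before
	 * reaping the sync thread; this is why waiting on that bit in
	 * raise_barrier() breaks the cycle.
	 */
	if (mddev->sync_thread) {
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		md_reap_sync_thread(mddev);	/* kthread_stop() inside */
	}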
Fixes: 95af587e95aac ("md/raid10: ensure device failure recorded before write request returns.")
Signed-off-by: Yufen Yu <yuyufen@xxxxxxxxxx>
---
 drivers/md/raid10.c | 49 +++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 43 insertions(+), 6 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3c60774c8430..17aa4fa8a654 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -960,7 +960,7 @@ static void flush_pending_writes(struct r10conf *conf)
  *    lower_barrier when the particular background IO completes.
  */
-static void raise_barrier(struct r10conf *conf, int force)
+static int raise_barrier(struct r10conf *conf, int force)
 {
 	BUG_ON(force && !conf->barrier);
 	spin_lock_irq(&conf->resync_lock);
@@ -974,10 +974,20 @@ static void raise_barrier(struct r10conf *conf, int force)
 
 	/* Now wait for all pending IO to complete */
 	wait_event_lock_irq(conf->wait_barrier,
-			    !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
+			    (!atomic_read(&conf->nr_pending) &&
+			     conf->barrier < RESYNC_DEPTH) ||
+			    test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery),
 			    conf->resync_lock);
 
+	if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
+		conf->barrier--;
+		spin_unlock_irq(&conf->resync_lock);
+		wake_up(&conf->wait_barrier);
+		return -EINTR;
+	}
+
 	spin_unlock_irq(&conf->resync_lock);
+	return 0;
 }
 
 static void lower_barrier(struct r10conf *conf)
@@ -3130,9 +3140,11 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 				atomic_inc(&mreplace->nr_pending);
 			rcu_read_unlock();
 
+			if (raise_barrier(conf, rb2 != NULL))
+				continue;
+
 			r10_bio = raid10_alloc_init_r10buf(conf);
 			r10_bio->state = 0;
-			raise_barrier(conf, rb2 != NULL);
 			atomic_set(&r10_bio->remaining, 0);
 
 			r10_bio->master_bio = (struct bio*)rb2;
@@ -3349,12 +3361,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 		}
 		if (sync_blocks < max_sync)
 			max_sync = sync_blocks;
+
+		if (raise_barrier(conf, 0))
+			return 0;
+
 		r10_bio = raid10_alloc_init_r10buf(conf);
 		r10_bio->state = 0;
 
 		r10_bio->mddev = mddev;
 		atomic_set(&r10_bio->remaining, 0);
-		raise_barrier(conf, 0);
 		conf->next_resync = sector_nr;
 
 		r10_bio->master_bio = NULL;
@@ -3976,12 +3991,32 @@ static void raid10_free(struct mddev *mddev, void *priv)
 	kfree(conf);
 }
 
+static void raid10_quiesce_barrier(struct r10conf *conf)
+{
+	spin_lock_irq(&conf->resync_lock);
+
+	/* Wait until no block IO is waiting */
+	wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
+			    conf->resync_lock);
+
+	/* block any new IO from starting */
+	conf->barrier++;
+
+	/* Now wait for all pending IO to complete */
+	wait_event_lock_irq(conf->wait_barrier,
+			    !atomic_read(&conf->nr_pending) &&
+			    conf->barrier < RESYNC_DEPTH,
+			    conf->resync_lock);
+
+	spin_unlock_irq(&conf->resync_lock);
+}
+
 static void raid10_quiesce(struct mddev *mddev, int quiesce)
 {
 	struct r10conf *conf = mddev->private;
 
 	if (quiesce)
-		raise_barrier(conf, 0);
+		raid10_quiesce_barrier(conf);
 	else
 		lower_barrier(conf);
 }
@@ -4527,9 +4562,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 
 read_more:
 	/* Now schedule reads for blocks from sector_nr to last */
+	if (raise_barrier(conf, sectors_done != 0))
+		return 0;
+
 	r10_bio = raid10_alloc_init_r10buf(conf);
 	r10_bio->state = 0;
-	raise_barrier(conf, sectors_done != 0);
 	atomic_set(&r10_bio->remaining, 0);
 	r10_bio->mddev = mddev;
 	r10_bio->sector = sector_nr;
-- 
2.13.6