There appears to be a race between md_write_start() and raid5d() which can
cause a deadlock. I ran into this issue while raid_check was running.

 md_write_start:                            raid5d:
 if (mddev->safemode == 1)
     mddev->safemode = 0;
 /* sync_checkers is always 0 when writes_pending is in per-cpu mode */
 if (mddev->in_sync || mddev->sync_checkers) {
     spin_lock(&mddev->lock);
     if (mddev->in_sync) {
         mddev->in_sync = 0;
         set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
         set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
                                            >>> running before md_write_start wakes it up
                                            if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
                                                spin_unlock_irq(&conf->device_lock);
                                                md_check_recovery(mddev);
                                                spin_lock_irq(&conf->device_lock);

                                                /*
                                                 * Waiting on MD_SB_CHANGE_PENDING below may deadlock
                                                 * seeing md_check_recovery() is needed to clear
                                                 * the flag when using mdmon.
                                                 */
                                                continue;
                                            }

                                            wait_event_lock_irq(mddev->sb_wait,     >>>>>>>>>>> hung
                                                !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
                                                conf->device_lock);
         md_wakeup_thread(mddev->thread);
         did_change = 1;
     }
     spin_unlock(&mddev->lock);
 }

 ...

 wait_event(mddev->sb_wait,     >>>>>>>>>> hung
     !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
     mddev->suspended);

Commit 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d")
introduced this issue. Since it wanted a reschedule in order to flush the
blk_plug, let's do that explicitly to address the issue.

Fixes: 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d") Signed-off-by: Junxiao Bi <junxiao.bi@xxxxxxxxxx> --- block/blk-core.c | 1 + drivers/md/raid5.c | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 9d51e9894ece..bc8757a78477 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1149,6 +1149,7 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule) if (unlikely(!rq_list_empty(plug->cached_rq))) blk_mq_free_plug_rqs(plug); } +EXPORT_SYMBOL(__blk_flush_plug); /** * blk_finish_plug - mark the end of a batch of submitted I/O diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 284cd71bcc68..25ec82f2813f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6850,11 +6850,12 @@ static void raid5d(struct md_thread *thread) * the flag when using mdmon. */ continue; + } else { + spin_unlock_irq(&conf->device_lock); + blk_flush_plug(current); + cond_resched(); + spin_lock_irq(&conf->device_lock); } - - wait_event_lock_irq(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), - conf->device_lock); } pr_debug("%d stripes handled\n", handled); -- 2.39.3 (Apple Git-145)