From: Yu Kuai <yukuai3@xxxxxxxxxx> For raid456, if reshape is still in progress, then IO across reshape position will wait for reshape to make progress. However, for dm-raid, in following cases reshape will never make progress hence IO will hang: 1) the array is read-only; 2) MD_RECOVERY_WAIT is set; 3) MD_RECOVERY_FROZEN is set; After commit c467e97f079f ("md/raid6: use valid sector values to determine if an I/O should wait on the reshape") fix the problem that IO across reshape position doesn't wait for reshape, the dm-raid test shell/lvconvert-raid-reshape.sh start to hang: [root@fedora ~]# cat /proc/979/stack [<0>] wait_woken+0x7d/0x90 [<0>] raid5_make_request+0x929/0x1d70 [raid456] [<0>] md_handle_request+0xc2/0x3b0 [md_mod] [<0>] raid_map+0x2c/0x50 [dm_raid] [<0>] __map_bio+0x251/0x380 [dm_mod] [<0>] dm_submit_bio+0x1f0/0x760 [dm_mod] [<0>] __submit_bio+0xc2/0x1c0 [<0>] submit_bio_noacct_nocheck+0x17f/0x450 [<0>] submit_bio_noacct+0x2bc/0x780 [<0>] submit_bio+0x70/0xc0 [<0>] mpage_readahead+0x169/0x1f0 [<0>] blkdev_readahead+0x18/0x30 [<0>] read_pages+0x7c/0x3b0 [<0>] page_cache_ra_unbounded+0x1ab/0x280 [<0>] force_page_cache_ra+0x9e/0x130 [<0>] page_cache_sync_ra+0x3b/0x110 [<0>] filemap_get_pages+0x143/0xa30 [<0>] filemap_read+0xdc/0x4b0 [<0>] blkdev_read_iter+0x75/0x200 [<0>] vfs_read+0x272/0x460 [<0>] ksys_read+0x7a/0x170 [<0>] __x64_sys_read+0x1c/0x30 [<0>] do_syscall_64+0xc6/0x230 [<0>] entry_SYSCALL_64_after_hwframe+0x6c/0x74 This is because reshape can't make progress. For md/raid, the problem doesn't exist because register new sync_thread doesn't rely on the IO to be done any more: 1) If array is read-only, it can switch to read-write by ioctl/sysfs; 2) md/raid never set MD_RECOVERY_WAIT; 3) If MD_RECOVERY_FROZEN is set, mddev_suspend() doesn't hold 'reconfig_mutex', hence it can be cleared and reshape can continue by sysfs api 'sync_action'. However, I'm not sure yet how to avoid the problem in dm-raid yet. This patch detect the above 3 cases in dm_suspend(), and fail those IO directly. If user really meet the IO error, then it means they're reading the wrong data before c467e97f079f. And it's safe to read/write the array after reshape make progress successfully. Signed-off-by: Yu Kuai <yukuai3@xxxxxxxxxx> --- drivers/md/md.h | 2 +- drivers/md/raid5.c | 32 +++++++++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.h b/drivers/md/md.h index 98da86d38ba8..8e81f9e2fb20 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -565,7 +565,7 @@ enum md_ro_state { MD_MAX_STATE }; -static bool md_is_rdwr(struct mddev *mddev) +static inline bool md_is_rdwr(struct mddev *mddev) { return (mddev->ro == MD_RDWR); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6a7a32f7fb91..812d7ec64da5 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5915,6 +5915,13 @@ static int add_all_stripe_bios(struct r5conf *conf, return ret; } +static bool reshape_disabled(struct mddev *mddev) +{ + return !md_is_rdwr(mddev) || + test_bit(MD_RECOVERY_WAIT, &mddev->recovery) || + test_bit(MD_RECOVERY_FROZEN, &mddev->recovery); +} + static enum stripe_result make_stripe_request(struct mddev *mddev, struct r5conf *conf, struct stripe_request_ctx *ctx, sector_t logical_sector, struct bio *bi) @@ -5946,7 +5953,8 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, if (ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) { spin_unlock_irq(&conf->device_lock); - return STRIPE_SCHEDULE_AND_RETRY; + ret = STRIPE_SCHEDULE_AND_RETRY; + goto out; } } spin_unlock_irq(&conf->device_lock); @@ -6025,6 +6033,13 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, out_release: raid5_release_stripe(sh); +out: + if (ret == STRIPE_SCHEDULE_AND_RETRY && !mddev->gendisk && + reshape_disabled(mddev)) { + bi->bi_status = BLK_STS_IOERR; + ret = STRIPE_FAIL; + pr_err("dm-raid456: io failed across reshape position while reshape can't make progress"); + } return ret; } @@ -8909,6 +8924,18 @@ static int raid5_start(struct mddev *mddev) return r5l_start(conf->log); } +/* + * This is only used for dm-raid456, caller already frozen sync_thread, hence + * if rehsape is still in progress, io that is waiting for reshape can never be + * done now, hence wake up and handle those IO. + */ +static void raid5_prepare_suspend(struct mddev *mddev) +{ + struct r5conf *conf = mddev->private; + + wake_up(&conf->wait_for_overlap); +} + static struct md_personality raid6_personality = { .name = "raid6", @@ -8932,6 +8959,7 @@ static struct md_personality raid6_personality = .quiesce = raid5_quiesce, .takeover = raid6_takeover, .change_consistency_policy = raid5_change_consistency_policy, + .prepare_suspend = raid5_prepare_suspend, }; static struct md_personality raid5_personality = { @@ -8956,6 +8984,7 @@ static struct md_personality raid5_personality = .quiesce = raid5_quiesce, .takeover = raid5_takeover, .change_consistency_policy = raid5_change_consistency_policy, + .prepare_suspend = raid5_prepare_suspend, }; static struct md_personality raid4_personality = @@ -8981,6 +9010,7 @@ static struct md_personality raid4_personality = .quiesce = raid5_quiesce, .takeover = raid4_takeover, .change_consistency_policy = raid5_change_consistency_policy, + .prepare_suspend = raid5_prepare_suspend, }; static int __init raid5_init(void) -- 2.39.2