Hi, Jove
On 2023/05/06 21:07, Jove wrote:
> Hi Kuai,
>
> Just to confirm, the array seems fine after the reshape. Copying files now.
>
> Would it be best if I scrap this array and create a new one, or is this
> array safe to use in the long term? I had to use the --invalid-backup
> flag to get it to reshape, so there might be corruption before that
> resume point?
>
> I have to do a reshape anyway, to 5 raid devices.
>
>> In the meantime, I'll try to fix this deadlock, hope you don't mind a
>> reported-by tag.
>
> I would not, thank you.
>
> I still have the backup images of the drive in reshape. If you wish, I
> can test any fix you create.

Here is the first version of the fix patch: it fails the IO that is
waiting for reshape while reshape can't make progress. I tested it in my
VM and it works as I expected. Can you give it a try to see if mdadm can
still assemble?
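
For reference, something like the following should exercise the assemble
path on your side (the array and member device names below are just
placeholders for your setup):

    # stop the array if it is running, then try to re-assemble it
    mdadm --stop /dev/md0
    mdadm --assemble /dev/md0 /dev/sd[b-f]1
    # check that the array came up and the reshape state looks sane
    cat /proc/mdstat
    mdadm --detail /dev/md0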
Thanks,
Kuai

>> I have no idea why systemd-udevd is accessing the array.
>
> My guess is that it is accessing this array because it checks it for
> the LVM layout, so it can automatically create the /dev/mapper entries.
> With systemd-udevd disabled, these entries do not automatically appear.
>
> And thank you again for getting me my data back.
>
> Best regards,
> Johan

From 159ea7c8d591882dfbbdf30938c1c1d5bc9d4931 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@xxxxxxxxxx>
Date: Tue, 9 May 2023 09:28:36 +0800
Subject: [PATCH] md: fix raid456 deadlock

If the array is suspended or is not read-write, reshape can't make
progress, so IO that is waiting for reshape can never be done and
mddev_suspend() will wait for active_io forever. Add a prepare_suspend()
callback so that raid456 can wake such IO and fail it instead of letting
it wait.

Signed-off-by: Yu Kuai <yukuai3@xxxxxxxxxx>
---
drivers/md/md.c | 20 ++++----------------
drivers/md/md.h | 18 ++++++++++++++++++
drivers/md/raid5.c | 32 +++++++++++++++++++++++++++++++-
3 files changed, 53 insertions(+), 17 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8e344b4b3444..462529e47f19 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -93,18 +93,6 @@ static int remove_and_add_spares(struct mddev *mddev,
struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);
-enum md_ro_state {
- MD_RDWR,
- MD_RDONLY,
- MD_AUTO_READ,
- MD_MAX_STATE
-};
-
-static bool md_is_rdwr(struct mddev *mddev)
-{
- return (mddev->ro == MD_RDWR);
-}
-
/*
* Default number of read corrections we'll attempt on an rdev
* before ejecting it from the array. We divide the read error
@@ -360,10 +348,6 @@ EXPORT_SYMBOL_GPL(md_new_event);
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);
-static bool is_md_suspended(struct mddev *mddev)
-{
- return percpu_ref_is_dying(&mddev->active_io);
-}
/* Rather than calling directly into the personality make_request function,
* IO requests come here first so that we can check if the device is
* being suspended pending a reconfiguration.
@@ -464,6 +448,10 @@ void mddev_suspend(struct mddev *mddev)
wake_up(&mddev->sb_wait);
set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
percpu_ref_kill(&mddev->active_io);
+
+ if (mddev->pers->prepare_suspend)
+ mddev->pers->prepare_suspend(mddev);
+
wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io));
mddev->pers->quiesce(mddev, 1);
clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index fd8f260ed5f8..292b96a15890 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -536,6 +536,23 @@ struct mddev {
bool serialize_policy:1;
};
+enum md_ro_state {
+ MD_RDWR,
+ MD_RDONLY,
+ MD_AUTO_READ,
+ MD_MAX_STATE
+};
+
+static inline bool md_is_rdwr(struct mddev *mddev)
+{
+ return (mddev->ro == MD_RDWR);
+}
+
+static inline bool is_md_suspended(struct mddev *mddev)
+{
+ return percpu_ref_is_dying(&mddev->active_io);
+}
+
enum recovery_flags {
/*
* If neither SYNC or RESHAPE are set, then it is a recovery.
@@ -614,6 +631,7 @@ struct md_personality
int (*start_reshape) (struct mddev *mddev);
void (*finish_reshape) (struct mddev *mddev);
void (*update_reshape_pos) (struct mddev *mddev);
+ void (*prepare_suspend) (struct mddev *mddev);
/* quiesce suspends or resumes internal processing.
* 1 - stop new actions and wait for action io to complete
* 0 - return to normal behaviour
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 812a12e3e41a..5a24935c113d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -761,6 +761,7 @@ enum stripe_result {
STRIPE_RETRY,
STRIPE_SCHEDULE_AND_RETRY,
STRIPE_FAIL,
+ STRIPE_FAIL_AND_RETRY,
};
struct stripe_request_ctx {
@@ -5997,7 +5998,8 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
if (ahead_of_reshape(mddev, logical_sector,
conf->reshape_safe)) {
spin_unlock_irq(&conf->device_lock);
- return STRIPE_SCHEDULE_AND_RETRY;
+ ret = STRIPE_SCHEDULE_AND_RETRY;
+ goto out;
}
}
spin_unlock_irq(&conf->device_lock);
@@ -6076,6 +6078,18 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
out_release:
raid5_release_stripe(sh);
+out:
+ /*
+ * There is no point in waiting for reshape, because reshape can't make
+ * progress if the array is suspended or is not read-write.
+ */
+ if (ret == STRIPE_SCHEDULE_AND_RETRY &&
+ (is_md_suspended(mddev) || !md_is_rdwr(mddev))) {
+ bi->bi_status = BLK_STS_IOERR;
+ ret = STRIPE_FAIL;
+ pr_err("md/raid456:%s: array is suspended or not read write, io accross reshape position failed, please try again after reshape.\n",
+ mdname(mddev));
+ }
return ret;
}
@@ -8654,6 +8668,19 @@ static void raid5_finish_reshape(struct mddev *mddev)
}
}
+static void raid5_prepare_suspend(struct mddev *mddev)
+{
+ struct r5conf *conf = mddev->private;
+
+ /*
+ * Before waiting for active_io to be done, fail all the IO that is
+ * waiting for reshape, because it can never complete after suspend.
+ *
+ * Perhaps it's better to let that IO wait for resume than to fail it.
+ */
+ wake_up(&conf->wait_for_overlap);
+}
+
static void raid5_quiesce(struct mddev *mddev, int quiesce)
{
struct r5conf *conf = mddev->private;
@@ -9020,6 +9047,7 @@ static struct md_personality raid6_personality =
.check_reshape = raid6_check_reshape,
.start_reshape = raid5_start_reshape,
.finish_reshape = raid5_finish_reshape,
+ .prepare_suspend = raid5_prepare_suspend,
.quiesce = raid5_quiesce,
.takeover = raid6_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
@@ -9044,6 +9072,7 @@ static struct md_personality raid5_personality =
.check_reshape = raid5_check_reshape,
.start_reshape = raid5_start_reshape,
.finish_reshape = raid5_finish_reshape,
+ .prepare_suspend = raid5_prepare_suspend,
.quiesce = raid5_quiesce,
.takeover = raid5_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
@@ -9069,6 +9098,7 @@ static struct md_personality raid4_personality =
.check_reshape = raid5_check_reshape,
.start_reshape = raid5_start_reshape,
.finish_reshape = raid5_finish_reshape,
+ .prepare_suspend = raid5_prepare_suspend,
.quiesce = raid5_quiesce,
.takeover = raid4_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
--
2.39.2