Hi, Jove
On 2023/05/06 21:07, Jove wrote:
> Hi Kuai,
>
> Just to confirm, the array seems fine after the reshape. Copying files now.
>
> Would it be best if I scrap this array and create a new one, or is this
> array safe to use in the long term? I had to use the --invalid-backup
> flag to get it to reshape, so there might be corruption before that
> resume point?
>
> I have to do a reshape anyway, to 5 raid devices.
>
>> In the meantime, I'll try to fix this deadlock, hope you don't mind a
>> reported-by tag.
>
> I would not, thank you.
>
> I still have the backup images of the drive in reshape. If you wish, I
> can test any fix you create.

Here is the first version of the fix patch: it fails the IO that is
waiting for reshape while reshape can't make progress. I tested it in my
VM and it works as I expected. Can you give it a try to see if mdadm can
still assemble?
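
For reference, something like the following should exercise the assemble
path on your side (the array and member device names below are just
placeholders for your setup):

    # stop the array if it is running, then try to re-assemble it
    mdadm --stop /dev/md0
    mdadm --assemble /dev/md0 /dev/sd[b-f]1
    # check that the array came up and the reshape state looks sane
    cat /proc/mdstat
    mdadm --detail /dev/md0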
Thanks,
Kuai

>> I have no idea why systemd-udevd is accessing the array.
>
> My guess is that it is accessing this array because it checks it for
> the LVM layout, so it can automatically create the /dev/mapper entries.
> With systemd-udevd disabled, these entries do not automatically appear.
>
> And thank you again for getting me my data back.
>
> Best regards,
> Johan

From 159ea7c8d591882dfbbdf30938c1c1d5bc9d4931 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@xxxxxxxxxx>
Date: Tue, 9 May 2023 09:28:36 +0800
Subject: [PATCH] md: fix raid456 deadlock

If the array is suspended or is not read-write, reshape can't make
progress, so IO that is waiting for reshape can never be done and
mddev_suspend() will wait for active_io forever. Add a prepare_suspend()
callback so that raid456 can wake such IO and fail it instead of letting
it wait.

Signed-off-by: Yu Kuai <yukuai3@xxxxxxxxxx>
---
drivers/md/md.c | 20 ++++----------------
drivers/md/md.h | 18 ++++++++++++++++++
drivers/md/raid5.c | 32 +++++++++++++++++++++++++++++++-
3 files changed, 53 insertions(+), 17 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8e344b4b3444..462529e47f19 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -93,18 +93,6 @@ static int remove_and_add_spares(struct mddev *mddev,
struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);
-enum md_ro_state {
- MD_RDWR,
- MD_RDONLY,
- MD_AUTO_READ,
- MD_MAX_STATE
-};
-
-static bool md_is_rdwr(struct mddev *mddev)
-{
- return (mddev->ro == MD_RDWR);
-}
-
/*
* Default number of read corrections we'll attempt on an rdev
* before ejecting it from the array. We divide the read error
@@ -360,10 +348,6 @@ EXPORT_SYMBOL_GPL(md_new_event);
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);
-static bool is_md_suspended(struct mddev *mddev)
-{
- return percpu_ref_is_dying(&mddev->active_io);
-}
/* Rather than calling directly into the personality make_request function,
* IO requests come here first so that we can check if the device is
* being suspended pending a reconfiguration.
@@ -464,6 +448,10 @@ void mddev_suspend(struct mddev *mddev)
wake_up(&mddev->sb_wait);
set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
percpu_ref_kill(&mddev->active_io);
+
+ if (mddev->pers->prepare_suspend)
+ mddev->pers->prepare_suspend(mddev);
+
wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io));
mddev->pers->quiesce(mddev, 1);
clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index fd8f260ed5f8..292b96a15890 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -536,6 +536,23 @@ struct mddev {
bool serialize_policy:1;
};
+enum md_ro_state {
+ MD_RDWR,
+ MD_RDONLY,
+ MD_AUTO_READ,
+ MD_MAX_STATE
+};
+
+static inline bool md_is_rdwr(struct mddev *mddev)
+{
+ return (mddev->ro == MD_RDWR);
+}
+
+static inline bool is_md_suspended(struct mddev *mddev)
+{
+ return percpu_ref_is_dying(&mddev->active_io);
+}
+
enum recovery_flags {
/*
* If neither SYNC or RESHAPE are set, then it is a recovery.
@@ -614,6 +631,7 @@ struct md_personality
int (*start_reshape) (struct mddev *mddev);
void (*finish_reshape) (struct mddev *mddev);
void (*update_reshape_pos) (struct mddev *mddev);
+ void (*prepare_suspend) (struct mddev *mddev);
/* quiesce suspends or resumes internal processing.
* 1 - stop new actions and wait for action io to complete
* 0 - return to normal behaviour
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 812a12e3e41a..5a24935c113d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -761,6 +761,7 @@ enum stripe_result {
STRIPE_RETRY,
STRIPE_SCHEDULE_AND_RETRY,
STRIPE_FAIL,
+ STRIPE_FAIL_AND_RETRY,
};
struct stripe_request_ctx {
@@ -5997,7 +5998,8 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
if (ahead_of_reshape(mddev, logical_sector,
conf->reshape_safe)) {
spin_unlock_irq(&conf->device_lock);
- return STRIPE_SCHEDULE_AND_RETRY;
+ ret = STRIPE_SCHEDULE_AND_RETRY;
+ goto out;
}
}
spin_unlock_irq(&conf->device_lock);
@@ -6076,6 +6078,18 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
out_release:
raid5_release_stripe(sh);
+out:
+ /*
+ * There is no point in waiting for reshape, because reshape can't make
+ * progress if the array is suspended or is not read-write.
+ */
+ if (ret == STRIPE_SCHEDULE_AND_RETRY &&
+ (is_md_suspended(mddev) || !md_is_rdwr(mddev))) {
+ bi->bi_status = BLK_STS_IOERR;
+ ret = STRIPE_FAIL;
+ pr_err("md/raid456:%s: array is suspended or not read write, io accross reshape position failed, please try again after reshape.\n",
+ mdname(mddev));
+ }
return ret;
}
@@ -8654,6 +8668,19 @@ static void raid5_finish_reshape(struct mddev *mddev)
}
}
+static void raid5_prepare_suspend(struct mddev *mddev)
+{
+ struct r5conf *conf = mddev->private;
+
+ /*
+ * Before waiting for active_io to be done, fail all the IO that is
+ * waiting for reshape, because it can never complete after suspend.
+ *
+ * Perhaps it's better to let that IO wait for resume than to fail it.
+ */
+ wake_up(&conf->wait_for_overlap);
+}
+
static void raid5_quiesce(struct mddev *mddev, int quiesce)
{
struct r5conf *conf = mddev->private;
@@ -9020,6 +9047,7 @@ static struct md_personality raid6_personality =
.check_reshape = raid6_check_reshape,
.start_reshape = raid5_start_reshape,
.finish_reshape = raid5_finish_reshape,
+ .prepare_suspend = raid5_prepare_suspend,
.quiesce = raid5_quiesce,
.takeover = raid6_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
@@ -9044,6 +9072,7 @@ static struct md_personality raid5_personality =
.check_reshape = raid5_check_reshape,
.start_reshape = raid5_start_reshape,
.finish_reshape = raid5_finish_reshape,
+ .prepare_suspend = raid5_prepare_suspend,
.quiesce = raid5_quiesce,
.takeover = raid5_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
@@ -9069,6 +9098,7 @@ static struct md_personality raid4_personality =
.check_reshape = raid5_check_reshape,
.start_reshape = raid5_start_reshape,
.finish_reshape = raid5_finish_reshape,
+ .prepare_suspend = raid5_prepare_suspend,
.quiesce = raid5_quiesce,
.takeover = raid4_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
--
2.39.2