On 11/20/19 2:22 PM, Song Liu wrote:
On Wed, Nov 20, 2019 at 8:29 AM Nigel Croxon <ncroxon@xxxxxxxxxx> wrote:
The MD driver for level-456 should prevent re-reading read errors.
For redundant raid it makes no sense to retry the operation:
When one of the disks in the array hits a read error, that will
cause a stall for the reading process:
- either the read succeeds (e.g. after 4 seconds the HDD error
strategy could read the sector)
- or it fails after HDD imposed timeout (w/TLER, e.g. after 7
seconds (might be even longer)
I am ok with the idea. But we need to be more careful.
The user can enable/disable this functionality by the following
commands:
To Enable:
echo 1 > /proc/sys/dev/raid/raid456_retry_read_error
To Disable, type the following at anytime:
echo 0 > /proc/sys/dev/raid/raid456_retry_read_error
Version 2:
* Renamed *raid456* to *raid5*.
* Changed set_raid5_retry_re routine to use 'if-then' to make cleaner.
* Added set_bit R5_ReadError in retry_aligned_read routine.
Signed-off-by: Nigel Croxon <ncroxon@xxxxxxxxxx>
---
drivers/md/md.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
drivers/md/md.h | 3 +++
drivers/md/raid5.c | 4 +++-
3 files changed, 52 insertions(+), 1 deletion(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1be7abeb24fd..6f47489e0b23 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -125,6 +125,15 @@ static inline int speed_max(struct mddev *mddev)
mddev->sync_speed_max : sysctl_speed_limit_max;
}
+static int sysctl_raid5_retry_read_error = 0;
I don't think we need the sysctl. Per device knob should be sufficient.
Have an example?
Remember, someone might want to enable this before the initial sync.
+static inline void set_raid5_retry_re(struct mddev *mddev, int re)
+{
+ if (re)
+ set_bit(MD_RAID5_RETRY_RE, &mddev->flags);
+ else
+ clear_bit(MD_RAID5_RETRY_RE, &mddev->flags);
+}
+
static int rdev_init_wb(struct md_rdev *rdev)
{
if (rdev->bdev->bd_queue->nr_hw_queues == 1)
@@ -213,6 +222,13 @@ static struct ctl_table raid_table[] = {
.mode = S_IRUGO|S_IWUSR,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "raid5_retry_read_error",
+ .data = &sysctl_raid5_retry_read_error,
+ .maxlen = sizeof(int),
+ .mode = S_IRUGO|S_IWUSR,
+ .proc_handler = proc_dointvec,
+ },
{ }
};
@@ -4721,6 +4737,32 @@ mismatch_cnt_show(struct mddev *mddev, char *page)
static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
+static ssize_t
+raid5_retry_re_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "RAID456 retry Read Error = %u\n",
+ test_bit(MD_RAID5_RETRY_RE, &mddev->flags));
+}
+
+static ssize_t raid5_retry_re_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ int retry;
+
+ if (!mddev->private)
+ return -ENODEV;
+
+ if (len > 1 ||
+ kstrtoint(buf, 10, &retry) ||
+ retry < 0 || retry > 1)
+ return -EINVAL;
+
+ set_raid5_retry_re(mddev, retry);
+ return len;
+}
+
+static struct md_sysfs_entry md_raid5_retry_read_error =
+__ATTR(raid5_retry_read_error, S_IRUGO|S_IWUSR, raid5_retry_re_show, raid5_retry_re_store);
+
static ssize_t
sync_min_show(struct mddev *mddev, char *page)
{
@@ -5272,6 +5314,7 @@ static struct attribute *md_redundancy_attrs[] = {
&md_suspend_hi.attr,
&md_bitmap.attr,
&md_degraded.attr,
+ &md_raid5_retry_read_error.attr,
NULL,
};
static struct attribute_group md_redundancy_group = {
@@ -5833,6 +5876,8 @@ static int do_md_run(struct mddev *mddev)
if (mddev_is_clustered(mddev))
md_allow_write(mddev);
+ set_raid5_retry_re(mddev, sysctl_raid5_retry_read_error);
+
/* run start up tasks that require md_thread */
md_start(mddev);
@@ -8411,6 +8456,7 @@ void md_do_sync(struct md_thread *thread)
else
desc = "recovery";
+ set_raid5_retry_re(mddev, sysctl_raid5_retry_read_error);
mddev->last_sync_action = action ?: desc;
/* we overload curr_resync somewhat here.
diff --git a/drivers/md/md.h b/drivers/md/md.h
index c5e3ff398b59..6703a7d0b633 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -254,6 +254,9 @@ enum mddev_flags {
MD_BROKEN, /* This is used in RAID-0/LINEAR only, to stop
* I/O in case an array member is gone/failed.
*/
+ MD_RAID5_RETRY_RE, /* allow user-space to request RAID456
+ * retry read errors
+ */
The use of "RE" and "re" is not clear. Let's just keep full name like
*retry_read.
Also, please keep the default as "do retry". So it is the same behavior as
older kernels.
I agree, and also like the full name. Will change for next time.
Agreed on the second point. Default is to retry.
+static int sysctl_raid5_retry_read_error = 0;
};
enum mddev_sb_flags {
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 223e97ab27e6..0b627fface78 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2567,7 +2567,8 @@ static void raid5_end_read_request(struct bio * bi)
if (retry)
Can we include checks for MD_RAID5_RETRY_RE in the logic to decide whether
to do "retry = 1"? I think that will keep the logic cleaner.
yes, but it does duplicate 2 lines of code.
if (sh->qd_idx >= 0 && sh->pd_idx == i)
set_bit(R5_ReadError, &sh->dev[i].flags);
- else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
+ else if ((test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) ||
+ (test_bit(MD_RAID5_RETRY_RE, &conf->mddev->flags))) {
set_bit(R5_ReadError, &sh->dev[i].flags);
clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
} else
@@ -6163,6 +6164,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
}
set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
+ set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
Do we need this w/ and w/o MD_RAID5_RETRY_RE
I will double check on this.