Re: raid6 corruption after assembling with event counter difference of 1

Alexander Lyakas <alex.bolshoy@xxxxxxxxx> · Wed, 29 Nov 2023 14:15:33 +0200

Hi Song,

> Thanks for pointing this out. I think this is the actual problem here. In
> analyze_sbs() we first run validate_super() on the freshest device.
> Then, we run validate_super() on other devices. We should mark the
> device as faulty based on the sb from the freshest device. (which is
> not the case at the moment). I think we need to fix this.
Thanks for your suggestion. Based on it, I prepared a patch[1], which,
after manually reproducing the problem, addresses the issue.

The device with less-by-1 event counter is added to the array as
Faulty when the array is started. Then remove_and_add_spares() ejects
this device from the array, as expected. Later, the user performs:
"mdadm --remove" and "madam --re-add", which rebuilds the device. Data
corruption is avoided with this.

Please kindly note the following:
- md has many features, which we don't use, such as: "non-persistent
arrays", "clustered arrays", "autostart", "journal devices",
"write-behind", "multipath arrays" and others. I cannot say how this
fix will interoperate with these features.
- I tested the fix only on superblock version 1.2. I don't know how
0.9 superblocks work (because I never used them), so I did not change
anything in super_90_validate().
- I tested the fix on our production kernel 4.14 LTS. I ported the
changes to the mainline kernel 6.7-rc3, but I do not have the ability
at the moment to test it on this kernel.

Can you please comment on the patch?

> With the above issue fixed, allowing the event counter to be off by 1 is safe.
> Does this make sense? Did I miss cases?
Yes, now it should be safe.

Thanks,
Alex.

[1]
>From 13ba53f3bc99b207386a69e3ef176fb5113d7ee3 Mon Sep 17 00:00:00 2001
From: Alex Lyakas <alex@xxxxxxxxxx>
Date: Wed, 29 Nov 2023 13:38:22 +0200
Subject: [PATCH] md: upon assembling the array, consult the superblock of the
 freshest device

In case we are adding to the array a device, whose event counter
is less by 1 then the one of the freshest device.

Signed-off-by: Alex Lyakas <alex@xxxxxxxxxx>
---
 drivers/md/md.c | 53 +++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 43 insertions(+), 10 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index c94373d..e490275 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1195,6 +1195,7 @@ struct super_type  {
                                          struct md_rdev *refdev,
                                          int minor_version);
        int                 (*validate_super)(struct mddev *mddev,
+                                             struct md_rdev *freshest,
                                              struct md_rdev *rdev);
        void                (*sync_super)(struct mddev *mddev,
                                          struct md_rdev *rdev);
@@ -1333,7 +1334,7 @@ static int super_90_load(struct md_rdev *rdev,
struct md_rdev *refdev, int minor
 /*
  * validate_super for 0.90.0
  */
-static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
+static int super_90_validate(struct mddev *mddev, struct md_rdev
*freshest, struct md_rdev *rdev)
 {
        mdp_disk_t *desc;
        mdp_super_t *sb = page_address(rdev->sb_page);
@@ -1845,7 +1846,7 @@ static int super_1_load(struct md_rdev *rdev,
struct md_rdev *refdev, int minor_
        return ret;
 }

-static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
+static int super_1_validate(struct mddev *mddev, struct md_rdev
*freshest, struct md_rdev *rdev)
 {
        struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
        __u64 ev1 = le64_to_cpu(sb->events);
@@ -1941,13 +1942,15 @@ static int super_1_validate(struct mddev
*mddev, struct md_rdev *rdev)
                }
        } else if (mddev->pers == NULL) {
                /* Insist of good event counter while assembling, except for
-                * spares (which don't need an event count) */
-               ++ev1;
+                * spares (which don't need an event count).
+                * Similar to mdadm, we allow event counter difference of 1
+                * from the freshest device.
+                */
                if (rdev->desc_nr >= 0 &&
                    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
                    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) <
MD_DISK_ROLE_MAX ||
                     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) ==
MD_DISK_ROLE_JOURNAL))
-                       if (ev1 < mddev->events)
+                       if (ev1 + 1 < mddev->events)
                                return -EINVAL;
        } else if (mddev->bitmap) {
                /* If adding to array with a bitmap, then we can accept an
@@ -1968,8 +1971,38 @@ static int super_1_validate(struct mddev
*mddev, struct md_rdev *rdev)
                    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
                        role = MD_DISK_ROLE_SPARE;
                        rdev->desc_nr = -1;
-               } else
+               } else if (mddev->pers == NULL && freshest != NULL &&
ev1 < mddev->events) {
+                       /*
+                        * If we are assembling, and our event counter
is smaller than the
+                        * highest event counter, we cannot trust our
superblock about the role.
+                        * It could happen that our rdev was marked as
Faulty, and all other
+                        * superblocks were updated with +1 event counter.
+                        * Then, before the next superblock update,
which typically happens when
+                        * remove_and_add_spares() removes the device
from the array, there was
+                        * a crash or reboot.
+                        * If we allow current rdev without consulting
the freshest superblock,
+                        * we could cause data corruption.
+                        * Note that in this case our event counter is
smaller by 1 than the
+                        * highest, otherwise, this rdev would not be
allowed into array;
+                        * both kernel and mdadm allow event counter
difference of 1.
+                        */
+                       struct mdp_superblock_1 *freshest_sb =
page_address(freshest->sb_page);
+                       u32 freshest_max_dev =
le32_to_cpu(freshest_sb->max_dev);
+
+                       if (rdev->desc_nr >= freshest_max_dev) {
+                               /* this is unexpected, better not proceed */
+                               pr_warn("md: %s: rdev[%pg]:
desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n",
+                                               mdname(mddev),
rdev->bdev, rdev->desc_nr, freshest->bdev,
+                                               freshest_max_dev);
+                               return -EUCLEAN;
+                       }
+
+                       role =
le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]);
+                       pr_warn("md: %s: rdev[%pg]: role=%d(0x%x)
according to freshest %pg\n",
+                                   mdname(mddev), rdev->bdev, role,
role, freshest->bdev);
+               } else {
                        role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+               }
                switch(role) {
                case MD_DISK_ROLE_SPARE: /* spare */
                        break;
@@ -2876,7 +2909,7 @@ static int add_bound_rdev(struct md_rdev *rdev)
                 * and should be added immediately.
                 */
                super_types[mddev->major_version].
-                       validate_super(mddev, rdev);
+                       validate_super(mddev, NULL/*freshest*/, rdev);
                err = mddev->pers->hot_add_disk(mddev, rdev);
                if (err) {
                        md_kick_rdev_from_array(rdev);
@@ -3813,7 +3846,7 @@ static int analyze_sbs(struct mddev *mddev)
        }

        super_types[mddev->major_version].
-               validate_super(mddev, freshest);
+               validate_super(mddev, NULL/*freshest*/, freshest);

        i = 0;
        rdev_for_each_safe(rdev, tmp, mddev) {
@@ -3828,7 +3861,7 @@ static int analyze_sbs(struct mddev *mddev)
                }
                if (rdev != freshest) {
                        if (super_types[mddev->major_version].
-                           validate_super(mddev, rdev)) {
+                           validate_super(mddev, freshest, rdev)) {
                                pr_warn("md: kicking non-fresh %pg
from array!\n",
                                        rdev->bdev);
                                md_kick_rdev_from_array(rdev);
@@ -6836,7 +6869,7 @@ int md_add_new_disk(struct mddev *mddev, struct
mdu_disk_info_s *info)
                        rdev->saved_raid_disk = rdev->raid_disk;
                } else
                        super_types[mddev->major_version].
-                               validate_super(mddev, rdev);
+                               validate_super(mddev, NULL/*freshest*/, rdev);
                if ((info->state & (1<<MD_DISK_SYNC)) &&
                     rdev->raid_disk != info->raid_disk) {
                        /* This was a hot-add request, but events doesn't
--
1.9.1