Re: raid6 corruption after assembling with event counter difference of 1

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Alexander,

Thanks for working on this.

The patch looks good overall. A few comments below.

On Wed, Nov 29, 2023 at 4:15 AM Alexander Lyakas <alex.bolshoy@xxxxxxxxx> wrote:
>
[...]
>
> Please kindly note the following:
> - md has many features, which we don't use, such as: "non-persistent
> arrays", "clustered arrays", "autostart", "journal devices",
> "write-behind", "multipath arrays" and others. I cannot say how this
> fix will interoperate with these features.
> - I tested the fix only on superblock version 1.2. I don't know how
> 0.9 superblocks work (because I never used them), so I did not change
> anything in super_90_validate().

I think we can leave 0.9 as-is.

> - I tested the fix on our production kernel 4.14 LTS. I ported the
> changes to the mainline kernel 6.7-rc3, but I do not have the ability
> at the moment to test it on this kernel.
[...]
>  drivers/md/md.c | 53 +++++++++++++++++++++++++++++++++++++++++++----------
>  1 file changed, 43 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index c94373d..e490275 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -1195,6 +1195,7 @@ struct super_type  {
>                                           struct md_rdev *refdev,
>                                           int minor_version);
>         int                 (*validate_super)(struct mddev *mddev,
> +                                             struct md_rdev *freshest,
>                                               struct md_rdev *rdev);
>         void                (*sync_super)(struct mddev *mddev,
>                                           struct md_rdev *rdev);
> @@ -1333,7 +1334,7 @@ static int super_90_load(struct md_rdev *rdev,
> struct md_rdev *refdev, int minor
>  /*
>   * validate_super for 0.90.0
>   */
> -static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
> +static int super_90_validate(struct mddev *mddev, struct md_rdev
> *freshest, struct md_rdev *rdev)
>  {
>         mdp_disk_t *desc;
>         mdp_super_t *sb = page_address(rdev->sb_page);
> @@ -1845,7 +1846,7 @@ static int super_1_load(struct md_rdev *rdev,
> struct md_rdev *refdev, int minor_
>         return ret;
>  }
>
> -static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
> +static int super_1_validate(struct mddev *mddev, struct md_rdev
> *freshest, struct md_rdev *rdev)
>  {
>         struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
>         __u64 ev1 = le64_to_cpu(sb->events);
> @@ -1941,13 +1942,15 @@ static int super_1_validate(struct mddev
> *mddev, struct md_rdev *rdev)
>                 }
>         } else if (mddev->pers == NULL) {
>                 /* Insist of good event counter while assembling, except for
> -                * spares (which don't need an event count) */
> -               ++ev1;
> +                * spares (which don't need an event count).
> +                * Similar to mdadm, we allow event counter difference of 1
> +                * from the freshest device.
> +                */
>                 if (rdev->desc_nr >= 0 &&
>                     rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
>                     (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) <
> MD_DISK_ROLE_MAX ||
>                      le16_to_cpu(sb->dev_roles[rdev->desc_nr]) ==
> MD_DISK_ROLE_JOURNAL))
> -                       if (ev1 < mddev->events)
> +                       if (ev1 + 1 < mddev->events)
>                                 return -EINVAL;
>         } else if (mddev->bitmap) {
>                 /* If adding to array with a bitmap, then we can accept an
> @@ -1968,8 +1971,38 @@ static int super_1_validate(struct mddev
> *mddev, struct md_rdev *rdev)
>                     rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
>                         role = MD_DISK_ROLE_SPARE;
>                         rdev->desc_nr = -1;
> -               } else
> +               } else if (mddev->pers == NULL && freshest != NULL &&
> ev1 < mddev->events) {
> +                       /*
> +                        * If we are assembling, and our event counter
> is smaller than the
> +                        * highest event counter, we cannot trust our
> superblock about the role.
> +                        * It could happen that our rdev was marked as
> Faulty, and all other
> +                        * superblocks were updated with +1 event counter.
> +                        * Then, before the next superblock update,
> which typically happens when
> +                        * remove_and_add_spares() removes the device
> from the array, there was
> +                        * a crash or reboot.
> +                        * If we allow current rdev without consulting
> the freshest superblock,
> +                        * we could cause data corruption.
> +                        * Note that in this case our event counter is
> smaller by 1 than the
> +                        * highest, otherwise, this rdev would not be
> allowed into array;
> +                        * both kernel and mdadm allow event counter
> difference of 1.
> +                        */
> +                       struct mdp_superblock_1 *freshest_sb =
> page_address(freshest->sb_page);
> +                       u32 freshest_max_dev =
> le32_to_cpu(freshest_sb->max_dev);
> +
> +                       if (rdev->desc_nr >= freshest_max_dev) {
> +                               /* this is unexpected, better not proceed */
> +                               pr_warn("md: %s: rdev[%pg]:
> desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n",
> +                                               mdname(mddev),
> rdev->bdev, rdev->desc_nr, freshest->bdev,
> +                                               freshest_max_dev);
> +                               return -EUCLEAN;
> +                       }
> +
> +                       role =
> le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]);
> +                       pr_warn("md: %s: rdev[%pg]: role=%d(0x%x)
> according to freshest %pg\n",
> +                                   mdname(mddev), rdev->bdev, role,
> role, freshest->bdev);

I think this should be a pr_debug(). Or maybe only warn when
freshest and current rdev disagree.

> +               } else {
>                         role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
> +               }
>                 switch(role) {
>                 case MD_DISK_ROLE_SPARE: /* spare */
>                         break;
> @@ -2876,7 +2909,7 @@ static int add_bound_rdev(struct md_rdev *rdev)
>                  * and should be added immediately.
>                  */
>                 super_types[mddev->major_version].
> -                       validate_super(mddev, rdev);
> +                       validate_super(mddev, NULL/*freshest*/, rdev);
>                 err = mddev->pers->hot_add_disk(mddev, rdev);
>                 if (err) {
>                         md_kick_rdev_from_array(rdev);
> @@ -3813,7 +3846,7 @@ static int analyze_sbs(struct mddev *mddev)
>         }
>
>         super_types[mddev->major_version].
> -               validate_super(mddev, freshest);
> +               validate_super(mddev, NULL/*freshest*/, freshest);
>
>         i = 0;
>         rdev_for_each_safe(rdev, tmp, mddev) {
> @@ -3828,7 +3861,7 @@ static int analyze_sbs(struct mddev *mddev)
>                 }
>                 if (rdev != freshest) {
>                         if (super_types[mddev->major_version].
> -                           validate_super(mddev, rdev)) {
> +                           validate_super(mddev, freshest, rdev)) {
>                                 pr_warn("md: kicking non-fresh %pg
> from array!\n",
>                                         rdev->bdev);
>                                 md_kick_rdev_from_array(rdev);
> @@ -6836,7 +6869,7 @@ int md_add_new_disk(struct mddev *mddev, struct
> mdu_disk_info_s *info)
>                         rdev->saved_raid_disk = rdev->raid_disk;
>                 } else
>                         super_types[mddev->major_version].
> -                               validate_super(mddev, rdev);
> +                               validate_super(mddev, NULL/*freshest*/, rdev);
>                 if ((info->state & (1<<MD_DISK_SYNC)) &&
>                      rdev->raid_disk != info->raid_disk) {
>                         /* This was a hot-add request, but events doesn't
> --
> 1.9.1





[Index of Archives]     [Linux RAID Wiki]     [ATA RAID]     [Linux SCSI Target Infrastructure]     [Linux Block]     [Linux IDE]     [Linux SCSI]     [Linux Hams]     [Device Mapper]     [Device Mapper Cryptographics]     [Kernel]     [Linux Admin]     [Linux Net]     [GFS]     [RPM]     [git]     [Yosemite Forum]


  Powered by Linux