Hi Alexander, Thanks for working on this. The patch looks good overall. A few comments below. On Wed, Nov 29, 2023 at 4:15 AM Alexander Lyakas <alex.bolshoy@xxxxxxxxx> wrote: > [...] > > Please kindly note the following: > - md has many features, which we don't use, such as: "non-persistent > arrays", "clustered arrays", "autostart", "journal devices", > "write-behind", "multipath arrays" and others. I cannot say how this > fix will interoperate with these features. > - I tested the fix only on superblock version 1.2. I don't know how > 0.9 superblocks work (because I never used them), so I did not change > anything in super_90_validate(). I think we can leave 0.9 as-is. > - I tested the fix on our production kernel 4.14 LTS. I ported the > changes to the mainline kernel 6.7-rc3, but I do not have the ability > at the moment to test it on this kernel. [...] > drivers/md/md.c | 53 +++++++++++++++++++++++++++++++++++++++++++---------- > 1 file changed, 43 insertions(+), 10 deletions(-) > > diff --git a/drivers/md/md.c b/drivers/md/md.c > index c94373d..e490275 100644 > --- a/drivers/md/md.c > +++ b/drivers/md/md.c > @@ -1195,6 +1195,7 @@ struct super_type { > struct md_rdev *refdev, > int minor_version); > int (*validate_super)(struct mddev *mddev, > + struct md_rdev *freshest, > struct md_rdev *rdev); > void (*sync_super)(struct mddev *mddev, > struct md_rdev *rdev); > @@ -1333,7 +1334,7 @@ static int super_90_load(struct md_rdev *rdev, > struct md_rdev *refdev, int minor > /* > * validate_super for 0.90.0 > */ > -static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) > +static int super_90_validate(struct mddev *mddev, struct md_rdev > *freshest, struct md_rdev *rdev) > { > mdp_disk_t *desc; > mdp_super_t *sb = page_address(rdev->sb_page); > @@ -1845,7 +1846,7 @@ static int super_1_load(struct md_rdev *rdev, > struct md_rdev *refdev, int minor_ > return ret; > } > > -static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) > +static int super_1_validate(struct mddev *mddev, struct md_rdev > *freshest, struct md_rdev *rdev) > { > struct mdp_superblock_1 *sb = page_address(rdev->sb_page); > __u64 ev1 = le64_to_cpu(sb->events); > @@ -1941,13 +1942,15 @@ static int super_1_validate(struct mddev > *mddev, struct md_rdev *rdev) > } > } else if (mddev->pers == NULL) { > /* Insist of good event counter while assembling, except for > - * spares (which don't need an event count) */ > - ++ev1; > + * spares (which don't need an event count). > + * Similar to mdadm, we allow event counter difference of 1 > + * from the freshest device. > + */ > if (rdev->desc_nr >= 0 && > rdev->desc_nr < le32_to_cpu(sb->max_dev) && > (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < > MD_DISK_ROLE_MAX || > le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == > MD_DISK_ROLE_JOURNAL)) > - if (ev1 < mddev->events) > + if (ev1 + 1 < mddev->events) > return -EINVAL; > } else if (mddev->bitmap) { > /* If adding to array with a bitmap, then we can accept an > @@ -1968,8 +1971,38 @@ static int super_1_validate(struct mddev > *mddev, struct md_rdev *rdev) > rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { > role = MD_DISK_ROLE_SPARE; > rdev->desc_nr = -1; > - } else > + } else if (mddev->pers == NULL && freshest != NULL && > ev1 < mddev->events) { > + /* > + * If we are assembling, and our event counter > is smaller than the > + * highest event counter, we cannot trust our > superblock about the role. > + * It could happen that our rdev was marked as > Faulty, and all other > + * superblocks were updated with +1 event counter. > + * Then, before the next superblock update, > which typically happens when > + * remove_and_add_spares() removes the device > from the array, there was > + * a crash or reboot. > + * If we allow current rdev without consulting > the freshest superblock, > + * we could cause data corruption. > + * Note that in this case our event counter is > smaller by 1 than the > + * highest, otherwise, this rdev would not be > allowed into array; > + * both kernel and mdadm allow event counter > difference of 1. > + */ > + struct mdp_superblock_1 *freshest_sb = > page_address(freshest->sb_page); > + u32 freshest_max_dev = > le32_to_cpu(freshest_sb->max_dev); > + > + if (rdev->desc_nr >= freshest_max_dev) { > + /* this is unexpected, better not proceed */ > + pr_warn("md: %s: rdev[%pg]: > desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n", > + mdname(mddev), > rdev->bdev, rdev->desc_nr, freshest->bdev, > + freshest_max_dev); > + return -EUCLEAN; > + } > + > + role = > le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]); > + pr_warn("md: %s: rdev[%pg]: role=%d(0x%x) > according to freshest %pg\n", > + mdname(mddev), rdev->bdev, role, > role, freshest->bdev); I think this should be a pr_debug(). Or maybe only warn when freshest and current rdev disagree. > + } else { > role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); > + } > switch(role) { > case MD_DISK_ROLE_SPARE: /* spare */ > break; > @@ -2876,7 +2909,7 @@ static int add_bound_rdev(struct md_rdev *rdev) > * and should be added immediately. > */ > super_types[mddev->major_version]. > - validate_super(mddev, rdev); > + validate_super(mddev, NULL/*freshest*/, rdev); > err = mddev->pers->hot_add_disk(mddev, rdev); > if (err) { > md_kick_rdev_from_array(rdev); > @@ -3813,7 +3846,7 @@ static int analyze_sbs(struct mddev *mddev) > } > > super_types[mddev->major_version]. > - validate_super(mddev, freshest); > + validate_super(mddev, NULL/*freshest*/, freshest); > > i = 0; > rdev_for_each_safe(rdev, tmp, mddev) { > @@ -3828,7 +3861,7 @@ static int analyze_sbs(struct mddev *mddev) > } > if (rdev != freshest) { > if (super_types[mddev->major_version]. > - validate_super(mddev, rdev)) { > + validate_super(mddev, freshest, rdev)) { > pr_warn("md: kicking non-fresh %pg > from array!\n", > rdev->bdev); > md_kick_rdev_from_array(rdev); > @@ -6836,7 +6869,7 @@ int md_add_new_disk(struct mddev *mddev, struct > mdu_disk_info_s *info) > rdev->saved_raid_disk = rdev->raid_disk; > } else > super_types[mddev->major_version]. > - validate_super(mddev, rdev); > + validate_super(mddev, NULL/*freshest*/, rdev); > if ((info->state & (1<<MD_DISK_SYNC)) && > rdev->raid_disk != info->raid_disk) { > /* This was a hot-add request, but events doesn't > -- > 1.9.1