Hi Song, Thank you for your comments. I will address them. How do you want to proceed? Do you want me to send a formal patch to the list (with proper formatting this time)? As mentioned, I am not able to test this fix on one of the latest kernels, but only on 4.14.99. Thanks, Alex. On Thu, Nov 30, 2023 at 2:38 AM Song Liu <song@xxxxxxxxxx> wrote: > > Hi Alexander, > > Thanks for working on this. > > The patch looks good overall. A few comments below. > > On Wed, Nov 29, 2023 at 4:15 AM Alexander Lyakas <alex.bolshoy@xxxxxxxxx> wrote: > > > [...] > > > > Please kindly note the following: > > - md has many features, which we don't use, such as: "non-persistent > > arrays", "clustered arrays", "autostart", "journal devices", > > "write-behind", "multipath arrays" and others. I cannot say how this > > fix will interoperate with these features. > > - I tested the fix only on superblock version 1.2. I don't know how > > 0.9 superblocks work (because I never used them), so I did not change > > anything in super_90_validate(). > > I think we can leave 0.9 as-is. > > > - I tested the fix on our production kernel 4.14 LTS. I ported the > > changes to the mainline kernel 6.7-rc3, but I do not have the ability > > at the moment to test it on this kernel. > [...] > > drivers/md/md.c | 53 +++++++++++++++++++++++++++++++++++++++++++---------- > > 1 file changed, 43 insertions(+), 10 deletions(-) > > > > diff --git a/drivers/md/md.c b/drivers/md/md.c > > index c94373d..e490275 100644 > > --- a/drivers/md/md.c > > +++ b/drivers/md/md.c > > @@ -1195,6 +1195,7 @@ struct super_type { > > struct md_rdev *refdev, > > int minor_version); > > int (*validate_super)(struct mddev *mddev, > > + struct md_rdev *freshest, > > struct md_rdev *rdev); > > void (*sync_super)(struct mddev *mddev, > > struct md_rdev *rdev); > > @@ -1333,7 +1334,7 @@ static int super_90_load(struct md_rdev *rdev, > > struct md_rdev *refdev, int minor > > /* > > * validate_super for 0.90.0 > > */ > > -static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) > > +static int super_90_validate(struct mddev *mddev, struct md_rdev > > *freshest, struct md_rdev *rdev) > > { > > mdp_disk_t *desc; > > mdp_super_t *sb = page_address(rdev->sb_page); > > @@ -1845,7 +1846,7 @@ static int super_1_load(struct md_rdev *rdev, > > struct md_rdev *refdev, int minor_ > > return ret; > > } > > > > -static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) > > +static int super_1_validate(struct mddev *mddev, struct md_rdev > > *freshest, struct md_rdev *rdev) > > { > > struct mdp_superblock_1 *sb = page_address(rdev->sb_page); > > __u64 ev1 = le64_to_cpu(sb->events); > > @@ -1941,13 +1942,15 @@ static int super_1_validate(struct mddev > > *mddev, struct md_rdev *rdev) > > } > > } else if (mddev->pers == NULL) { > > /* Insist of good event counter while assembling, except for > > - * spares (which don't need an event count) */ > > - ++ev1; > > + * spares (which don't need an event count). > > + * Similar to mdadm, we allow event counter difference of 1 > > + * from the freshest device. > > + */ > > if (rdev->desc_nr >= 0 && > > rdev->desc_nr < le32_to_cpu(sb->max_dev) && > > (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < > > MD_DISK_ROLE_MAX || > > le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == > > MD_DISK_ROLE_JOURNAL)) > > - if (ev1 < mddev->events) > > + if (ev1 + 1 < mddev->events) > > return -EINVAL; > > } else if (mddev->bitmap) { > > /* If adding to array with a bitmap, then we can accept an > > @@ -1968,8 +1971,38 @@ static int super_1_validate(struct mddev > > *mddev, struct md_rdev *rdev) > > rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { > > role = MD_DISK_ROLE_SPARE; > > rdev->desc_nr = -1; > > - } else > > + } else if (mddev->pers == NULL && freshest != NULL && > > ev1 < mddev->events) { > > + /* > > + * If we are assembling, and our event counter > > is smaller than the > > + * highest event counter, we cannot trust our > > superblock about the role. > > + * It could happen that our rdev was marked as > > Faulty, and all other > > + * superblocks were updated with +1 event counter. > > + * Then, before the next superblock update, > > which typically happens when > > + * remove_and_add_spares() removes the device > > from the array, there was > > + * a crash or reboot. > > + * If we allow current rdev without consulting > > the freshest superblock, > > + * we could cause data corruption. > > + * Note that in this case our event counter is > > smaller by 1 than the > > + * highest, otherwise, this rdev would not be > > allowed into array; > > + * both kernel and mdadm allow event counter > > difference of 1. > > + */ > > + struct mdp_superblock_1 *freshest_sb = > > page_address(freshest->sb_page); > > + u32 freshest_max_dev = > > le32_to_cpu(freshest_sb->max_dev); > > + > > + if (rdev->desc_nr >= freshest_max_dev) { > > + /* this is unexpected, better not proceed */ > > + pr_warn("md: %s: rdev[%pg]: > > desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n", > > + mdname(mddev), > > rdev->bdev, rdev->desc_nr, freshest->bdev, > > + freshest_max_dev); > > + return -EUCLEAN; > > + } > > + > > + role = > > le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]); > > + pr_warn("md: %s: rdev[%pg]: role=%d(0x%x) > > according to freshest %pg\n", > > + mdname(mddev), rdev->bdev, role, > > role, freshest->bdev); > > I think this should be a pr_debug(). Or maybe only warn when > freshest and current rdev disagree. > > > + } else { > > role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); > > + } > > switch(role) { > > case MD_DISK_ROLE_SPARE: /* spare */ > > break; > > @@ -2876,7 +2909,7 @@ static int add_bound_rdev(struct md_rdev *rdev) > > * and should be added immediately. > > */ > > super_types[mddev->major_version]. > > - validate_super(mddev, rdev); > > + validate_super(mddev, NULL/*freshest*/, rdev); > > err = mddev->pers->hot_add_disk(mddev, rdev); > > if (err) { > > md_kick_rdev_from_array(rdev); > > @@ -3813,7 +3846,7 @@ static int analyze_sbs(struct mddev *mddev) > > } > > > > super_types[mddev->major_version]. > > - validate_super(mddev, freshest); > > + validate_super(mddev, NULL/*freshest*/, freshest); > > > > i = 0; > > rdev_for_each_safe(rdev, tmp, mddev) { > > @@ -3828,7 +3861,7 @@ static int analyze_sbs(struct mddev *mddev) > > } > > if (rdev != freshest) { > > if (super_types[mddev->major_version]. > > - validate_super(mddev, rdev)) { > > + validate_super(mddev, freshest, rdev)) { > > pr_warn("md: kicking non-fresh %pg > > from array!\n", > > rdev->bdev); > > md_kick_rdev_from_array(rdev); > > @@ -6836,7 +6869,7 @@ int md_add_new_disk(struct mddev *mddev, struct > > mdu_disk_info_s *info) > > rdev->saved_raid_disk = rdev->raid_disk; > > } else > > super_types[mddev->major_version]. > > - validate_super(mddev, rdev); > > + validate_super(mddev, NULL/*freshest*/, rdev); > > if ((info->state & (1<<MD_DISK_SYNC)) && > > rdev->raid_disk != info->raid_disk) { > > /* This was a hot-add request, but events doesn't > > -- > > 1.9.1