Hi Song, On Fri, Dec 8, 2023 at 7:28 AM Song Liu <song@xxxxxxxxxx> wrote: > > Hi Alex, > > On Mon, Dec 4, 2023 at 9:00 AM Alex Lyakas <alex.lyakas@xxxxxxxxxx> wrote: > > > > From: Alex Lyakas <alex@xxxxxxxxxx> > > > > Upon assembling the array, both kernel and mdadm allow the devices to have event > > counter difference of 1, and still consider them as up-to-date. > > However, a device whose event count is behind by 1, may in fact not be up-to-date, > > and array resync with such a device may cause data corruption. > > To avoid this, consult the superblock of the freshest device about the status > > of a device, whose event counter is behind by 1. > > > > Signed-off-by: Alex Lyakas <alex.lyakas@xxxxxxxxxx> > > You are using two different emails for "From:" and "Signed-off-by", which one > would you prefer? > > Please run ./scripts/checkpatch.pl on the .patch file before sending them. It > would help catch issues like code format and email mismatch > > > > > Disclaimer: I was not able to test this change on one of the latest 6.7 kernels. > > I tested it on kernel 4.14 LTS and then ported the changes. > > > > To test this change, I modified the code of remove_and_add_spares(): > > diff --git a/drivers/md/md.c b/drivers/md/md.c > > index ad68b5e..f57854e 100644 > > --- a/drivers/md/md.c > > +++ b/drivers/md/md.c > > This part confuses b4 (patch handling script used by many). It looks like two > patches bundled together. > > > @@ -9341,6 +9341,7 @@ static int remove_and_add_spares(struct mddev *mddev, > > } > > } > > no_add: > > + removed = 0; > > if (removed) > > set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); > > return spares; > > > > With this change, when a device fails, the superblock of all other devices > > is only updated once. During test, I sumulated a failure of one of the devices, > > and then rebooted the machine. After reboot, I re-assembled the array > > with all devices, including the device that I failed. > > Event counter difference between the failed device and the other devices > > was 1, and then with my change the role of the problematic device was taken > > from the superblock of one of the higher devices, which indicated > > the role to be MD_DISK_ROLE_FAULTY. After array assembly completed, > > remove_and_add_spares() ejected the problematic disk from the array, > > as expected. > > --- > > drivers/md/md.c | 53 +++++++++++++++++++++++++++++++++++++++++++---------- > > 1 file changed, 43 insertions(+), 10 deletions(-) > > > > diff --git a/drivers/md/md.c b/drivers/md/md.c > > index c94373d..ad68b5e 100644 > > --- a/drivers/md/md.c > > +++ b/drivers/md/md.c > > @@ -1195,6 +1195,7 @@ struct super_type { > > struct md_rdev *refdev, > > int minor_version); > > int (*validate_super)(struct mddev *mddev, > > + struct md_rdev *freshest, > > struct md_rdev *rdev); > > void (*sync_super)(struct mddev *mddev, > > struct md_rdev *rdev); > > @@ -1333,7 +1334,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor > > /* > > * validate_super for 0.90.0 > > */ > > -static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) > > +static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) > > Please add a comment saying we are not using "freshest for 0.9 superblock. > > > { > > mdp_disk_t *desc; > > mdp_super_t *sb = page_address(rdev->sb_page); > > @@ -1845,7 +1846,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ > > return ret; > > } > > > > -static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) > > +static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) > > { > > struct mdp_superblock_1 *sb = page_address(rdev->sb_page); > > __u64 ev1 = le64_to_cpu(sb->events); > > @@ -1941,13 +1942,15 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) > > } > > } else if (mddev->pers == NULL) { > > /* Insist of good event counter while assembling, except for > > - * spares (which don't need an event count) */ > > - ++ev1; > > + * spares (which don't need an event count). > > + * Similar to mdadm, we allow event counter difference of 1 > > + * from the freshest device. > > + */ > > if (rdev->desc_nr >= 0 && > > rdev->desc_nr < le32_to_cpu(sb->max_dev) && > > (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || > > le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) > > - if (ev1 < mddev->events) > > + if (ev1 + 1 < mddev->events) > > return -EINVAL; > > } else if (mddev->bitmap) { > > /* If adding to array with a bitmap, then we can accept an > > @@ -1968,8 +1971,38 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) > > rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { > > role = MD_DISK_ROLE_SPARE; > > rdev->desc_nr = -1; > > - } else > > + } else if (mddev->pers == NULL && freshest != NULL && ev1 < mddev->events) { > > + /* > > + * If we are assembling, and our event counter is smaller than the > > + * highest event counter, we cannot trust our superblock about the role. > > + * It could happen that our rdev was marked as Faulty, and all other > > + * superblocks were updated with +1 event counter. > > + * Then, before the next superblock update, which typically happens when > > + * remove_and_add_spares() removes the device from the array, there was > > + * a crash or reboot. > > + * If we allow current rdev without consulting the freshest superblock, > > + * we could cause data corruption. > > + * Note that in this case our event counter is smaller by 1 than the > > + * highest, otherwise, this rdev would not be allowed into array; > > + * both kernel and mdadm allow event counter difference of 1. > > + */ > > + struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page); > > + u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev); > > + > > + if (rdev->desc_nr >= freshest_max_dev) { > > + /* this is unexpected, better not proceed */ > > + pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n", > > + mdname(mddev), rdev->bdev, rdev->desc_nr, freshest->bdev, > > There is probably some format issue here. checkpatch.pl should also > warn about it. > > Please address the feedback and send v2 of the patch. Thank you for the comments. I addressed them and sent out a v2. Thanks, Alex. > > Thanks, > Song > > > > + freshest_max_dev); > > + return -EUCLEAN; > > + } > > + > > + role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]); > > + pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n", > > + mdname(mddev), rdev->bdev, role, role, freshest->bdev); > > + } else { > > role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); > > + } > > switch(role) { > > case MD_DISK_ROLE_SPARE: /* spare */ > > break; > > @@ -2876,7 +2909,7 @@ static int add_bound_rdev(struct md_rdev *rdev) > > * and should be added immediately. > > */ > > super_types[mddev->major_version]. > > - validate_super(mddev, rdev); > > + validate_super(mddev, NULL/*freshest*/, rdev); > > err = mddev->pers->hot_add_disk(mddev, rdev); > > if (err) { > > md_kick_rdev_from_array(rdev); > > @@ -3813,7 +3846,7 @@ static int analyze_sbs(struct mddev *mddev) > > } > > > > super_types[mddev->major_version]. > > - validate_super(mddev, freshest); > > + validate_super(mddev, NULL/*freshest*/, freshest); > > > > i = 0; > > rdev_for_each_safe(rdev, tmp, mddev) { > > @@ -3828,7 +3861,7 @@ static int analyze_sbs(struct mddev *mddev) > > } > > if (rdev != freshest) { > > if (super_types[mddev->major_version]. > > - validate_super(mddev, rdev)) { > > + validate_super(mddev, freshest, rdev)) { > > pr_warn("md: kicking non-fresh %pg from array!\n", > > rdev->bdev); > > md_kick_rdev_from_array(rdev); > > @@ -6836,7 +6869,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) > > rdev->saved_raid_disk = rdev->raid_disk; > > } else > > super_types[mddev->major_version]. > > - validate_super(mddev, rdev); > > + validate_super(mddev, NULL/*freshest*/, rdev); > > if ((info->state & (1<<MD_DISK_SYNC)) && > > rdev->raid_disk != info->raid_disk) { > > /* This was a hot-add request, but events doesn't > > -- > > 1.9.1 > >