On Thu, Nov 09, 2017 at 06:55:57PM +1100, Neil Brown wrote: > On Wed, Nov 08 2017, Shaohua Li wrote: > > > On Thu, Nov 02, 2017 at 09:40:50AM +0100, Tomasz Majchrzak wrote: > >> On Wed, Nov 01, 2017 at 12:13:22PM +1100, NeilBrown wrote: > >> > On Tue, Oct 31 2017, Tomasz Majchrzak wrote: > >> > > >> > > On Tue, Oct 17, 2017 at 04:18:36PM +1100, NeilBrown wrote: > >> > >> The ->recovery_offset shows how much of a non-InSync device is actually > >> > >> in sync - how much has been recoveryed. > >> > >> > >> > >> When performing a recovery, ->curr_resync and ->curr_resync_completed > >> > >> follow the device address being recovered and so can be used to update > >> > >> ->recovery_offset. > >> > >> > >> > >> When performing a reshape, ->curr_resync* might follow the device > >> > >> addresses (raid5) or might follow array addresses (raid10), so cannot > >> > >> in general be used to set ->recovery_offset. When reshaping backwards, > >> > >> ->curre_resync* measures from the *end* of the array-or-device, so is > >> > >> particularly unhelpful. > >> > >> > >> > >> So change the common code in md.c to only use ->curr_resync_complete > >> > >> for the simple recovery case, and add code to raid5.c to update > >> > >> ->recovery_offset during a forwards reshape. > >> > >> > >> > >> Signed-off-by: NeilBrown <neilb@xxxxxxxx> > >> > >> --- > >> > >> drivers/md/md.c | 32 +++++++++++++++++++++----------- > >> > >> drivers/md/raid5.c | 18 ++++++++++++++++++ > >> > >> 2 files changed, 39 insertions(+), 11 deletions(-) > >> > >> > >> > >> diff --git a/drivers/md/md.c b/drivers/md/md.c > >> > >> index a9f1352b3849..d1dfc9879368 100644 > >> > >> --- a/drivers/md/md.c > >> > >> +++ b/drivers/md/md.c > >> > >> @@ -2453,10 +2453,17 @@ void md_update_sb(struct mddev *mddev, int force_change) > >> > >> } > >> > >> } > >> > >> > >> > >> - /* First make sure individual recovery_offsets are correct */ > >> > >> + /* First make sure individual recovery_offsets are correct > >> > >> + * curr_resync_completed can only be used during recovery. > >> > >> + * During reshape/resync it might use array-addresses rather > >> > >> + * that device addresses. > >> > >> + */ > >> > >> rdev_for_each(rdev, mddev) { > >> > >> if (rdev->raid_disk >= 0 && > >> > >> mddev->delta_disks >= 0 && > >> > >> + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && > >> > >> + test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && > >> > >> + !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && > >> > >> !test_bit(Journal, &rdev->flags) && > >> > >> !test_bit(In_sync, &rdev->flags) && > >> > >> mddev->curr_resync_completed > rdev->recovery_offset) > >> > >> @@ -8507,16 +8514,19 @@ void md_do_sync(struct md_thread *thread) > >> > >> } else { > >> > >> if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) > >> > >> mddev->curr_resync = MaxSector; > >> > >> - rcu_read_lock(); > >> > >> - rdev_for_each_rcu(rdev, mddev) > >> > >> - if (rdev->raid_disk >= 0 && > >> > >> - mddev->delta_disks >= 0 && > >> > >> - !test_bit(Journal, &rdev->flags) && > >> > >> - !test_bit(Faulty, &rdev->flags) && > >> > >> - !test_bit(In_sync, &rdev->flags) && > >> > >> - rdev->recovery_offset < mddev->curr_resync) > >> > >> - rdev->recovery_offset = mddev->curr_resync; > >> > >> - rcu_read_unlock(); > >> > >> + if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && > >> > >> + test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { > >> > >> + rcu_read_lock(); > >> > >> + rdev_for_each_rcu(rdev, mddev) > >> > >> + if (rdev->raid_disk >= 0 && > >> > >> + mddev->delta_disks >= 0 && > >> > >> + !test_bit(Journal, &rdev->flags) && > >> > >> + !test_bit(Faulty, &rdev->flags) && > >> > >> + !test_bit(In_sync, &rdev->flags) && > >> > >> + rdev->recovery_offset < mddev->curr_resync) > >> > >> + rdev->recovery_offset = mddev->curr_resync; > >> > >> + rcu_read_unlock(); > >> > >> + } > >> > >> } > >> > >> } > >> > >> skip: > >> > >> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c > >> > >> index a8df52130f8a..89ad79614309 100644 > >> > >> --- a/drivers/md/raid5.c > >> > >> +++ b/drivers/md/raid5.c > >> > >> @@ -5739,6 +5739,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk > >> > >> */ > >> > >> struct r5conf *conf = mddev->private; > >> > >> struct stripe_head *sh; > >> > >> + struct md_rdev *rdev; > >> > >> sector_t first_sector, last_sector; > >> > >> int raid_disks = conf->previous_raid_disks; > >> > >> int data_disks = raid_disks - conf->max_degraded; > >> > >> @@ -5861,6 +5862,15 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk > >> > >> return 0; > >> > >> mddev->reshape_position = conf->reshape_progress; > >> > >> mddev->curr_resync_completed = sector_nr; > >> > >> + if (!mddev->reshape_backwards) > >> > >> + /* Can update recovery_offset */ > >> > >> + rdev_for_each(rdev, mddev) > >> > >> + if (rdev->raid_disk >= 0 && > >> > >> + !test_bit(Journal, &rdev->flags) && > >> > >> + !test_bit(In_sync, &rdev->flags) && > >> > >> + rdev->recovery_offset < sector_nr) > >> > >> + rdev->recovery_offset = sector_nr; > >> > >> + > >> > >> conf->reshape_checkpoint = jiffies; > >> > >> set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); > >> > >> md_wakeup_thread(mddev->thread); > >> > >> @@ -5959,6 +5969,14 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk > >> > >> goto ret; > >> > >> mddev->reshape_position = conf->reshape_progress; > >> > >> mddev->curr_resync_completed = sector_nr; > >> > >> + if (!mddev->reshape_backwards) > >> > >> + /* Can update recovery_offset */ > >> > >> + rdev_for_each(rdev, mddev) > >> > >> + if (rdev->raid_disk >= 0 && > >> > >> + !test_bit(Journal, &rdev->flags) && > >> > >> + !test_bit(In_sync, &rdev->flags) && > >> > >> + rdev->recovery_offset < sector_nr) > >> > >> + rdev->recovery_offset = sector_nr; > >> > >> conf->reshape_checkpoint = jiffies; > >> > >> set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); > >> > >> md_wakeup_thread(mddev->thread); > >> > >> > >> > >> > >> > >> -- > >> > > > >> > > Hi, > >> > > > >> > > This patch has caused a regression in following IMSM RAID scenario: > >> > > >> > That's unfortunate. > >> > > >> > > > >> > > mdadm --create /dev/md/imsm0 --metadata=imsm --raid-devices=4 /dev/nvme[0-3]n1 --run > >> > > mdadm --create /dev/md/r10 --level=10 --size=100M --raid-devices=4 /dev/nvme[0-3]n1 --run --assume-clean > >> > > > >> > > MDADM_EXPERIMENTAL=1 mdadm --grow /dev/md/r10 --level=0 > >> > > MDADM_EXPERIMENTAL=1 mdadm --grow /dev/md/imsm0 --raid-devices=4 > >> > > > >> > > cat /proc/mdstat > >> > > Personalities : [raid10] [raid0] [raid6] [raid5] [raid4] > >> > > md126 : active raid4 nvme0n1[3] nvme2n1[2] nvme1n1[1] > >> > > 409600 blocks super external:/md127/0 level 4, 128k chunk, algorithm 0 [5/3] [UU_U_] > >> > > > >> > > md127 : inactive nvme3n1[3](S) nvme2n1[2](S) nvme1n1[1](S) nvme0n1[0](S) > >> > > 4420 blocks super external:imsm > >> > > >> > Why isn't nvme3n1 in md126? Did it not get added, or did it get removed > >> > for some reason? > >> > If it wasn't added, raid5_start_reshape() would have failed, so I guess > >> > it must have been removed?? > >> > >> It got removed. It's in the array during the reshape: > >> > >> cat /proc/mdstat > >> Personalities : [raid10] [raid0] [raid6] [raid5] [raid4] [raid1] > >> md126 : active raid4 nvme3n1[4] nvme0n1[3] nvme2n1[2] nvme1n1[1] > >> 4194304 blocks super external:-md127/0 level 4, 128k chunk, algorithm 5 [5/4] [UU_U_] > >> [===>.................] reshape = 16.6% (350476/2097152) finish=0.1min speed=175238K/sec > >> > >> md127 : inactive nvme3n1[3](S) nvme2n1[2](S) nvme1n1[1](S) nvme0n1[0](S) > >> 4420 blocks super external:imsm > >> > >> > > >> > > > >> > > The array should have been reshaped to 4-disk RAID0 but it has failed in > >> > > the middle of reshape as RAID4. > >> > > > >> > > Following condition is not true for above scenario: > >> > > > >> > >> + if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && > >> > >> + test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { > >> > > >> > No, it isn't meant to be. This code doesn't work reliably in the > >> > reshape case. When reshaping raid5 (or raid4), reshape_request updates > >> > rdev->recovery_offset. ... maybe end_reshape needs to update it too. > >> > > >> > If you add this patch, does it help? > >> > > >> > > >> > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c > >> > index 89ad79614309..a99ae1be9175 100644 > >> > --- a/drivers/md/raid5.c > >> > +++ b/drivers/md/raid5.c > >> > @@ -7964,6 +7964,7 @@ static void end_reshape(struct r5conf *conf) > >> > { > >> > > >> > if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { > >> > + struct md_rdev *rdev; > >> > > >> > spin_lock_irq(&conf->device_lock); > >> > conf->previous_raid_disks = conf->raid_disks; > >> > @@ -7971,6 +7972,12 @@ static void end_reshape(struct r5conf *conf) > >> > smp_wmb(); > >> > conf->reshape_progress = MaxSector; > >> > conf->mddev->reshape_position = MaxSector; > >> > + rdev_for_each(rdev, conf->mddev) > >> > + if (rdev->raid_disk >= 0 && > >> > + !test_bit(Journal, &rdev->flags) && > >> > + !test_bit(In_sync, &rdev->flags)) > >> > + rdev->recovery_offset = MaxSector; > >> > + > >> > spin_unlock_irq(&conf->device_lock); > >> > wake_up(&conf->wait_for_overlap); > >> > > >> > >> Yes, this patch fixes the problem. I have also checked that reshape position > >> is really stored and reshape interrupted in the middle restarts from the > >> last checkpoint. > > > > Neil, > > can you send me a formal patch for this? or I can just fold this into your > > original patch. > > If you can fold it in, that would be great. Otherwise it may be a while > before I get to it. Ok, done. -- To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html