Re: [PATCH] md/raid1: don't clear bitmap bits on interrupted recovery.

Song Liu <liu.song.a23@xxxxxxxxx> · Fri, 8 Feb 2019 09:17:27 -0800

On Thu, Feb 7, 2019 at 11:26 AM Nate Dailey <nate.dailey@xxxxxxxxxxx> wrote:
>
> sync_request_write no longer submits writes to a Faulty device. This has
> the unfortunate side effect that bitmap bits can be incorrectly cleared
> if a recovery is interrupted (previously, end_sync_write would have
> prevented this). This means the next recovery may not copy everything
> it should, potentially corrupting data.

Hi Nate,

Which commit introduces this bug? Could you please add a "Fixes" line
pointing to that commit?

Thanks,
Song

>
> Add a function for doing the proper md_bitmap_end_sync, called from
> end_sync_write and the Faulty case in sync_request_write.
>
> Signed-off-by: Nate Dailey <nate.dailey@xxxxxxxxxxx>
> ---
>  drivers/md/raid1.c | 28 ++++++++++++++++++----------
>  1 file changed, 18 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index 1d54109071cc..fa47249fa3e4 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -1863,6 +1863,20 @@ static void end_sync_read(struct bio *bio)
>                 reschedule_retry(r1_bio);
>  }
>
> +static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio)
> +{
> +       sector_t sync_blocks = 0;
> +       sector_t s = r1_bio->sector;
> +       long sectors_to_go = r1_bio->sectors;
> +
> +       /* make sure these bits don't get cleared. */
> +       do {
> +               md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1);
> +               s += sync_blocks;
> +               sectors_to_go -= sync_blocks;
> +       } while (sectors_to_go > 0);
> +}
> +
>  static void end_sync_write(struct bio *bio)
>  {
>         int uptodate = !bio->bi_status;
> @@ -1874,15 +1888,7 @@ static void end_sync_write(struct bio *bio)
>         struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev;
>
>         if (!uptodate) {
> -               sector_t sync_blocks = 0;
> -               sector_t s = r1_bio->sector;
> -               long sectors_to_go = r1_bio->sectors;
> -               /* make sure these bits doesn't get cleared. */
> -               do {
> -                       md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1);
> -                       s += sync_blocks;
> -                       sectors_to_go -= sync_blocks;
> -               } while (sectors_to_go > 0);
> +               abort_sync_write(mddev, r1_bio);
>                 set_bit(WriteErrorSeen, &rdev->flags);
>                 if (!test_and_set_bit(WantReplacement, &rdev->flags))
>                         set_bit(MD_RECOVERY_NEEDED, &
> @@ -2172,8 +2178,10 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
>                      (i == r1_bio->read_disk ||
>                       !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
>                         continue;
> -               if (test_bit(Faulty, &conf->mirrors[i].rdev->flags))
> +               if (test_bit(Faulty, &conf->mirrors[i].rdev->flags)) {
> +                       abort_sync_write(mddev, r1_bio);
>                         continue;
> +               }
>
>                 bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
>                 if (test_bit(FailFast, &conf->mirrors[i].rdev->flags))
> --
> 2.19.1
>