On Thu, Nov 7, 2024 at 3:55 PM Yu Kuai <yukuai1@xxxxxxxxxxxxxxx> wrote:
>
> Hi!
>
> On 2024/11/06 14:40, Christian Theune wrote:
> > Hi,
> >
> >> On 6. Nov 2024, at 07:35, Yu Kuai <yukuai1@xxxxxxxxxxxxxxx> wrote:
> >>
> >> Hi,
> >>
> >> On 2024/11/05 18:15, Christian Theune wrote:
> >>> Hi,
> >>> after about 2 hours it stalled again. Here's the full blocked
> >>> process dump. (Tell me if this isn't helpful, otherwise I'll keep
> >>> posting it, as it's the only real data I can show.)
> >>
> >> This is bad news :(
> >
> > Yeah. But: the good news is that we aren't eating any data so far … ;)
> >
> >> While reviewing the related code, I came up with a plan to move the
> >> bitmap start/end write ops to the upper layer, making sure each write
> >> IO from the upper layer starts once and ends once. That makes it easy
> >> to keep the calls balanced, and avoiding many calls improves
> >> performance as well.
> >
> > Sounds like a plan!
> >
> >> However, I need a few days to cook up a patch after work.
> >
> > Sure thing! I'll switch off bitmaps for that time - I'm happy we found
> > a workaround so we can take time to resolve it cleanly. :)
>
> I wrote a simple and crude version, please give it a test again.
>
> Thanks,
> Kuai
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index d3a837506a36..5e1a82b79e41 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -8753,6 +8753,30 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
>  }
>  EXPORT_SYMBOL_GPL(md_submit_discard_bio);
>
> +static bool is_raid456(struct mddev *mddev)
> +{
> +        return mddev->pers->level == 4 || mddev->pers->level == 5 ||
> +               mddev->pers->level == 6;
> +}
> +
> +static void bitmap_startwrite(struct mddev *mddev, struct bio *bio)
> +{
> +        if (!is_raid456(mddev) || !mddev->bitmap)
> +                return;
> +
> +        md_bitmap_startwrite(mddev->bitmap, bio_offset(bio), bio_sectors(bio),
> +                             0);
> +}
> +
> +static void bitmap_endwrite(struct mddev *mddev, struct bio *bio, sector_t sectors)
> +{
> +        if (!is_raid456(mddev) || !mddev->bitmap)
> +                return;
> +
> +        md_bitmap_endwrite(mddev->bitmap, bio_offset(bio), sectors,
> +                           bio->bi_status == BLK_STS_OK, 0);
> +}
> +
>  static void md_end_clone_io(struct bio *bio)
>  {
>          struct md_io_clone *md_io_clone = bio->bi_private;
> @@ -8765,6 +8789,7 @@ static void md_end_clone_io(struct bio *bio)
>          if (md_io_clone->start_time)
>                  bio_end_io_acct(orig_bio, md_io_clone->start_time);
>
> +        bitmap_endwrite(mddev, orig_bio, md_io_clone->sectors);
>          bio_put(bio);
>          bio_endio(orig_bio);
>          percpu_ref_put(&mddev->active_io);
> @@ -8778,6 +8803,7 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio)
>                  bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set);
>
>          md_io_clone = container_of(clone, struct md_io_clone, bio_clone);
> +        md_io_clone->sectors = bio_sectors(*bio);
>          md_io_clone->orig_bio = *bio;
>          md_io_clone->mddev = mddev;
>          if (blk_queue_io_stat(bdev->bd_disk->queue))
> @@ -8790,6 +8816,7 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio)
>
>  void md_account_bio(struct mddev *mddev, struct bio **bio)
>  {
> +        bitmap_startwrite(mddev, *bio);
>          percpu_ref_get(&mddev->active_io);
>          md_clone_bio(mddev, bio);
>  }
> @@ -8807,6 +8834,8 @@ void md_free_cloned_bio(struct bio *bio)
>          if (md_io_clone->start_time)
>                  bio_end_io_acct(orig_bio, md_io_clone->start_time);
>
> +        bitmap_endwrite(mddev, orig_bio, md_io_clone->sectors);
> +
>          bio_put(bio);
>          percpu_ref_put(&mddev->active_io);
>  }
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index a0d6827dced9..0c2794230e0a 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -837,6 +837,7 @@ struct md_io_clone {
>          struct mddev *mddev;
>          struct bio *orig_bio;
>          unsigned long start_time;
> +        sector_t sectors;
>          struct bio bio_clone;
>  };
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index c14cf2410365..4f009e32f68a 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -3561,12 +3561,6 @@ static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
>           * is added to a batch, STRIPE_BIT_DELAY cannot be changed
>           * any more.
>           */
> -        set_bit(STRIPE_BITMAP_PENDING, &sh->state);
> -        spin_unlock_irq(&sh->stripe_lock);
> -        md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
> -                             RAID5_STRIPE_SECTORS(conf), 0);
> -        spin_lock_irq(&sh->stripe_lock);
> -        clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
>          if (!sh->batch_head) {
>                  sh->bm_seq = conf->seq_flush+1;
>                  set_bit(STRIPE_BIT_DELAY, &sh->state);
> @@ -3621,7 +3615,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
>          BUG_ON(sh->batch_head);
>          for (i = disks; i--; ) {
>                  struct bio *bi;
> -                int bitmap_end = 0;
>
>                  if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
>                          struct md_rdev *rdev = conf->disks[i].rdev;
> @@ -3646,8 +3639,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
>                  sh->dev[i].towrite = NULL;
>                  sh->overwrite_disks = 0;
>                  spin_unlock_irq(&sh->stripe_lock);
> -                if (bi)
> -                        bitmap_end = 1;
>
>                  log_stripe_write_finished(sh);
>
> @@ -3662,10 +3653,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
>                          bio_io_error(bi);
>                          bi = nextbi;
>                  }
> -                if (bitmap_end)
> -                        md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
> -                                           RAID5_STRIPE_SECTORS(conf), 0, 0);
> -                bitmap_end = 0;
>                  /* and fail all 'written' */
>                  bi = sh->dev[i].written;
>                  sh->dev[i].written = NULL;
> @@ -3674,7 +3661,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
>                          sh->dev[i].page = sh->dev[i].orig_page;
>                  }
>
> -                if (bi) bitmap_end = 1;
>                  while (bi && bi->bi_iter.bi_sector <
>                         sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
>                          struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
> @@ -3708,9 +3694,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
>                          bi = nextbi;
>                  }
>          }
> -        if (bitmap_end)
> -                md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
> -                                   RAID5_STRIPE_SECTORS(conf), 0, 0);
>          /* If we were in the middle of a write the parity block might
>           * still be locked - so just clear all R5_LOCKED flags
>           */
> @@ -4059,10 +4042,6 @@ static void handle_stripe_clean_event(struct r5conf *conf,
>                                          bio_endio(wbi);
>                                          wbi = wbi2;
>                                  }
> -                                md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
> -                                                   RAID5_STRIPE_SECTORS(conf),
> -                                                   !test_bit(STRIPE_DEGRADED, &sh->state),
> -                                                   0);
>                                  if (head_sh->batch_head) {
>                                          sh = list_first_entry(&sh->batch_list,
>                                                                struct stripe_head,
> @@ -5788,13 +5767,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
>                  }
>                  spin_unlock_irq(&sh->stripe_lock);
>                  if (conf->mddev->bitmap) {
> -                        for (d = 0;
> -                             d < conf->raid_disks - conf->max_degraded;
> -                             d++)
> -                                md_bitmap_startwrite(mddev->bitmap,
> -                                                     sh->sector,
> -                                                     RAID5_STRIPE_SECTORS(conf),
> -                                                     0);
>                          sh->bm_seq = conf->seq_flush + 1;
>                          set_bit(STRIPE_BIT_DELAY, &sh->state);
>                  }
>
> >
> > Thanks a lot for your help!
> > Christian

Hi Kuai,

Maybe it's not good to move the bitmap operations from raid5 into md
when the new API is only used by raid5. Also, the bitmap region that
raid5 needs to handle is based on the member disk, so it should be
calculated from the stripe geometry rather than taken from the bio's
address space: the bio's address space covers the whole array.
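To illustrate the mismatch, here is a minimal userspace sketch of the
scaling involved. It ignores the parity rotation and layout handling
that raid5_compute_sector() performs, and the geometry values (a 4-disk
raid5 with 512-sector chunks) are made-up examples, so the numbers are
only approximate:

#include <stdio.h>

int main(void)
{
        /* the bio's position in the array's logical address space */
        unsigned long long array_sector = 1000000;
        unsigned long long chunk_sectors = 512; /* example chunk size */
        int data_disks = 3;                     /* raid_disks - max_degraded */

        unsigned long long chunk  = array_sector / chunk_sectors;
        unsigned long long offset = array_sector % chunk_sectors;
        unsigned long long stripe = chunk / data_disks;

        /* roughly what raid5 passes to the bitmap as sh->sector */
        unsigned long long disk_sector = stripe * chunk_sectors + offset;

        printf("array sector %llu -> member-disk sector ~%llu\n",
               array_sector, disk_sector);
        return 0;
}

A region taken from the bio (array address space) and one taken from
sh->sector (member-disk address space) differ by roughly a factor of
data_disks, so the bits set at startwrite and cleared at endwrite would
not line up.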
We have a customer who reports a similar problem. There is a patch
from David; I have put it in the attachment. @Christian, can you give
the patch a try? It applies cleanly on 6.11-rc6.

Regards
Xiao
Attachment:
md_raid5_one_bitmap_claim_per_stripe_head.patch
Description: Binary data