Re: Experiencing md raid5 hang and CPU lockup on kernel v6.11

On Thu, Nov 7, 2024 at 9:22 PM Yu Kuai <yukuai1@xxxxxxxxxxxxxxx> wrote:
>
> Hi,
>
> > On 2024/11/05 23:34, Haris Iqbal wrote:
> > On Tue, Nov 5, 2024 at 3:04 PM Dragan Milivojević <galileo@xxxxxxxxxxx> wrote:
> >>
> >> On Tue, 5 Nov 2024 at 10:58, Haris Iqbal <haris.iqbal@xxxxxxxxx> wrote:
> >>>
> >>> Hi,
> >>>
> >>> I am running fio over a RDMA block device. The server side of this
> >>> mapping is an md-raid0 device, created over 3 md-raid5 devices.
> >>> The md-raid5 devices each are created over 8 block devices. Below is
> >>> how the raid configuration looks (md400, md300, md301 and md302 are
> >>> relevant for this discussion here).
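
A rough sketch of a comparable layered setup, purely for illustration
(the device and disk names below are assumptions, not taken from the
report above):

    # Three 8-disk raid5 arrays with internal write-intent bitmaps,
    # striped together by a raid0 on top; all names are hypothetical.
    mdadm --create /dev/md300 --level=5 --raid-devices=8 --bitmap=internal /dev/nvme{0..7}n1
    mdadm --create /dev/md301 --level=5 --raid-devices=8 --bitmap=internal /dev/nvme{8..15}n1
    mdadm --create /dev/md302 --level=5 --raid-devices=8 --bitmap=internal /dev/nvme{16..23}n1
    mdadm --create /dev/md400 --level=0 --raid-devices=3 /dev/md300 /dev/md301 /dev/md302
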
> >>
> >> Try disabling the bitmap as a quick "fix" and see if that helps.
> >
> > Yes. Disabling bitmap does seem to prevent the hang completely. I ran
> > fio for 10 minutes and no hang.
> > Triggered the hang in 10 seconds after reverting back to internal bitmap.
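
For reference, this kind of bitmap on/off test can be done with
mdadm --grow (the array name below is an assumption; the same commands
would be run on each raid5 member):

    # Drop the internal write-intent bitmap, re-run the workload,
    # then restore the bitmap afterwards.
    mdadm --grow --bitmap=none /dev/md300
    mdadm --grow --bitmap=internal /dev/md300
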
> >
>
> Can you give the following patch a test? It's based on v6.11.
>
> Thanks,
> Kuai
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index d3a837506a36..5e1a82b79e41 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -8753,6 +8753,30 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
>   }
>   EXPORT_SYMBOL_GPL(md_submit_discard_bio);
>
> +static bool is_raid456(struct mddev *mddev)
> +{
> +       return mddev->pers->level == 4 || mddev->pers->level == 5 ||
> +              mddev->pers->level == 6;
> +}
> +
> +static void bitmap_startwrite(struct mddev *mddev, struct bio *bio)
> +{
> +       if (!is_raid456(mddev) || !mddev->bitmap)
> +               return;
> +
> +       md_bitmap_startwrite(mddev->bitmap, bio_offset(bio), bio_sectors(bio),
> +                            0);
> +}
> +
> +static void bitmap_endwrite(struct mddev *mddev, struct bio *bio, sector_t sectors)
> +{
> +       if (!is_raid456(mddev) || !mddev->bitmap)
> +               return;
> +
> +       md_bitmap_endwrite(mddev->bitmap, bio_offset(bio), sectors,
> +                          bio->bi_status == BLK_STS_OK, 0);
> +}
> +
>   static void md_end_clone_io(struct bio *bio)
>   {
>          struct md_io_clone *md_io_clone = bio->bi_private;
> @@ -8765,6 +8789,7 @@ static void md_end_clone_io(struct bio *bio)
>          if (md_io_clone->start_time)
>                  bio_end_io_acct(orig_bio, md_io_clone->start_time);
>
> +       bitmap_endwrite(mddev, orig_bio, md_io_clone->sectors);
>          bio_put(bio);
>          bio_endio(orig_bio);
>          percpu_ref_put(&mddev->active_io);
> @@ -8778,6 +8803,7 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio)
>                  bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set);
>
>          md_io_clone = container_of(clone, struct md_io_clone, bio_clone);
> +       md_io_clone->sectors = bio_sectors(*bio);
>          md_io_clone->orig_bio = *bio;
>          md_io_clone->mddev = mddev;
>          if (blk_queue_io_stat(bdev->bd_disk->queue))
> @@ -8790,6 +8816,7 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio)
>
>   void md_account_bio(struct mddev *mddev, struct bio **bio)
>   {
> +       bitmap_startwrite(mddev, *bio);
>          percpu_ref_get(&mddev->active_io);
>          md_clone_bio(mddev, bio);
>   }
> @@ -8807,6 +8834,8 @@ void md_free_cloned_bio(struct bio *bio)
>          if (md_io_clone->start_time)
>                  bio_end_io_acct(orig_bio, md_io_clone->start_time);
>
> +       bitmap_endwrite(mddev, orig_bio, md_io_clone->sectors);
> +
>          bio_put(bio);
>          percpu_ref_put(&mddev->active_io);
>   }
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index a0d6827dced9..0c2794230e0a 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -837,6 +837,7 @@ struct md_io_clone {
>          struct mddev    *mddev;
>          struct bio      *orig_bio;
>          unsigned long   start_time;
> +       sector_t        sectors;
>          struct bio      bio_clone;
>   };
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index c14cf2410365..4f009e32f68a 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -3561,12 +3561,6 @@ static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
>                   * is added to a batch, STRIPE_BIT_DELAY cannot be changed
>                   * any more.
>                   */
> -               set_bit(STRIPE_BITMAP_PENDING, &sh->state);
> -               spin_unlock_irq(&sh->stripe_lock);
> -               md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
> -                                    RAID5_STRIPE_SECTORS(conf), 0);
> -               spin_lock_irq(&sh->stripe_lock);
> -               clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
>                  if (!sh->batch_head) {
>                          sh->bm_seq = conf->seq_flush+1;
>                          set_bit(STRIPE_BIT_DELAY, &sh->state);
> @@ -3621,7 +3615,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
>          BUG_ON(sh->batch_head);
>          for (i = disks; i--; ) {
>                  struct bio *bi;
> -               int bitmap_end = 0;
>
>                  if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
>                          struct md_rdev *rdev = conf->disks[i].rdev;
> @@ -3646,8 +3639,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
>                  sh->dev[i].towrite = NULL;
>                  sh->overwrite_disks = 0;
>                  spin_unlock_irq(&sh->stripe_lock);
> -               if (bi)
> -                       bitmap_end = 1;
>
>                  log_stripe_write_finished(sh);
> @@ -3662,10 +3653,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
>                          bio_io_error(bi);
>                          bi = nextbi;
>                  }
> -               if (bitmap_end)
> -                       md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
> -                                          RAID5_STRIPE_SECTORS(conf), 0, 0);
> -               bitmap_end = 0;
>                  /* and fail all 'written' */
>                  bi = sh->dev[i].written;
>                  sh->dev[i].written = NULL;
> @@ -3674,7 +3661,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
>                          sh->dev[i].page = sh->dev[i].orig_page;
>                  }
>
> -               if (bi) bitmap_end = 1;
>                  while (bi && bi->bi_iter.bi_sector <
>                         sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
>                          struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
> @@ -3708,9 +3694,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
>                                  bi = nextbi;
>                          }
>                  }
> -               if (bitmap_end)
> -                       md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
> -                                          RAID5_STRIPE_SECTORS(conf), 0, 0);
>                  /* If we were in the middle of a write the parity block might
>                   * still be locked - so just clear all R5_LOCKED flags
>                   */
> @@ -4059,10 +4042,6 @@ static void handle_stripe_clean_event(struct r5conf *conf,
>                                          bio_endio(wbi);
>                                          wbi = wbi2;
>                                  }
> -                               md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
> -                                                  RAID5_STRIPE_SECTORS(conf),
> -                                                  !test_bit(STRIPE_DEGRADED, &sh->state),
> -                                                  0);
>                                  if (head_sh->batch_head) {
>                                          sh = list_first_entry(&sh->batch_list,
>                                                                struct stripe_head,
> @@ -5788,13 +5767,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
>                  }
>                  spin_unlock_irq(&sh->stripe_lock);
>                  if (conf->mddev->bitmap) {
> -                       for (d = 0;
> -                            d < conf->raid_disks - conf->max_degraded;
> -                            d++)
> -                               md_bitmap_startwrite(mddev->bitmap,
> -                                                    sh->sector,
> -                                                    RAID5_STRIPE_SECTORS(conf),
> -                                                    0);
>                          sh->bm_seq = conf->seq_flush + 1;
>                          set_bit(STRIPE_BIT_DELAY, &sh->state);
>                  }
> > .
> >
>
>

Hi all

I just replied to another raid5 hang report, and this one looks similar.
We have a customer who hit the same kind of problem, and David has a
patch that works for that case; I have attached it here. Could you give
it a try as well? The patch applies cleanly on 6.11-rc6.
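
In case it helps, applying and testing the attached patch would
typically look like the following (the build and install steps are
generic assumptions, not instructions from this thread):

    # Hypothetical workflow: apply the attached patch on top of
    # v6.11-rc6, rebuild, then re-run the fio workload with the
    # internal bitmap enabled.
    git checkout v6.11-rc6
    git apply md_raid5_one_bitmap_claim_per_stripe_head.patch
    make olddefconfig && make -j"$(nproc)"
    make modules_install && make install
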

Regards
Xiao

Attachment: md_raid5_one_bitmap_claim_per_stripe_head.patch
Description: Binary data

