Re: Experiencing md raid5 hang and CPU lockup on kernel v6.11

Yu Kuai <yukuai1@xxxxxxxxxxxxxxx> · Thu, 7 Nov 2024 21:21:51 +0800

Hi,

在 2024/11/05 23:34, Haris Iqbal 写道:
On Tue, Nov 5, 2024 at 3:04 PM Dragan Milivojević <galileo@xxxxxxxxxxx> wrote:

On Tue, 5 Nov 2024 at 10:58, Haris Iqbal <haris.iqbal@xxxxxxxxx> wrote:

Hi,

I am running fio over a RDMA block device. The server side of this
mapping is an md-raid0 device, created over 3 md-raid5 devices.
The md-raid5 devices each are created over 8 block devices. Below is
how the raid configuration looks (md400, md300, md301 and md302 are
relevant for this discussion here).

Try disabling the bitmap as a quick "fix" and see if that helps.

Yes. Disabling bitmap does seem to prevent the hang completely. I ran
fio for 10 minutes and no hang.
Triggered the hang in 10 seconds after reverting back to internal bitmap.


Can you give the following patch a test? It's based on v6.11.

Thanks,
Kuai

diff --git a/drivers/md/md.c b/drivers/md/md.c
index d3a837506a36..5e1a82b79e41 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8753,6 +8753,30 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
 }
 EXPORT_SYMBOL_GPL(md_submit_discard_bio);

+static bool is_raid456(struct mddev *mddev)
+{
+       return mddev->pers->level == 4 || mddev->pers->level == 5 ||
+              mddev->pers->level == 6;
+}
+
+static void bitmap_startwrite(struct mddev *mddev, struct bio *bio)
+{
+       if (!is_raid456(mddev) || !mddev->bitmap)
+               return;
+
+       md_bitmap_startwrite(mddev->bitmap, bio_offset(bio), bio_sectors(bio),
+                            0);
+}
+
+static void bitmap_endwrite(struct mddev *mddev, struct bio *bio, sector_t sectors)
+{
+       if (!is_raid456(mddev) || !mddev->bitmap)
+               return;
+
+       md_bitmap_endwrite(mddev->bitmap, bio_offset(bio), sectors,
+                          bio->bi_status == BLK_STS_OK, 0);
+}
+
 static void md_end_clone_io(struct bio *bio)
 {
        struct md_io_clone *md_io_clone = bio->bi_private;
@@ -8765,6 +8789,7 @@ static void md_end_clone_io(struct bio *bio)
        if (md_io_clone->start_time)
                bio_end_io_acct(orig_bio, md_io_clone->start_time);

+       bitmap_endwrite(mddev, orig_bio, md_io_clone->sectors);
        bio_put(bio);
        bio_endio(orig_bio);
        percpu_ref_put(&mddev->active_io);
@@ -8778,6 +8803,7 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio)
                bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set);

        md_io_clone = container_of(clone, struct md_io_clone, bio_clone);
+       md_io_clone->sectors = bio_sectors(*bio);
        md_io_clone->orig_bio = *bio;
        md_io_clone->mddev = mddev;
        if (blk_queue_io_stat(bdev->bd_disk->queue))
@@ -8790,6 +8816,7 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio)

 void md_account_bio(struct mddev *mddev, struct bio **bio)
 {
+       bitmap_startwrite(mddev, *bio);
        percpu_ref_get(&mddev->active_io);
        md_clone_bio(mddev, bio);
 }
@@ -8807,6 +8834,8 @@ void md_free_cloned_bio(struct bio *bio)
        if (md_io_clone->start_time)
                bio_end_io_acct(orig_bio, md_io_clone->start_time);

+       bitmap_endwrite(mddev, orig_bio, md_io_clone->sectors);
+
        bio_put(bio);
        percpu_ref_put(&mddev->active_io);
 }
diff --git a/drivers/md/md.h b/drivers/md/md.h
index a0d6827dced9..0c2794230e0a 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -837,6 +837,7 @@ struct md_io_clone {
        struct mddev    *mddev;
        struct bio      *orig_bio;
        unsigned long   start_time;
+       sector_t        sectors;
        struct bio      bio_clone;
 };
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c14cf2410365..4f009e32f68a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3561,12 +3561,6 @@ static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
                 * is added to a batch, STRIPE_BIT_DELAY cannot be changed
                 * any more.
                 */
-               set_bit(STRIPE_BITMAP_PENDING, &sh->state);
-               spin_unlock_irq(&sh->stripe_lock);
-               md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
-                                    RAID5_STRIPE_SECTORS(conf), 0);
-               spin_lock_irq(&sh->stripe_lock);
-               clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
                if (!sh->batch_head) {
                        sh->bm_seq = conf->seq_flush+1;
                        set_bit(STRIPE_BIT_DELAY, &sh->state);
@@ -3621,7 +3615,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
        BUG_ON(sh->batch_head);
        for (i = disks; i--; ) {
                struct bio *bi;
-               int bitmap_end = 0;

                if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
                        struct md_rdev *rdev = conf->disks[i].rdev;
@@ -3646,8 +3639,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
                sh->dev[i].towrite = NULL;
                sh->overwrite_disks = 0;
                spin_unlock_irq(&sh->stripe_lock);
-               if (bi)
-                       bitmap_end = 1;

                log_stripe_write_finished(sh);
@@ -3662,10 +3653,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
                        bio_io_error(bi);
                        bi = nextbi;
                }
-               if (bitmap_end)
-                       md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
-                                          RAID5_STRIPE_SECTORS(conf), 0, 0);
-               bitmap_end = 0;
                /* and fail all 'written' */
                bi = sh->dev[i].written;
                sh->dev[i].written = NULL;
@@ -3674,7 +3661,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
                        sh->dev[i].page = sh->dev[i].orig_page;
                }

-               if (bi) bitmap_end = 1;
                while (bi && bi->bi_iter.bi_sector <
                       sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
                        struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
@@ -3708,9 +3694,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
                                bi = nextbi;
                        }
                }
-               if (bitmap_end)
-                       md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
-                                          RAID5_STRIPE_SECTORS(conf), 0, 0);
                /* If we were in the middle of a write the parity block might
                 * still be locked - so just clear all R5_LOCKED flags
                 */
@@ -4059,10 +4042,6 @@ static void handle_stripe_clean_event(struct r5conf *conf,
                                        bio_endio(wbi);
                                        wbi = wbi2;
                                }
-                               md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
-                                                  RAID5_STRIPE_SECTORS(conf),
-                                                  !test_bit(STRIPE_DEGRADED, &sh->state),
-                                                  0);
                                if (head_sh->batch_head) {
                                        sh = list_first_entry(&sh->batch_list,
                                                              struct stripe_head,
@@ -5788,13 +5767,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
                }
                spin_unlock_irq(&sh->stripe_lock);
                if (conf->mddev->bitmap) {
-                       for (d = 0;
-                            d < conf->raid_disks - conf->max_degraded;
-                            d++)
-                               md_bitmap_startwrite(mddev->bitmap,
-                                                    sh->sector,
-                                                    RAID5_STRIPE_SECTORS(conf),
-                                                    0);
                        sh->bm_seq = conf->seq_flush + 1;
                        set_bit(STRIPE_BIT_DELAY, &sh->state);
                }
.