Hi, I’m trying this with 6.11.7 today. Christian > On 9. Nov 2024, at 12:35, Xiao Ni <xni@xxxxxxxxxx> wrote: > > On Thu, Nov 7, 2024 at 3:55 PM Yu Kuai <yukuai1@xxxxxxxxxxxxxxx> wrote: >> >> Hi! >> >> 在 2024/11/06 14:40, Christian Theune 写道: >>> Hi, >>> >>>> On 6. Nov 2024, at 07:35, Yu Kuai <yukuai1@xxxxxxxxxxxxxxx> wrote: >>>> >>>> Hi, >>>> >>>> 在 2024/11/05 18:15, Christian Theune 写道: >>>>> Hi, >>>>> after about 2 hours it stalled again. Here’s the full blocked process dump. (Tell me if this isn’t helpful, otherwise I’ll keep posting that as it’s the only real data I can show) >>>> >>>> This is bad news :( >>> >>> Yeah. But: the good news is that we aren’t eating any data so far … ;) >>> >>>> While reviewing related code, I come up with a plan to move bitmap >>>> start/end write ops to the upper layer. Make sure each write IO from >>>> upper layer only start once and end once, this is easy to make sure >>>> they are balanced and can avoid many calls to improve performance as >>>> well. >>> >>> Sounds like a plan! >>> >>>> However, I need a few days to cook a patch after work. >>> >>> Sure thing! I’ll switch off bitmaps for that time - I’m happy we found a workaround so we can take time to resolve it cleanly. :) >> >> I wrote a simple and crude version, please give it a test again. 
>> >> Thanks, >> Kuai >> >> diff --git a/drivers/md/md.c b/drivers/md/md.c >> index d3a837506a36..5e1a82b79e41 100644 >> --- a/drivers/md/md.c >> +++ b/drivers/md/md.c >> @@ -8753,6 +8753,30 @@ void md_submit_discard_bio(struct mddev *mddev, >> struct md_rdev *rdev, >> } >> EXPORT_SYMBOL_GPL(md_submit_discard_bio); >> >> +static bool is_raid456(struct mddev *mddev) >> +{ >> + return mddev->pers->level == 4 || mddev->pers->level == 5 || >> + mddev->pers->level == 6; >> +} >> + >> +static void bitmap_startwrite(struct mddev *mddev, struct bio *bio) >> +{ >> + if (!is_raid456(mddev) || !mddev->bitmap) >> + return; >> + >> + md_bitmap_startwrite(mddev->bitmap, bio_offset(bio), >> bio_sectors(bio), >> + 0); >> +} >> + >> +static void bitmap_endwrite(struct mddev *mddev, struct bio *bio, >> sector_t sectors) >> +{ >> + if (!is_raid456(mddev) || !mddev->bitmap) >> + return; >> + >> + md_bitmap_endwrite(mddev->bitmap, bio_offset(bio), sectors,o >> + bio->bi_status == BLK_STS_OK, 0); >> +} >> + >> static void md_end_clone_io(struct bio *bio) >> { >> struct md_io_clone *md_io_clone = bio->bi_private; >> @@ -8765,6 +8789,7 @@ static void md_end_clone_io(struct bio *bio) >> if (md_io_clone->start_time) >> bio_end_io_acct(orig_bio, md_io_clone->start_time); >> >> + bitmap_endwrite(mddev, orig_bio, md_io_clone->sectors); >> bio_put(bio); >> bio_endio(orig_bio); >> percpu_ref_put(&mddev->active_io); >> @@ -8778,6 +8803,7 @@ static void md_clone_bio(struct mddev *mddev, >> struct bio **bio) >> bio_alloc_clone(bdev, *bio, GFP_NOIO, >> &mddev->io_clone_set); >> >> md_io_clone = container_of(clone, struct md_io_clone, bio_clone); >> + md_io_clone->sectors = bio_sectors(*bio); >> md_io_clone->orig_bio = *bio; >> md_io_clone->mddev = mddev; >> if (blk_queue_io_stat(bdev->bd_disk->queue)) >> @@ -8790,6 +8816,7 @@ static void md_clone_bio(struct mddev *mddev, >> struct bio **bio) >> >> void md_account_bio(struct mddev *mddev, struct bio **bio) >> { >> + bitmap_startwrite(mddev, *bio); >> 
percpu_ref_get(&mddev->active_io); >> md_clone_bio(mddev, bio); >> } >> @@ -8807,6 +8834,8 @@ void md_free_cloned_bio(struct bio *bio) >> if (md_io_clone->start_time) >> bio_end_io_acct(orig_bio, md_io_clone->start_time); >> >> + bitmap_endwrite(mddev, orig_bio, md_io_clone->sectors); >> + >> bio_put(bio); >> percpu_ref_put(&mddev->active_io); >> } >> diff --git a/drivers/md/md.h b/drivers/md/md.h >> index a0d6827dced9..0c2794230e0a 100644 >> --- a/drivers/md/md.h >> +++ b/drivers/md/md.h >> @@ -837,6 +837,7 @@ struct md_io_clone { >> struct mddev *mddev; >> struct bio *orig_bio; >> unsigned long start_time; >> + sector_t sectors; >> struct bio bio_clone; >> }; >> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c >> index c14cf2410365..4f009e32f68a 100644 >> --- a/drivers/md/raid5.c >> +++ b/drivers/md/raid5.c >> @@ -3561,12 +3561,6 @@ static void __add_stripe_bio(struct stripe_head >> *sh, struct bio *bi, >> * is added to a batch, STRIPE_BIT_DELAY cannot be changed >> * any more. >> */ >> - set_bit(STRIPE_BITMAP_PENDING, &sh->state); >> - spin_unlock_irq(&sh->stripe_lock); >> - md_bitmap_startwrite(conf->mddev->bitmap, sh->sector, >> - RAID5_STRIPE_SECTORS(conf), 0); >> - spin_lock_irq(&sh->stripe_lock); >> - clear_bit(STRIPE_BITMAP_PENDING, &sh->state); >> if (!sh->batch_head) { >> sh->bm_seq = conf->seq_flush+1; >> set_bit(STRIPE_BIT_DELAY, &sh->state); >> @@ -3621,7 +3615,6 @@ handle_failed_stripe(struct r5conf *conf, struct >> stripe_head *sh, >> BUG_ON(sh->batch_head); >> for (i = disks; i--; ) { >> struct bio *bi; >> - int bitmap_end = 0; >> >> if (test_bit(R5_ReadError, &sh->dev[i].flags)) { >> struct md_rdev *rdev = conf->disks[i].rdev; >> @@ -3646,8 +3639,6 @@ handle_failed_stripe(struct r5conf *conf, struct >> stripe_head *sh, >> sh->dev[i].towrite = NULL; >> sh->overwrite_disks = 0; >> spin_unlock_irq(&sh->stripe_lock); >> - if (bi) >> - bitmap_end = 1; >> >> log_stripe_write_finished(sh); >> @@ -3662,10 +3653,6 @@ handle_failed_stripe(struct r5conf 
*conf, struct >> stripe_head *sh, >> bio_io_error(bi); >> bi = nextbi; >> } >> - if (bitmap_end) >> - md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, >> - RAID5_STRIPE_SECTORS(conf), >> 0, 0); >> - bitmap_end = 0; >> /* and fail all 'written' */ >> bi = sh->dev[i].written; >> sh->dev[i].written = NULL; >> @@ -3674,7 +3661,6 @@ handle_failed_stripe(struct r5conf *conf, struct >> stripe_head *sh, >> sh->dev[i].page = sh->dev[i].orig_page; >> } >> >> - if (bi) bitmap_end = 1; >> while (bi && bi->bi_iter.bi_sector < >> sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) { >> struct bio *bi2 = r5_next_bio(conf, bi, >> sh->dev[i].sector); >> @@ -3708,9 +3694,6 @@ handle_failed_stripe(struct r5conf *conf, struct >> stripe_head *sh, >> bi = nextbi; >> } >> } >> - if (bitmap_end) >> - md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, >> - RAID5_STRIPE_SECTORS(conf), >> 0, 0); >> /* If we were in the middle of a write the parity block >> might >> * still be locked - so just clear all R5_LOCKED flags >> */ >> @@ -4059,10 +4042,6 @@ static void handle_stripe_clean_event(struct >> r5conf *conf, >> bio_endio(wbi); >> wbi = wbi2; >> } >> - md_bitmap_endwrite(conf->mddev->bitmap, >> sh->sector, >> - >> RAID5_STRIPE_SECTORS(conf), >> - >> !test_bit(STRIPE_DEGRADED, &sh->state), >> - 0); >> if (head_sh->batch_head) { >> sh = >> list_first_entry(&sh->batch_list, >> struct >> stripe_head, >> @@ -5788,13 +5767,6 @@ static void make_discard_request(struct mddev >> *mddev, struct bio *bi) >> } >> spin_unlock_irq(&sh->stripe_lock); >> if (conf->mddev->bitmap) { >> - for (d = 0; >> - d < conf->raid_disks - conf->max_degraded; >> - d++) >> - md_bitmap_startwrite(mddev->bitmap, >> - sh->sector, >> - >> RAID5_STRIPE_SECTORS(conf), >> - 0); >> sh->bm_seq = conf->seq_flush + 1; >> set_bit(STRIPE_BIT_DELAY, &sh->state); >> } >> >> >> >>> >>> Thanks a lot for your help! 
>>> Christian >>> >> >> > > Hi Kuai > > Maybe it's not good to put the bitmap operation from raid5 to md which > the new api is only used for raid5. And the bitmap region which raid5 > needs to handle is based on the member disk. It should be calculated > rather than the bio address space. Because the bio address space is > for the whole array. > > We have a customer who reports a similar problem. There is a patch > from David. I put it in the attachment. > > @Christian, can you have a try with the patch? It can be applied > cleanly on 6.11-rc6 > > Regards > Xiao > <md_raid5_one_bitmap_claim_per_stripe_head.patch> Liebe Grüße, Christian Theune -- Christian Theune · ct@xxxxxxxxxxxxxxx · +49 345 219401 0 Flying Circus Internet Operations GmbH · https://flyingcircus.io Leipziger Str. 70/71 · 06108 Halle (Saale) · Deutschland HR Stendal HRB 21169 · Geschäftsführer: Christian Theune, Christian Zagrodnick