On Sat, Nov 9, 2024 at 12:43 PM Xiao Ni <xni@xxxxxxxxxx> wrote:
>
> On Thu, Nov 7, 2024 at 9:22 PM Yu Kuai <yukuai1@xxxxxxxxxxxxxxx> wrote:
> >
> > Hi,
> >
> > On 2024/11/05 23:34, Haris Iqbal wrote:
> > > On Tue, Nov 5, 2024 at 3:04 PM Dragan Milivojević <galileo@xxxxxxxxxxx> wrote:
> > >>
> > >> On Tue, 5 Nov 2024 at 10:58, Haris Iqbal <haris.iqbal@xxxxxxxxx> wrote:
> > >>>
> > >>> Hi,
> > >>>
> > >>> I am running fio over an RDMA block device. The server side of this
> > >>> mapping is an md-raid0 device, created over 3 md-raid5 devices.
> > >>> The md-raid5 devices each are created over 8 block devices. Below is
> > >>> how the raid configuration looks (md400, md300, md301 and md302 are
> > >>> relevant for this discussion here).
> > >>
> > >> Try disabling the bitmap as a quick "fix" and see if that helps.
> > >
> > > Yes. Disabling the bitmap does seem to prevent the hang completely. I ran
> > > fio for 10 minutes and there was no hang.
> > > Triggered the hang in 10 seconds after reverting back to the internal bitmap.
> > >
> >
> > Can you give the following patch a test? It's based on v6.11.
> >
> > Thanks,
> > Kuai
> >
> > diff --git a/drivers/md/md.c b/drivers/md/md.c
> > index d3a837506a36..5e1a82b79e41 100644
> > --- a/drivers/md/md.c
> > +++ b/drivers/md/md.c
> > @@ -8753,6 +8753,30 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
> >  }
> >  EXPORT_SYMBOL_GPL(md_submit_discard_bio);
> >
> > +static bool is_raid456(struct mddev *mddev)
> > +{
> > +        return mddev->pers->level == 4 || mddev->pers->level == 5 ||
> > +               mddev->pers->level == 6;
> > +}
> > +
> > +static void bitmap_startwrite(struct mddev *mddev, struct bio *bio)
> > +{
> > +        if (!is_raid456(mddev) || !mddev->bitmap)
> > +                return;
> > +
> > +        md_bitmap_startwrite(mddev->bitmap, bio_offset(bio), bio_sectors(bio),
> > +                             0);
> > +}
> > +
> > +static void bitmap_endwrite(struct mddev *mddev, struct bio *bio, sector_t sectors)
> > +{
> > +        if (!is_raid456(mddev) || !mddev->bitmap)
> > +                return;
> > +
> > +        md_bitmap_endwrite(mddev->bitmap, bio_offset(bio), sectors,
> > +                           bio->bi_status == BLK_STS_OK, 0);
> > +}
> > +
> >  static void md_end_clone_io(struct bio *bio)
> >  {
> >          struct md_io_clone *md_io_clone = bio->bi_private;
> > @@ -8765,6 +8789,7 @@ static void md_end_clone_io(struct bio *bio)
> >          if (md_io_clone->start_time)
> >                  bio_end_io_acct(orig_bio, md_io_clone->start_time);
> >
> > +        bitmap_endwrite(mddev, orig_bio, md_io_clone->sectors);
> >          bio_put(bio);
> >          bio_endio(orig_bio);
> >          percpu_ref_put(&mddev->active_io);
> > @@ -8778,6 +8803,7 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio)
> >                  bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set);
> >
> >          md_io_clone = container_of(clone, struct md_io_clone, bio_clone);
> > +        md_io_clone->sectors = bio_sectors(*bio);
> >          md_io_clone->orig_bio = *bio;
> >          md_io_clone->mddev = mddev;
> >          if (blk_queue_io_stat(bdev->bd_disk->queue))
> > @@ -8790,6 +8816,7 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio)
> >
> >  void md_account_bio(struct mddev *mddev, struct bio **bio)
> >  {
> > +        bitmap_startwrite(mddev, *bio);
> >          percpu_ref_get(&mddev->active_io);
> >          md_clone_bio(mddev, bio);
> >  }
> > @@ -8807,6 +8834,8 @@ void md_free_cloned_bio(struct bio *bio)
> >          if (md_io_clone->start_time)
> >                  bio_end_io_acct(orig_bio, md_io_clone->start_time);
> >
> > +        bitmap_endwrite(mddev, orig_bio, md_io_clone->sectors);
> > +
> >          bio_put(bio);
> >          percpu_ref_put(&mddev->active_io);
> >  }
> > diff --git a/drivers/md/md.h b/drivers/md/md.h
> > index a0d6827dced9..0c2794230e0a 100644
> > --- a/drivers/md/md.h
> > +++ b/drivers/md/md.h
> > @@ -837,6 +837,7 @@ struct md_io_clone {
> >          struct mddev *mddev;
> >          struct bio *orig_bio;
> >          unsigned long start_time;
> > +        sector_t sectors;
> >          struct bio bio_clone;
> >  };
> > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> > index c14cf2410365..4f009e32f68a 100644
> > --- a/drivers/md/raid5.c
> > +++ b/drivers/md/raid5.c
> > @@ -3561,12 +3561,6 @@ static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
> >           * is added to a batch, STRIPE_BIT_DELAY cannot be changed
> >           * any more.
> >           */
> > -        set_bit(STRIPE_BITMAP_PENDING, &sh->state);
> > -        spin_unlock_irq(&sh->stripe_lock);
> > -        md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
> > -                             RAID5_STRIPE_SECTORS(conf), 0);
> > -        spin_lock_irq(&sh->stripe_lock);
> > -        clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
> >          if (!sh->batch_head) {
> >                  sh->bm_seq = conf->seq_flush+1;
> >                  set_bit(STRIPE_BIT_DELAY, &sh->state);
> > @@ -3621,7 +3615,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
> >          BUG_ON(sh->batch_head);
> >          for (i = disks; i--; ) {
> >                  struct bio *bi;
> > -                int bitmap_end = 0;
> >
> >                  if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
> >                          struct md_rdev *rdev = conf->disks[i].rdev;
> > @@ -3646,8 +3639,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
> >                  sh->dev[i].towrite = NULL;
> >                  sh->overwrite_disks = 0;
> >                  spin_unlock_irq(&sh->stripe_lock);
> > -                if (bi)
> > -                        bitmap_end = 1;
> >
> >                  log_stripe_write_finished(sh);
> > @@ -3662,10 +3653,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
> >                          bio_io_error(bi);
> >                          bi = nextbi;
> >                  }
> > -                if (bitmap_end)
> > -                        md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
> > -                                           RAID5_STRIPE_SECTORS(conf), 0, 0);
> > -                bitmap_end = 0;
> >                  /* and fail all 'written' */
> >                  bi = sh->dev[i].written;
> >                  sh->dev[i].written = NULL;
> > @@ -3674,7 +3661,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
> >                          sh->dev[i].page = sh->dev[i].orig_page;
> >                  }
> >
> > -                if (bi) bitmap_end = 1;
> >                  while (bi && bi->bi_iter.bi_sector <
> >                         sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
> >                          struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
> > @@ -3708,9 +3694,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
> >                          bi = nextbi;
> >                  }
> >          }
> > -        if (bitmap_end)
> > -                md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
> > -                                   RAID5_STRIPE_SECTORS(conf), 0, 0);
> >          /* If we were in the middle of a write the parity block might
> >           * still be locked - so just clear all R5_LOCKED flags
> >           */
> > @@ -4059,10 +4042,6 @@ static void handle_stripe_clean_event(struct r5conf *conf,
> >                                          bio_endio(wbi);
> >                                          wbi = wbi2;
> >                                  }
> > -                                md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
> > -                                                   RAID5_STRIPE_SECTORS(conf),
> > -                                                   !test_bit(STRIPE_DEGRADED, &sh->state),
> > -                                                   0);
> >                                  if (head_sh->batch_head) {
> >                                          sh = list_first_entry(&sh->batch_list,
> >                                                                struct stripe_head,
> > @@ -5788,13 +5767,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
> >                  }
> >                  spin_unlock_irq(&sh->stripe_lock);
> >                  if (conf->mddev->bitmap) {
> > -                        for (d = 0;
> > -                             d < conf->raid_disks - conf->max_degraded;
> > -                             d++)
> > -                                md_bitmap_startwrite(mddev->bitmap,
> > -                                                     sh->sector,
> > -                                                     RAID5_STRIPE_SECTORS(conf),
> > -                                                     0);
> >                          sh->bm_seq = conf->seq_flush + 1;
> >                          set_bit(STRIPE_BIT_DELAY, &sh->state);
> >                  }
> >
> > .
> >
>
> Hi all
>
> I just replied against one raid5 stuck problem. This one looks like a
> similar one. We have a customer who reports one similar problem. David
> has a patch that can work. I place it in the attachment. Can you have
> a try also? The patch can be applied cleanly on 6.11-rc6
>
> Regards
> Xiao

Hello,

I gave both the patches a try, and here are my findings.

With the first patch by Yu, I did not see any hang or errors. I tried
a number of bitmap chunk sizes, and ran fio for a few hours, and there
was no hang.

With the second patch from Xiao, I hit the following BUG_ON within the
first minute of my fio run.

[ 113.902982] Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI
[ 113.903315] CPU: 38 UID: 0 PID: 9767 Comm: kworker/38:3H Kdump: loaded Not tainted 6.11.5-storage #6.11.5-1+feature+v6.11+20241111.0643+cbe84cc3~deb12
[ 113.904120] Hardware name: Supermicro X10DRi/X10DRi, BIOS 3.3 03/03/2021
[ 113.904519] Workqueue: ib-comp-wq ib_cq_poll_work [ib_core]
[ 113.904888] RIP: 0010:__add_stripe_bio+0x23f/0x250 [raid456]
[ 113.905232] Code: 29 ff ff ff 41 8b 84 24 80 01 00 00 83 c0 01 89 45 54 f0 80 4d 49 02 e9 11 ff ff ff 45 85 c0 0f 84 4e fe ff ff e9 31 ff ff ff <0f
[ 113.906352] RSP: 0018:ffffb5d30ed27aa0 EFLAGS: 00010006
[ 113.906661] RAX: ffff992cb9549818 RBX: 0000000000000000 RCX: 0000000000000001
[ 113.907086] RDX: ffff992c989c3158 RSI: ffff992c989c3a58 RDI: 0000000000000000
[ 113.907511] RBP: ffff991d19e923a0 R08: 0000000000000000 R09: 0000000000000160
[ 113.907936] R10: 0000000000000007 R11: ffffb5d30ed27b70 R12: ffff991d0854b800
[ 113.908361] R13: 0000000000000001 R14: ffff991d19e92718 R15: 0000000000000001
[ 113.908786] FS:  0000000000000000(0000) GS:ffff993c3fc80000(0000) knlGS:0000000000000000
[ 113.909267] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 113.909609] CR2: 00007f21b85473d8 CR3: 000000145d82c001 CR4: 00000000001706f0
[ 113.910034] Call Trace:
[ 113.910181]  <TASK>
[ 113.910304]  ? die+0x36/0x90
[ 113.910478]  ? do_trap+0xdd/0x100
[ 113.910675]  ? __add_stripe_bio+0x23f/0x250 [raid456]
[ 113.910979]  ? do_error_trap+0x65/0x80
[ 113.911200]  ? __add_stripe_bio+0x23f/0x250 [raid456]
[ 113.911503]  ? exc_invalid_op+0x50/0x70
[ 113.911731]  ? __add_stripe_bio+0x23f/0x250 [raid456]
[ 113.912033]  ? asm_exc_invalid_op+0x1a/0x20
[ 113.912283]  ? __add_stripe_bio+0x23f/0x250 [raid456]
[ 113.912586]  raid5_make_request+0x35f/0x1210 [raid456]
[ 113.912896]  ? submit_bio_noacct+0x47/0x4c0
[ 113.913145]  ? __pfx_woken_wake_function+0x10/0x10
[ 113.913430]  ? bio_split_rw+0x143/0x290
[ 113.913659]  md_handle_request+0x156/0x270
[ 113.913905]  __submit_bio+0x15c/0x1f0
[ 113.914126]  submit_bio_noacct_nocheck+0x19a/0x3c0
[ 113.914412]  ? submit_bio_noacct+0x47/0x4c0
[ 113.914662]  rnbd_srv_rdma_ev+0x501/0xf70 [rnbd_server]
[ 113.914976]  ? rtrs_post_recv_empty+0x5d/0x80 [rtrs_core]
[ 113.930375]  process_io_req+0x169/0x4e0 [rtrs_server]
[ 113.945660]  __ib_process_cq+0x7b/0x170 [ib_core]
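
For completeness, this is roughly how I toggled the internal bitmap on the
raid5 arrays for the earlier workaround test, and how I varied the bitmap
chunk size between runs (a sketch from memory; md300 stands in for each of
md300/md301/md302, and the chunk size shown is only an example, not the
exact set of values I tried):

    # drop the internal bitmap (the hang does not reproduce in this state)
    mdadm --grow --bitmap=none /dev/md300

    # re-add an internal bitmap, optionally with a different bitmap chunk size
    mdadm --grow --bitmap=internal --bitmap-chunk=128M /dev/md300

After each change, fio was restarted against the RNBD-mapped device on the
client side.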