Hi, I can’t apply this to 6.10.5, and when I try to reconstruct your patch manually I immediately run into: diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c14cf2410365..ce5466d4791a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2366,7 +2366,7 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, INIT_LIST_HEAD(&sh->lru); INIT_LIST_HEAD(&sh->r5c); INIT_LIST_HEAD(&sh->log_list); - atomic_set(&sh->count, 1); + atomic_set(&sh->count, 0); sh->raid_conf = conf; sh->log_start = MaxSector; Which version is your patch based on? Christian > On 26. Oct 2024, at 11:07, Yu Kuai <yukuai1@xxxxxxxxxxxxxxx> wrote: > > Hi, > > 在 2024/10/26 13:37, Christian Theune 写道: >>> On 25. Oct 2024, at 16:02, Christian Theune <ct@xxxxxxxxxxxxxxx> wrote: >>> >>> Yeah, this was more directed towards the question whether Yu needs me to run the patch that he posted earlier. >>> >>> So. The current status is: previously this crashed within 2-3 hours. Both machines are now running with the bitmap turned off as described above and have been syncing data for about 7 hours. This seems to indicate that the bitmap is involved here. >> Update: both machines have been able to finish their multi-TiB rsync job that previously caused reliable lockups. 
So: the bitmap code seems to be the culprit here … >> Christian > > Then, can you enable bitmap and test the following debug patch: > > Thanks, > Kuai > > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c > index 58f71c3e1368..b2a75a904209 100644 > --- a/drivers/md/raid5.c > +++ b/drivers/md/raid5.c > @@ -2369,6 +2369,7 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, > atomic_set(&sh->count, 1); > sh->raid_conf = conf; > sh->log_start = MaxSector; > + atomic_set(&sh->bitmap_counts, 0); > > if (raid5_has_ppl(conf)) { > sh->ppl_page = alloc_page(gfp); > @@ -3565,6 +3566,7 @@ static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi, > spin_unlock_irq(&sh->stripe_lock); > conf->mddev->bitmap_ops->startwrite(conf->mddev, sh->sector, > RAID5_STRIPE_SECTORS(conf), false); > + printk("%s: %s: start %px(%llu+%lu) %u\n", __func__, mdname(conf->mddev), sh, sh->sector, RAID5_STRIPE_SECTORS(conf), atomic_inc_return(&sh->bitmap_counts)); > spin_lock_irq(&sh->stripe_lock); > clear_bit(STRIPE_BITMAP_PENDING, &sh->state); > if (!sh->batch_head) { > @@ -3662,10 +3664,12 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, > bio_io_error(bi); > bi = nextbi; > } > - if (bitmap_end) > + if (bitmap_end) { > conf->mddev->bitmap_ops->endwrite(conf->mddev, > sh->sector, RAID5_STRIPE_SECTORS(conf), > false, false); > + printk("%s: %s: end %px(%llu+%lu) %u\n", __func__, mdname(conf->mddev), sh, sh->sector, RAID5_STRIPE_SECTORS(conf), atomic_dec_return(&sh->bitmap_counts)); > + } > bitmap_end = 0; > /* and fail all 'written' */ > bi = sh->dev[i].written; > @@ -3709,10 +3713,12 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, > bi = nextbi; > } > } > - if (bitmap_end) > + if (bitmap_end) { > conf->mddev->bitmap_ops->endwrite(conf->mddev, > sh->sector, RAID5_STRIPE_SECTORS(conf), > false, false); > + printk("%s: %s: end %px(%llu+%lu) %u\n", __func__, mdname(conf->mddev), sh, sh->sector, RAID5_STRIPE_SECTORS(conf), 
atomic_dec_return(&sh->bitmap_counts)); > + } > /* If we were in the middle of a write the parity block might > * still be locked - so just clear all R5_LOCKED flags > */ > @@ -4065,6 +4071,7 @@ static void handle_stripe_clean_event(struct r5conf *conf, > sh->sector, RAID5_STRIPE_SECTORS(conf), > !test_bit(STRIPE_DEGRADED, &sh->state), > false); > + printk("%s: %s: end %px(%llu+%lu) %u\n", __func__, mdname(conf->mddev), sh, sh->sector, RAID5_STRIPE_SECTORS(conf), atomic_dec_return(&sh->bitmap_counts)); > if (head_sh->batch_head) { > sh = list_first_entry(&sh->batch_list, > struct stripe_head, > @@ -5785,9 +5792,11 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) > spin_unlock_irq(&sh->stripe_lock); > if (conf->mddev->bitmap) { > for (d = 0; d < conf->raid_disks - conf->max_degraded; > - d++) > + d++) { > mddev->bitmap_ops->startwrite(mddev, sh->sector, > RAID5_STRIPE_SECTORS(conf), false); > + printk("%s: %s: start %px(%llu+%lu) %u\n", __func__, mdname(conf->mddev), sh, sh->sector, RAID5_STRIPE_SECTORS(conf), atomic_inc_return(&sh->bitmap_counts)); > + } > sh->bm_seq = conf->seq_flush + 1; > set_bit(STRIPE_BIT_DELAY, &sh->state); > } > diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h > index 896ecfc4afa6..12024249245e 100644 > --- a/drivers/md/raid5.h > +++ b/drivers/md/raid5.h > @@ -255,6 +255,7 @@ struct stripe_head { > int nr_pages; /* page array size */ > int stripes_per_page; > #endif > + atomic_t bitmap_counts; > struct r5dev { > /* rreq and rvec are used for the replacement device when > * writing data to both devices. Liebe Grüße, Christian Theune -- Christian Theune · ct@xxxxxxxxxxxxxxx · +49 345 219401 0 Flying Circus Internet Operations GmbH · https://flyingcircus.io Leipziger Str. 70/71 · 06108 Halle (Saale) · Deutschland HR Stendal HRB 21169 · Geschäftsführer: Christian Theune, Christian Zagrodnick