Hi!
在 2024/11/06 14:40, Christian Theune 写道:
Hi,
On 6. Nov 2024, at 07:35, Yu Kuai <yukuai1@xxxxxxxxxxxxxxx> wrote:
Hi,
在 2024/11/05 18:15, Christian Theune 写道:
Hi,
after about 2 hours it stalled again. Here’s the full blocked process dump. (Tell me if this isn’t helpful, otherwise I’ll keep posting that as it’s the only real data I can show)
This is bad news :(
Yeah. But: the good new is that we aren’t eating any data so far … ;)
While reviewing related code, I come up with a plan to move bitmap
start/end write ops to the upper layer. Make sure each write IO from
upper layer only start once and end once, this is easy to make sure
they are balanced and can avoid many calls to improve performance as
well.
Sounds like a plan!
However, I need a few days to cooke a patch after work.
Sure thing! I’ll switch off bitmaps for that time - I’m happy we found a workaround so we can take time to resolve it cleanly. :)
I wrote a simple and crude version, please give it a test again.
Thanks,
Kuai
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d3a837506a36..5e1a82b79e41 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8753,6 +8753,30 @@ void md_submit_discard_bio(struct mddev *mddev,
struct md_rdev *rdev,
}
EXPORT_SYMBOL_GPL(md_submit_discard_bio);
+static bool is_raid456(struct mddev *mddev)
+{
+ return mddev->pers->level == 4 || mddev->pers->level == 5 ||
+ mddev->pers->level == 6;
+}
+
+static void bitmap_startwrite(struct mddev *mddev, struct bio *bio)
+{
+ if (!is_raid456(mddev) || !mddev->bitmap)
+ return;
+
+ md_bitmap_startwrite(mddev->bitmap, bio_offset(bio),
bio_sectors(bio),
+ 0);
+}
+
+static void bitmap_endwrite(struct mddev *mddev, struct bio *bio,
sector_t sectors)
+{
+ if (!is_raid456(mddev) || !mddev->bitmap)
+ return;
+
+ md_bitmap_endwrite(mddev->bitmap, bio_offset(bio), sectors,o
+ bio->bi_status == BLK_STS_OK, 0);
+}
+
static void md_end_clone_io(struct bio *bio)
{
struct md_io_clone *md_io_clone = bio->bi_private;
@@ -8765,6 +8789,7 @@ static void md_end_clone_io(struct bio *bio)
if (md_io_clone->start_time)
bio_end_io_acct(orig_bio, md_io_clone->start_time);
+ bitmap_endwrite(mddev, orig_bio, md_io_clone->sectors);
bio_put(bio);
bio_endio(orig_bio);
percpu_ref_put(&mddev->active_io);
@@ -8778,6 +8803,7 @@ static void md_clone_bio(struct mddev *mddev,
struct bio **bio)
bio_alloc_clone(bdev, *bio, GFP_NOIO,
&mddev->io_clone_set);
md_io_clone = container_of(clone, struct md_io_clone, bio_clone);
+ md_io_clone->sectors = bio_sectors(*bio);
md_io_clone->orig_bio = *bio;
md_io_clone->mddev = mddev;
if (blk_queue_io_stat(bdev->bd_disk->queue))
@@ -8790,6 +8816,7 @@ static void md_clone_bio(struct mddev *mddev,
struct bio **bio)
void md_account_bio(struct mddev *mddev, struct bio **bio)
{
+ bitmap_startwrite(mddev, *bio);
percpu_ref_get(&mddev->active_io);
md_clone_bio(mddev, bio);
}
@@ -8807,6 +8834,8 @@ void md_free_cloned_bio(struct bio *bio)
if (md_io_clone->start_time)
bio_end_io_acct(orig_bio, md_io_clone->start_time);
+ bitmap_endwrite(mddev, orig_bio, md_io_clone->sectors);
+
bio_put(bio);
percpu_ref_put(&mddev->active_io);
}
diff --git a/drivers/md/md.h b/drivers/md/md.h
index a0d6827dced9..0c2794230e0a 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -837,6 +837,7 @@ struct md_io_clone {
struct mddev *mddev;
struct bio *orig_bio;
unsigned long start_time;
+ sector_t sectors;
struct bio bio_clone;
};
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c14cf2410365..4f009e32f68a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3561,12 +3561,6 @@ static void __add_stripe_bio(struct stripe_head
*sh, struct bio *bi,
* is added to a batch, STRIPE_BIT_DELAY cannot be changed
* any more.
*/
- set_bit(STRIPE_BITMAP_PENDING, &sh->state);
- spin_unlock_irq(&sh->stripe_lock);
- md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
- RAID5_STRIPE_SECTORS(conf), 0);
- spin_lock_irq(&sh->stripe_lock);
- clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
if (!sh->batch_head) {
sh->bm_seq = conf->seq_flush+1;
set_bit(STRIPE_BIT_DELAY, &sh->state);
@@ -3621,7 +3615,6 @@ handle_failed_stripe(struct r5conf *conf, struct
stripe_head *sh,
BUG_ON(sh->batch_head);
for (i = disks; i--; ) {
struct bio *bi;
- int bitmap_end = 0;
if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
struct md_rdev *rdev = conf->disks[i].rdev;
@@ -3646,8 +3639,6 @@ handle_failed_stripe(struct r5conf *conf, struct
stripe_head *sh,
sh->dev[i].towrite = NULL;
sh->overwrite_disks = 0;
spin_unlock_irq(&sh->stripe_lock);
- if (bi)
- bitmap_end = 1;
log_stripe_write_finished(sh);
@@ -3662,10 +3653,6 @@ handle_failed_stripe(struct r5conf *conf, struct
stripe_head *sh,
bio_io_error(bi);
bi = nextbi;
}
- if (bitmap_end)
- md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
- RAID5_STRIPE_SECTORS(conf),
0, 0);
- bitmap_end = 0;
/* and fail all 'written' */
bi = sh->dev[i].written;
sh->dev[i].written = NULL;
@@ -3674,7 +3661,6 @@ handle_failed_stripe(struct r5conf *conf, struct
stripe_head *sh,
sh->dev[i].page = sh->dev[i].orig_page;
}
- if (bi) bitmap_end = 1;
while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
struct bio *bi2 = r5_next_bio(conf, bi,
sh->dev[i].sector);
@@ -3708,9 +3694,6 @@ handle_failed_stripe(struct r5conf *conf, struct
stripe_head *sh,
bi = nextbi;
}
}
- if (bitmap_end)
- md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
- RAID5_STRIPE_SECTORS(conf),
0, 0);
/* If we were in the middle of a write the parity block
might
* still be locked - so just clear all R5_LOCKED flags
*/
@@ -4059,10 +4042,6 @@ static void handle_stripe_clean_event(struct
r5conf *conf,
bio_endio(wbi);
wbi = wbi2;
}
- md_bitmap_endwrite(conf->mddev->bitmap,
sh->sector,
-
RAID5_STRIPE_SECTORS(conf),
-
!test_bit(STRIPE_DEGRADED, &sh->state),
- 0);
if (head_sh->batch_head) {
sh =
list_first_entry(&sh->batch_list,
struct
stripe_head,
@@ -5788,13 +5767,6 @@ static void make_discard_request(struct mddev
*mddev, struct bio *bi)
}
spin_unlock_irq(&sh->stripe_lock);
if (conf->mddev->bitmap) {
- for (d = 0;
- d < conf->raid_disks - conf->max_degraded;
- d++)
- md_bitmap_startwrite(mddev->bitmap,
- sh->sector,
-
RAID5_STRIPE_SECTORS(conf),
- 0);
sh->bm_seq = conf->seq_flush + 1;
set_bit(STRIPE_BIT_DELAY, &sh->state);
}
Thanks a lot for your help!
Christian