From: Yu Kuai <yukuai3@xxxxxxxxxx>

Currently, 'active_io' is grabbed before make_request() is called, and
it's dropped immediately after make_request() returns. Hence 'active_io'
actually means that io is being dispatched, not that io is inflight.

For raid0 and raid456, where io accounting is enabled, 'active_io' is
also grabbed when the bio is cloned for io accounting, and that
reference is not dropped until the io is done.

Always clone a new bio so that 'active_io' will mean that io is
inflight; raid1 and raid10 will switch to this method in later patches.
Once they have, the fact that 'active_io' is grabbed twice for one io
can be cleaned up.

Now that the bio is cloned even if io accounting is disabled, also
rename the related structures from '*_acct_*' to '*_clone_*'.

Signed-off-by: Yu Kuai <yukuai3@xxxxxxxxxx>
---
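Not part of the commit, a minimal sketch: the diff below leans on the
bioset front_pad trick — bioset_init() is passed
offsetof(struct md_io_clone, bio_clone), so every bio allocated from
'io_clone_set' is embedded inside a private struct md_io_clone, and
container_of() maps a clone back to its container. The self-contained
userspace mock-up below shows just that mapping; the types are
simplified stand-ins, not the kernel definitions.

#include <stddef.h>
#include <stdio.h>

/* simplified version of the kernel's container_of() */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct bio { int dummy; };		/* stand-in for struct bio */

struct md_io_clone {			/* mirrors the layout in md.h */
	void		*mddev;
	struct bio	*orig_bio;
	unsigned long	start_time;
	struct bio	bio_clone;	/* lives at the front_pad offset */
};

int main(void)
{
	struct md_io_clone ic = { .start_time = 42 };
	struct bio *clone = &ic.bio_clone;

	/* what md_clone_bio() does: recover the container from the bio */
	struct md_io_clone *back =
		container_of(clone, struct md_io_clone, bio_clone);

	printf("start_time = %lu\n", back->start_time);
	return 0;
}

Compiled with any C compiler, this prints "start_time = 42".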
 drivers/md/md.c    | 61 +++++++++++++++++++++++---------------------
 drivers/md/md.h    |  4 +--
 drivers/md/raid5.c | 18 +++++++-------
 3 files changed, 43 insertions(+), 40 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 42347289195a..5ad8e7f3aebd 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2314,7 +2314,7 @@ int md_integrity_register(struct mddev *mddev)
 	pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
 	if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) ||
 	    (mddev->level != 1 && mddev->level != 10 &&
-	     bioset_integrity_create(&mddev->io_acct_set, BIO_POOL_SIZE))) {
+	     bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) {
 		/*
 		 * No need to handle the failure of bioset_integrity_create,
 		 * because the function is called by md_run() -> pers->run(),
@@ -5886,9 +5886,9 @@ int md_run(struct mddev *mddev)
 			goto exit_bio_set;
 	}
 
-	if (!bioset_initialized(&mddev->io_acct_set)) {
-		err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE,
-				  offsetof(struct md_io_acct, bio_clone), 0);
+	if (!bioset_initialized(&mddev->io_clone_set)) {
+		err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE,
+				  offsetof(struct md_io_clone, bio_clone), 0);
 		if (err)
 			goto exit_sync_set;
 	}
@@ -6070,7 +6070,7 @@ int md_run(struct mddev *mddev)
 	module_put(pers->owner);
 	md_bitmap_destroy(mddev);
 abort:
-	bioset_exit(&mddev->io_acct_set);
+	bioset_exit(&mddev->io_clone_set);
 exit_sync_set:
 	bioset_exit(&mddev->sync_set);
 exit_bio_set:
@@ -6295,7 +6295,7 @@ static void __md_stop(struct mddev *mddev)
 	percpu_ref_exit(&mddev->active_io);
 	bioset_exit(&mddev->bio_set);
 	bioset_exit(&mddev->sync_set);
-	bioset_exit(&mddev->io_acct_set);
+	bioset_exit(&mddev->io_clone_set);
 }
 
 void md_stop(struct mddev *mddev)
@@ -8661,45 +8661,48 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
 }
 EXPORT_SYMBOL_GPL(md_submit_discard_bio);
 
-static void md_end_io_acct(struct bio *bio)
+static void md_end_clone_io(struct bio *bio)
 {
-	struct md_io_acct *md_io_acct = bio->bi_private;
-	struct bio *orig_bio = md_io_acct->orig_bio;
-	struct mddev *mddev = md_io_acct->mddev;
+	struct md_io_clone *md_io_clone = bio->bi_private;
+	struct bio *orig_bio = md_io_clone->orig_bio;
+	struct mddev *mddev = md_io_clone->mddev;
 
 	orig_bio->bi_status = bio->bi_status;
 
-	bio_end_io_acct(orig_bio, md_io_acct->start_time);
+	if (md_io_clone->start_time)
+		bio_end_io_acct(orig_bio, md_io_clone->start_time);
+
 	bio_put(bio);
 	bio_endio(orig_bio);
-
 	percpu_ref_put(&mddev->active_io);
 }
 
+static void md_clone_bio(struct mddev *mddev, struct bio **bio)
+{
+	struct block_device *bdev = (*bio)->bi_bdev;
+	struct md_io_clone *md_io_clone;
+	struct bio *clone =
+		bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set);
+
+	md_io_clone = container_of(clone, struct md_io_clone, bio_clone);
+	md_io_clone->orig_bio = *bio;
+	md_io_clone->mddev = mddev;
+	if (blk_queue_io_stat(bdev->bd_disk->queue))
+		md_io_clone->start_time = bio_start_io_acct(*bio);
+
+	clone->bi_end_io = md_end_clone_io;
+	clone->bi_private = md_io_clone;
+	*bio = clone;
+}
+
 /*
  * Used by personalities that don't already clone the bio and thus can't
  * easily add the timestamp to their extended bio structure.
  */
 void md_account_bio(struct mddev *mddev, struct bio **bio)
 {
-	struct block_device *bdev = (*bio)->bi_bdev;
-	struct md_io_acct *md_io_acct;
-	struct bio *clone;
-
-	if (!blk_queue_io_stat(bdev->bd_disk->queue))
-		return;
-
 	percpu_ref_get(&mddev->active_io);
-
-	clone = bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_acct_set);
-	md_io_acct = container_of(clone, struct md_io_acct, bio_clone);
-	md_io_acct->orig_bio = *bio;
-	md_io_acct->start_time = bio_start_io_acct(*bio);
-	md_io_acct->mddev = mddev;
-
-	clone->bi_end_io = md_end_io_acct;
-	clone->bi_private = md_io_acct;
-	*bio = clone;
+	md_clone_bio(mddev, bio);
 }
 EXPORT_SYMBOL_GPL(md_account_bio);
 
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 11299d94b239..892a598a5029 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -510,7 +510,7 @@ struct mddev {
 	struct bio_set			sync_set; /* for sync operations like
 						   * metadata and bitmap writes
 						   */
-	struct bio_set			io_acct_set; /* for raid0 and raid5 io accounting */
+	struct bio_set			io_clone_set;
 
 	/* Generic flush handling.
 	 * The last to finish preflush schedules a worker to submit
@@ -738,7 +738,7 @@ struct md_thread {
 	void			*private;
 };
 
-struct md_io_acct {
+struct md_io_clone {
 	struct mddev	*mddev;
 	struct bio	*orig_bio;
 	unsigned long	start_time;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 29cf5455d7a5..cef0b400b2ee 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5468,13 +5468,13 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf,
  */
 static void raid5_align_endio(struct bio *bi)
 {
-	struct md_io_acct *md_io_acct = bi->bi_private;
-	struct bio *raid_bi = md_io_acct->orig_bio;
+	struct md_io_clone *md_io_clone = bi->bi_private;
+	struct bio *raid_bi = md_io_clone->orig_bio;
 	struct mddev *mddev;
 	struct r5conf *conf;
 	struct md_rdev *rdev;
 	blk_status_t error = bi->bi_status;
-	unsigned long start_time = md_io_acct->start_time;
+	unsigned long start_time = md_io_clone->start_time;
 
 	bio_put(bi);
 
@@ -5506,7 +5506,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 	struct md_rdev *rdev;
 	sector_t sector, end_sector, first_bad;
 	int bad_sectors, dd_idx;
-	struct md_io_acct *md_io_acct;
+	struct md_io_clone *md_io_clone;
 	bool did_inc;
 
 	if (!in_chunk_boundary(mddev, raid_bio)) {
@@ -5544,15 +5544,15 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 	}
 
 	align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
-				    &mddev->io_acct_set);
-	md_io_acct = container_of(align_bio, struct md_io_acct, bio_clone);
+				    &mddev->io_clone_set);
+	md_io_clone = container_of(align_bio, struct md_io_clone, bio_clone);
 	raid_bio->bi_next = (void *)rdev;
 	if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue))
-		md_io_acct->start_time = bio_start_io_acct(raid_bio);
-	md_io_acct->orig_bio = raid_bio;
+		md_io_clone->start_time = bio_start_io_acct(raid_bio);
+	md_io_clone->orig_bio = raid_bio;
 
 	align_bio->bi_end_io = raid5_align_endio;
-	align_bio->bi_private = md_io_acct;
+	align_bio->bi_private = md_io_clone;
 	align_bio->bi_iter.bi_sector = sector;
 
 	/* No reshape active, so we can trust rdev->data_offset */
-- 
2.39.2
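Also not part of the commit, a second sketch: a userspace mock-up of the
lifetime change described above. The plain counter stands in for the
percpu_ref 'active_io', and all names here are illustrative; in the
kernel the drop happens in the clone's ->bi_end_io (md_end_clone_io()),
while md_handle_request() still holds its own short-lived reference
until the cleanup mentioned in the commit message.

#include <stdio.h>

static int active_io;	/* stand-in for mddev->active_io (a percpu_ref) */

struct bio { void (*bi_end_io)(struct bio *); };

/* models md_end_clone_io(): the reference is dropped at completion */
static void end_clone_io(struct bio *bio)
{
	(void)bio;
	active_io--;
}

int main(void)
{
	struct bio clone = { .bi_end_io = end_clone_io };

	/* old scheme: the reference only covered dispatch */
	active_io++;			/* grabbed before make_request() */
	/* ... make_request() submits the io ... */
	active_io--;			/* dropped while io may be inflight */

	/* new scheme: the cloned bio pins the reference until completion */
	active_io++;			/* md_account_bio() */
	/* ... io dispatched, device works on it asynchronously ... */
	printf("inflight: active_io = %d\n", active_io);	/* 1 */

	clone.bi_end_io(&clone);	/* completion -> md_end_clone_io() */
	printf("done: active_io = %d\n", active_io);		/* 0 */
	return 0;
}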