From: Luca Miccio <lucmiccio@xxxxxxxxx> BFQ currently creates, and updates, its own instance of the whole set of blkio statistics that cfq creates. Yet, from the comments of Tejun Heo in [1], it turned out that most of these statistics are meant/useful only for debugging. This commit makes BFQ create the latter, debugging statistics only if the option CONFIG_DEBUG_BLK_CGROUP is set. By doing so, this commit also enables BFQ to enjoy a high perfomance boost. The reason is that, if CONFIG_DEBUG_BLK_CGROUP is not set, then BFQ has to update far fewer statistics, and, in particular, not the heaviest to update. To give an idea of the benefits, if CONFIG_DEBUG_BLK_CGROUP is not set, then, on an Intel i7-4850HQ, and with 8 threads doing random I/O in parallel on null_blk (configured with 0 latency), the throughput of BFQ grows from 310 to 400 KIOPS (+30%). We have measured similar or even much higher boosts with other CPUs: e.g., +45% with an ARM CortexTM-A53 Octa-core. Our results have been obtained and can be reproduced very easily with the script in [1]. [1] https://www.spinics.net/lists/linux-block/msg18943.html Suggested-by: Tejun Heo <tj@xxxxxxxxxx> Suggested-by: Ulf Hansson <ulf.hansson@xxxxxxxxxx> Tested-by: Lee Tibbert <lee.tibbert@xxxxxxxxx> Tested-by: Oleksandr Natalenko <oleksandr@xxxxxxxxxxxxxx> Signed-off-by: Luca Miccio <lucmiccio@xxxxxxxxx> Signed-off-by: Paolo Valente <paolo.valente@xxxxxxxxxx> --- Documentation/block/bfq-iosched.txt | 38 +++++++-- block/bfq-cgroup.c | 148 ++++++++++++++++++++---------------- block/bfq-iosched.c | 14 ++-- block/bfq-iosched.h | 4 +- 4 files changed, 125 insertions(+), 79 deletions(-) diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt index 7fad6c0..8d8d8f0 100644 --- a/Documentation/block/bfq-iosched.txt +++ b/Documentation/block/bfq-iosched.txt @@ -20,12 +20,22 @@ for that device, by setting low_latency to 0. See Section 3 for details on how to configure BFQ for the desired tradeoff between latency and throughput, or on how to maximize throughput. -BFQ has a non-null overhead, which limits the maximum IOPS that the -CPU can process for a device scheduled with BFQ. To give an idea of -the limits on slow or average CPUs, here are BFQ limits for three -different CPUs, on, respectively, an average laptop, an old desktop, -and a cheap embedded system, in case full hierarchical support is -enabled (i.e., CONFIG_BFQ_GROUP_IOSCHED is set): +BFQ has a non-null overhead, which limits the maximum IOPS that a CPU +can process for a device scheduled with BFQ. To give an idea of the +limits on slow or average CPUs, here are, first, the limits of BFQ for +three different CPUs, on, respectively, an average laptop, an old +desktop, and a cheap embedded system, in case full hierarchical +support is enabled (i.e., CONFIG_BFQ_GROUP_IOSCHED is set), but +CONFIG_DEBUG_BLK_CGROUP is not set (Section 4-2): +- Intel i7-4850HQ: 400 KIOPS +- AMD A8-3850: 250 KIOPS +- ARM CortexTM-A53 Octa-core: 80 KIOPS + +If CONFIG_DEBUG_BLK_CGROUP is set (and of course full hierarchical +support is enabled), then the sustainable throughput with BFQ +decreases, because all blkio.bfq* statistics are created and updated +(Section 4-2). For BFQ, this leads to the following maximum +sustainable throughputs, on the same systems as above: - Intel i7-4850HQ: 310 KIOPS - AMD A8-3850: 200 KIOPS - ARM CortexTM-A53 Octa-core: 56 KIOPS @@ -505,6 +515,22 @@ BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group parameter to set the weight of a group with BFQ is blkio.bfq.weight or io.bfq.weight. +As for cgroups-v1 (blkio controller), the exact set of stat files +created, and kept up-to-date by bfq, depends on whether +CONFIG_DEBUG_BLK_CGROUP is set. If it is set, then bfq creates all +the stat files documented in +Documentation/cgroup-v1/blkio-controller.txt. If, instead, +CONFIG_DEBUG_BLK_CGROUP is not set, then bfq creates only the files +blkio.bfq.io_service_bytes +blkio.bfq.io_service_bytes_recursive +blkio.bfq.io_serviced +blkio.bfq.io_serviced_recursive + +The value of CONFIG_DEBUG_BLK_CGROUP greatly influences the maximum +throughput sustainable with bfq, because updating the blkio.bfq.* +stats is rather costly, especially for some of the stats enabled by +CONFIG_DEBUG_BLK_CGROUP. + Parameters to set ----------------- diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index ceefb9a..da1525e 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -24,7 +24,7 @@ #include "bfq-iosched.h" -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) /* bfqg stats flags */ enum bfqg_stats_flags { @@ -152,6 +152,57 @@ void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) bfqg_stats_update_group_wait_time(stats); } +void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, + unsigned int op) +{ + blkg_rwstat_add(&bfqg->stats.queued, op, 1); + bfqg_stats_end_empty_time(&bfqg->stats); + if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) + bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); +} + +void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) +{ + blkg_rwstat_add(&bfqg->stats.queued, op, -1); +} + +void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) +{ + blkg_rwstat_add(&bfqg->stats.merged, op, 1); +} + +void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time, + uint64_t io_start_time, unsigned int op) +{ + struct bfqg_stats *stats = &bfqg->stats; + unsigned long long now = sched_clock(); + + if (time_after64(now, io_start_time)) + blkg_rwstat_add(&stats->service_time, op, + now - io_start_time); + if (time_after64(io_start_time, start_time)) + blkg_rwstat_add(&stats->wait_time, op, + io_start_time - start_time); +} + +#else /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ + +void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, + unsigned int op) { } +void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } +void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } +void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time, + uint64_t io_start_time, unsigned int op) { } +void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } +void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } +void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } +void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } +void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } + +#endif /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ + +#ifdef CONFIG_BFQ_GROUP_IOSCHED + /* * blk-cgroup policy-related handlers * The following functions help in converting between blk-cgroup @@ -229,42 +280,10 @@ void bfqg_and_blkg_put(struct bfq_group *bfqg) blkg_put(bfqg_to_blkg(bfqg)); } -void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, - unsigned int op) -{ - blkg_rwstat_add(&bfqg->stats.queued, op, 1); - bfqg_stats_end_empty_time(&bfqg->stats); - if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) - bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); -} - -void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) -{ - blkg_rwstat_add(&bfqg->stats.queued, op, -1); -} - -void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) -{ - blkg_rwstat_add(&bfqg->stats.merged, op, 1); -} - -void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time, - uint64_t io_start_time, unsigned int op) -{ - struct bfqg_stats *stats = &bfqg->stats; - unsigned long long now = sched_clock(); - - if (time_after64(now, io_start_time)) - blkg_rwstat_add(&stats->service_time, op, - now - io_start_time); - if (time_after64(io_start_time, start_time)) - blkg_rwstat_add(&stats->wait_time, op, - io_start_time - start_time); -} - /* @stats = 0 */ static void bfqg_stats_reset(struct bfqg_stats *stats) { +#ifdef CONFIG_DEBUG_BLK_CGROUP /* queued stats shouldn't be cleared */ blkg_rwstat_reset(&stats->merged); blkg_rwstat_reset(&stats->service_time); @@ -276,6 +295,7 @@ static void bfqg_stats_reset(struct bfqg_stats *stats) blkg_stat_reset(&stats->group_wait_time); blkg_stat_reset(&stats->idle_time); blkg_stat_reset(&stats->empty_time); +#endif } /* @to += @from */ @@ -284,6 +304,7 @@ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) if (!to || !from) return; +#ifdef CONFIG_DEBUG_BLK_CGROUP /* queued stats shouldn't be cleared */ blkg_rwstat_add_aux(&to->merged, &from->merged); blkg_rwstat_add_aux(&to->service_time, &from->service_time); @@ -296,6 +317,7 @@ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); blkg_stat_add_aux(&to->idle_time, &from->idle_time); blkg_stat_add_aux(&to->empty_time, &from->empty_time); +#endif } /* @@ -342,6 +364,7 @@ void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg) static void bfqg_stats_exit(struct bfqg_stats *stats) { +#ifdef CONFIG_DEBUG_BLK_CGROUP blkg_rwstat_exit(&stats->merged); blkg_rwstat_exit(&stats->service_time); blkg_rwstat_exit(&stats->wait_time); @@ -353,10 +376,12 @@ static void bfqg_stats_exit(struct bfqg_stats *stats) blkg_stat_exit(&stats->group_wait_time); blkg_stat_exit(&stats->idle_time); blkg_stat_exit(&stats->empty_time); +#endif } static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) { +#ifdef CONFIG_DEBUG_BLK_CGROUP if (blkg_rwstat_init(&stats->merged, gfp) || blkg_rwstat_init(&stats->service_time, gfp) || blkg_rwstat_init(&stats->wait_time, gfp) || @@ -371,6 +396,7 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) bfqg_stats_exit(stats); return -ENOMEM; } +#endif return 0; } @@ -887,6 +913,7 @@ static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, return bfq_io_set_weight_legacy(of_css(of), NULL, weight); } +#ifdef CONFIG_DEBUG_BLK_CGROUP static int bfqg_print_stat(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, @@ -991,6 +1018,7 @@ static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) 0, false); return 0; } +#endif /* CONFIG_DEBUG_BLK_CGROUP */ struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) { @@ -1029,15 +1057,6 @@ struct cftype bfq_blkcg_legacy_files[] = { /* statistics, covers only the tasks in the bfqg */ { - .name = "bfq.time", - .private = offsetof(struct bfq_group, stats.time), - .seq_show = bfqg_print_stat, - }, - { - .name = "bfq.sectors", - .seq_show = bfqg_print_stat_sectors, - }, - { .name = "bfq.io_service_bytes", .private = (unsigned long)&blkcg_policy_bfq, .seq_show = blkg_print_stat_bytes, @@ -1047,6 +1066,16 @@ struct cftype bfq_blkcg_legacy_files[] = { .private = (unsigned long)&blkcg_policy_bfq, .seq_show = blkg_print_stat_ios, }, +#ifdef CONFIG_DEBUG_BLK_CGROUP + { + .name = "bfq.time", + .private = offsetof(struct bfq_group, stats.time), + .seq_show = bfqg_print_stat, + }, + { + .name = "bfq.sectors", + .seq_show = bfqg_print_stat_sectors, + }, { .name = "bfq.io_service_time", .private = offsetof(struct bfq_group, stats.service_time), @@ -1067,18 +1096,10 @@ struct cftype bfq_blkcg_legacy_files[] = { .private = offsetof(struct bfq_group, stats.queued), .seq_show = bfqg_print_rwstat, }, +#endif /* CONFIG_DEBUG_BLK_CGROUP */ /* the same statictics which cover the bfqg and its descendants */ { - .name = "bfq.time_recursive", - .private = offsetof(struct bfq_group, stats.time), - .seq_show = bfqg_print_stat_recursive, - }, - { - .name = "bfq.sectors_recursive", - .seq_show = bfqg_print_stat_sectors_recursive, - }, - { .name = "bfq.io_service_bytes_recursive", .private = (unsigned long)&blkcg_policy_bfq, .seq_show = blkg_print_stat_bytes_recursive, @@ -1088,6 +1109,16 @@ struct cftype bfq_blkcg_legacy_files[] = { .private = (unsigned long)&blkcg_policy_bfq, .seq_show = blkg_print_stat_ios_recursive, }, +#ifdef CONFIG_DEBUG_BLK_CGROUP + { + .name = "bfq.time_recursive", + .private = offsetof(struct bfq_group, stats.time), + .seq_show = bfqg_print_stat_recursive, + }, + { + .name = "bfq.sectors_recursive", + .seq_show = bfqg_print_stat_sectors_recursive, + }, { .name = "bfq.io_service_time_recursive", .private = offsetof(struct bfq_group, stats.service_time), @@ -1132,6 +1163,7 @@ struct cftype bfq_blkcg_legacy_files[] = { .private = offsetof(struct bfq_group, stats.dequeue), .seq_show = bfqg_print_stat, }, +#endif /* CONFIG_DEBUG_BLK_CGROUP */ { } /* terminate */ }; @@ -1147,18 +1179,6 @@ struct cftype bfq_blkg_files[] = { #else /* CONFIG_BFQ_GROUP_IOSCHED */ -void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, - unsigned int op) { } -void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } -void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } -void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time, - uint64_t io_start_time, unsigned int op) { } -void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } -void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } -void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } -void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } -void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } - void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_group *bfqg) {} diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 69e05f8..bcb6d21 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3693,14 +3693,14 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; struct request *rq; -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) struct bfq_queue *in_serv_queue, *bfqq; bool waiting_rq, idle_timer_disabled; #endif spin_lock_irq(&bfqd->lock); -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) in_serv_queue = bfqd->in_service_queue; waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); @@ -3714,7 +3714,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) #endif spin_unlock_irq(&bfqd->lock); -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) bfqq = rq ? RQ_BFQQ(rq) : NULL; if (!idle_timer_disabled && !bfqq) return rq; @@ -4281,7 +4281,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, { struct request_queue *q = hctx->queue; struct bfq_data *bfqd = q->elevator->elevator_data; -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) struct bfq_queue *bfqq = RQ_BFQQ(rq); bool idle_timer_disabled = false; unsigned int cmd_flags; @@ -4304,7 +4304,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, else list_add_tail(&rq->queuelist, &bfqd->dispatch); } else { -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) idle_timer_disabled = __bfq_insert_request(bfqd, rq); /* * Update bfqq, because, if a queue merge has occurred @@ -4323,7 +4323,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, } } -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) /* * Cache cmd_flags before releasing scheduler lock, because rq * may disappear afterwards (for example, because of a request @@ -4333,7 +4333,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, #endif spin_unlock_irq(&bfqd->lock); -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) if (!bfqq) return; /* diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index ac0809c..91c4390 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -689,7 +689,7 @@ enum bfqq_expiration { }; struct bfqg_stats { -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) /* number of ios merged */ struct blkg_rwstat merged; /* total time spent on device in ns, may not be accurate w/ queueing */ @@ -717,7 +717,7 @@ struct bfqg_stats { uint64_t start_idle_time; uint64_t start_empty_time; uint16_t flags; -#endif /* CONFIG_BFQ_GROUP_IOSCHED */ +#endif /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ }; #ifdef CONFIG_BFQ_GROUP_IOSCHED -- 2.10.0