Maintain statistics per cgroup and export these to user space. These statistics are essential for verifying whether the proper I/O priorities have been assigned to requests. An example of the statistics data with this patch applied: $ cat /sys/fs/cgroup/io.stat 11:2 rbytes=0 wbytes=0 rios=3 wios=0 dbytes=0 dios=0 [NONE] dispatched=0 inserted=0 merged=171 [RT] dispatched=0 inserted=0 merged=0 [BE] dispatched=0 inserted=0 merged=0 [IDLE] dispatched=0 inserted=0 merged=0 8:32 rbytes=2142720 wbytes=0 rios=105 wios=0 dbytes=0 dios=0 [NONE] dispatched=0 inserted=0 merged=171 [RT] dispatched=0 inserted=0 merged=0 [BE] dispatched=0 inserted=0 merged=0 [IDLE] dispatched=0 inserted=0 merged=0 Cc: Damien Le Moal <damien.lemoal@xxxxxxx> Cc: Hannes Reinecke <hare@xxxxxxx> Cc: Christoph Hellwig <hch@xxxxxx> Cc: Ming Lei <ming.lei@xxxxxxxxxx> Cc: Johannes Thumshirn <johannes.thumshirn@xxxxxxx> Cc: Himanshu Madhani <himanshu.madhani@xxxxxxxxxx> Signed-off-by: Bart Van Assche <bvanassche@xxxxxxx> --- block/Kconfig.iosched | 6 + block/Makefile | 2 + block/mq-deadline-cgroup.c | 126 ++++++++++++++++++++ block/mq-deadline-cgroup.h | 114 ++++++++++++++++++ block/{mq-deadline.c => mq-deadline-main.c} | 74 +++++++++--- 5 files changed, 308 insertions(+), 14 deletions(-) create mode 100644 block/mq-deadline-cgroup.c create mode 100644 block/mq-deadline-cgroup.h rename block/{mq-deadline.c => mq-deadline-main.c} (95%) diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 2f2158e05a91..64053d67a97b 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -9,6 +9,12 @@ config MQ_IOSCHED_DEADLINE help MQ version of the deadline IO scheduler. +config MQ_IOSCHED_DEADLINE_CGROUP + tristate + default y + depends on MQ_IOSCHED_DEADLINE + depends on BLK_CGROUP + config MQ_IOSCHED_KYBER tristate "Kyber I/O scheduler" default y diff --git a/block/Makefile b/block/Makefile index af3d044abaf1..b9db5d4edfc8 100644 --- a/block/Makefile +++ b/block/Makefile @@ -21,6 +21,8 @@ obj-$(CONFIG_BLK_CGROUP_IOPRIO) += blk-ioprio.o obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o +mq-deadline-y += mq-deadline-main.o +mq-deadline-$(CONFIG_MQ_IOSCHED_DEADLINE_CGROUP)+= mq-deadline-cgroup.o obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o obj-$(CONFIG_IOSCHED_BFQ) += bfq.o diff --git a/block/mq-deadline-cgroup.c b/block/mq-deadline-cgroup.c new file mode 100644 index 000000000000..3b4bfddec39f --- /dev/null +++ b/block/mq-deadline-cgroup.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/blk-cgroup.h> +#include <linux/ioprio.h> + +#include "mq-deadline-cgroup.h" + +static struct blkcg_policy dd_blkcg_policy; + +static struct blkcg_policy_data *dd_cpd_alloc(gfp_t gfp) +{ + struct dd_blkcg *pd; + + pd = kzalloc(sizeof(*pd), gfp); + if (!pd) + return NULL; + pd->stats = alloc_percpu_gfp(typeof(*pd->stats), + GFP_KERNEL | __GFP_ZERO); + if (!pd->stats) { + kfree(pd); + return NULL; + } + return &pd->cpd; +} + +static void dd_cpd_free(struct blkcg_policy_data *cpd) +{ + struct dd_blkcg *dd_blkcg = container_of(cpd, typeof(*dd_blkcg), cpd); + + free_percpu(dd_blkcg->stats); + kfree(dd_blkcg); +} + +static struct dd_blkcg *dd_blkcg_from_pd(struct blkg_policy_data *pd) +{ + return container_of(blkcg_to_cpd(pd->blkg->blkcg, &dd_blkcg_policy), + struct dd_blkcg, cpd); +} + +/* + * Convert an association between a block cgroup and a request queue into a + * pointer to the mq-deadline information associated with a (blkcg, queue) pair. + */ +struct dd_blkcg *dd_blkcg_from_bio(struct bio *bio) +{ + struct blkg_policy_data *pd; + + pd = blkg_to_pd(bio->bi_blkg, &dd_blkcg_policy); + if (!pd) + return NULL; + + return dd_blkcg_from_pd(pd); +} + +static size_t dd_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) +{ + static const char *const prio_class_name[] = { + [IOPRIO_CLASS_NONE] = "NONE", + [IOPRIO_CLASS_RT] = "RT", + [IOPRIO_CLASS_BE] = "BE", + [IOPRIO_CLASS_IDLE] = "IDLE", + }; + struct dd_blkcg *blkcg = dd_blkcg_from_pd(pd); + int res = 0; + u8 prio; + + for (prio = 0; prio < ARRAY_SIZE(blkcg->stats->stats); prio++) + res += scnprintf(buf + res, size - res, + " [%s] dispatched=%u inserted=%u merged=%u", + prio_class_name[prio], + ddcg_sum(blkcg, dispatched, prio) + + ddcg_sum(blkcg, merged, prio) - + ddcg_sum(blkcg, completed, prio), + ddcg_sum(blkcg, inserted, prio) - + ddcg_sum(blkcg, completed, prio), + ddcg_sum(blkcg, merged, prio)); + + return res; +} + +static struct blkg_policy_data *dd_pd_alloc(gfp_t gfp, struct request_queue *q, + struct blkcg *blkcg) +{ + struct dd_blkg *pd; + + pd = kzalloc(sizeof(*pd), gfp); + if (!pd) + return NULL; + return &pd->pd; +} + +static void dd_pd_free(struct blkg_policy_data *pd) +{ + struct dd_blkg *dd_blkg = container_of(pd, typeof(*dd_blkg), pd); + + kfree(dd_blkg); +} + +static struct blkcg_policy dd_blkcg_policy = { + .cpd_alloc_fn = dd_cpd_alloc, + .cpd_free_fn = dd_cpd_free, + + .pd_alloc_fn = dd_pd_alloc, + .pd_free_fn = dd_pd_free, + .pd_stat_fn = dd_pd_stat, +}; + +int dd_activate_policy(struct request_queue *q) +{ + return blkcg_activate_policy(q, &dd_blkcg_policy); +} + +void dd_deactivate_policy(struct request_queue *q) +{ + blkcg_deactivate_policy(q, &dd_blkcg_policy); +} + +int __init dd_blkcg_init(void) +{ + return blkcg_policy_register(&dd_blkcg_policy); +} + +void __exit dd_blkcg_exit(void) +{ + blkcg_policy_unregister(&dd_blkcg_policy); +} diff --git a/block/mq-deadline-cgroup.h b/block/mq-deadline-cgroup.h new file mode 100644 index 000000000000..0143fd74f3ce --- /dev/null +++ b/block/mq-deadline-cgroup.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#if !defined(_MQ_DEADLINE_CGROUP_H_) +#define _MQ_DEADLINE_CGROUP_H_ + +#include <linux/blk-cgroup.h> + +struct request_queue; + +/** + * struct io_stats_per_prio - I/O statistics per I/O priority class. + * @inserted: Number of inserted requests. + * @merged: Number of merged requests. + * @dispatched: Number of dispatched requests. + * @completed: Number of I/O completions. + */ +struct io_stats_per_prio { + local_t inserted; + local_t merged; + local_t dispatched; + local_t completed; +}; + +/* I/O statistics per I/O cgroup per I/O priority class (IOPRIO_CLASS_*). */ +struct blkcg_io_stats { + struct io_stats_per_prio stats[4]; +}; + +/** + * struct dd_blkcg - Per cgroup data. + * @cpd: blkcg_policy_data structure. + * @stats: I/O statistics. + */ +struct dd_blkcg { + struct blkcg_policy_data cpd; /* must be the first member */ + struct blkcg_io_stats __percpu *stats; +}; + +/* + * Count one event of type 'event_type' and with I/O priority class + * 'prio_class'. + */ +#define ddcg_count(ddcg, event_type, prio_class) do { \ +if (ddcg) { \ + struct blkcg_io_stats *io_stats = get_cpu_ptr((ddcg)->stats); \ + \ + BUILD_BUG_ON(!__same_type((ddcg), struct dd_blkcg *)); \ + BUILD_BUG_ON(!__same_type((prio_class), u8)); \ + local_inc(&io_stats->stats[(prio_class)].event_type); \ + put_cpu_ptr(io_stats); \ +} \ +} while (0) + +/* + * Returns the total number of ddcg_count(ddcg, event_type, prio_class) calls + * across all CPUs. No locking or barriers since it is fine if the returned + * sum is slightly outdated. + */ +#define ddcg_sum(ddcg, event_type, prio) ({ \ + unsigned int cpu; \ + u32 sum = 0; \ + \ + BUILD_BUG_ON(!__same_type((ddcg), struct dd_blkcg *)); \ + BUILD_BUG_ON(!__same_type((prio), u8)); \ + for_each_present_cpu(cpu) \ + sum += local_read(&per_cpu_ptr((ddcg)->stats, cpu)-> \ + stats[(prio)].event_type); \ + sum; \ +}) + +#ifdef CONFIG_BLK_CGROUP + +/** + * struct dd_blkg - Per (cgroup, request queue) data. + * @pd: blkg_policy_data structure. + */ +struct dd_blkg { + struct blkg_policy_data pd; /* must be the first member */ +}; + +struct dd_blkcg *dd_blkcg_from_bio(struct bio *bio); +int dd_activate_policy(struct request_queue *q); +void dd_deactivate_policy(struct request_queue *q); +int __init dd_blkcg_init(void); +void __exit dd_blkcg_exit(void); + +#else /* CONFIG_BLK_CGROUP */ + +static inline struct dd_blkcg *dd_blkcg_from_bio(struct bio *bio) +{ + return NULL; +} + +static inline int dd_activate_policy(struct request_queue *q) +{ + return 0; +} + +static inline void dd_deactivate_policy(struct request_queue *q) +{ +} + +static inline int dd_blkcg_init(void) +{ + return 0; +} + +static inline void dd_blkcg_exit(void) +{ +} + +#endif /* CONFIG_BLK_CGROUP */ + +#endif /* _MQ_DEADLINE_CGROUP_H_ */ diff --git a/block/mq-deadline.c b/block/mq-deadline-main.c similarity index 95% rename from block/mq-deadline.c rename to block/mq-deadline-main.c index 04d9d6b3745b..58a401ea8f56 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline-main.c @@ -25,6 +25,7 @@ #include "blk-mq-debugfs.h" #include "blk-mq-tag.h" #include "blk-mq-sched.h" +#include "mq-deadline-cgroup.h" /* * See Documentation/block/deadline-iosched.rst @@ -51,14 +52,6 @@ enum dd_prio { enum { DD_PRIO_COUNT = 3 }; -/* I/O statistics per I/O priority. */ -struct io_stats_per_prio { - local_t inserted; - local_t merged; - local_t dispatched; - local_t completed; -}; - /* I/O statistics for all I/O priorities (enum dd_prio). */ struct io_stats { struct io_stats_per_prio stats[DD_PRIO_COUNT]; @@ -81,6 +74,9 @@ struct deadline_data { * run time data */ + /* Request queue that owns this data structure. */ + struct request_queue *queue; + struct dd_per_prio per_prio[DD_PRIO_COUNT]; /* Data direction of latest dispatched request. */ @@ -232,8 +228,10 @@ static void dd_merged_requests(struct request_queue *q, struct request *req, struct deadline_data *dd = q->elevator->elevator_data; const u8 ioprio_class = dd_rq_ioclass(next); const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + struct dd_blkcg *blkcg = next->elv.priv[0]; dd_count(dd, merged, prio); + ddcg_count(blkcg, merged, ioprio_class); /* * if next expires before rq, assign its expire time to rq @@ -370,6 +368,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, { struct request *rq, *next_rq; enum dd_data_dir data_dir; + struct dd_blkcg *blkcg; enum dd_prio prio; u8 ioprio_class; @@ -464,6 +463,8 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, ioprio_class = dd_rq_ioclass(rq); prio = ioprio_class_to_prio[ioprio_class]; dd_count(dd, dispatched, prio); + blkcg = rq->elv.priv[0]; + ddcg_count(blkcg, dispatched, ioprio_class); /* * If the request needs its target zone locked, do it. */ @@ -540,6 +541,8 @@ static void dd_exit_sched(struct elevator_queue *e) struct deadline_data *dd = e->elevator_data; enum dd_prio prio; + dd_deactivate_policy(dd->queue); + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; @@ -553,7 +556,7 @@ static void dd_exit_sched(struct elevator_queue *e) } /* - * initialize elevator private data (deadline_data). + * Initialize elevator private data (deadline_data) and associate with blkcg. */ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) { @@ -562,6 +565,12 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) enum dd_prio prio; int ret = -ENOMEM; + /* + * Initialization would be very tricky if the queue is not frozen, + * hence the warning statement below. + */ + WARN_ON_ONCE(!percpu_ref_is_zero(&q->q_usage_counter)); + eq = elevator_alloc(q, e); if (!eq) return ret; @@ -577,6 +586,8 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) if (!dd->stats) goto free_dd; + dd->queue = q; + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; @@ -595,9 +606,17 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) spin_lock_init(&dd->lock); spin_lock_init(&dd->zone_lock); + ret = dd_activate_policy(q); + if (ret) + goto free_stats; + + ret = 0; q->elevator = eq; return 0; +free_stats: + free_percpu(dd->stats); + free_dd: kfree(dd); @@ -670,6 +689,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio); struct dd_per_prio *per_prio; enum dd_prio prio; + struct dd_blkcg *blkcg; lockdep_assert_held(&dd->lock); @@ -679,8 +699,19 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, */ blk_req_zone_write_unlock(rq); + /* + * If a block cgroup has been associated with the submitter and if an + * I/O priority has been set in the associated block cgroup, use the + * lowest of the cgroup priority and the request priority for the + * request. If no priority has been set in the request, use the cgroup + * priority. + */ prio = ioprio_class_to_prio[ioprio_class]; dd_count(dd, inserted, prio); + blkcg = dd_blkcg_from_bio(rq->bio); + ddcg_count(blkcg, inserted, ioprio_class); + WARN_ON_ONCE(rq->elv.priv[0]); + rq->elv.priv[0] = blkcg; if (blk_mq_sched_try_insert_merge(q, rq)) return; @@ -727,12 +758,10 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, spin_unlock(&dd->lock); } -/* - * Nothing to do here. This is defined only to ensure that .finish_request - * method is called upon request completion. - */ +/* Callback from inside blk_mq_rq_ctx_init(). */ static void dd_prepare_request(struct request *rq) { + rq->elv.priv[0] = NULL; } /* @@ -755,11 +784,13 @@ static void dd_finish_request(struct request *rq) { struct request_queue *q = rq->q; struct deadline_data *dd = q->elevator->elevator_data; + struct dd_blkcg *blkcg = rq->elv.priv[0]; const u8 ioprio_class = dd_rq_ioclass(rq); const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; struct dd_per_prio *per_prio = &dd->per_prio[prio]; dd_count(dd, completed, prio); + ddcg_count(blkcg, completed, ioprio_class); if (blk_queue_is_zoned(q)) { unsigned long flags; @@ -1079,11 +1110,26 @@ MODULE_ALIAS("mq-deadline-iosched"); static int __init deadline_init(void) { - return elv_register(&mq_deadline); + int ret; + + ret = elv_register(&mq_deadline); + if (ret) + goto out; + ret = dd_blkcg_init(); + if (ret) + goto unreg; + +out: + return ret; + +unreg: + elv_unregister(&mq_deadline); + goto out; } static void __exit deadline_exit(void) { + dd_blkcg_exit(); elv_unregister(&mq_deadline); }