Add blkio.throttle.io_service_time and blkio.throttle.io_wait_time to
export per-cgroup io delay statistics from the blk-throttle layer.

io_service_time is the time from the point a bio is no longer held by
blk-throttle until the bio completes, while io_wait_time is the time
the bio spent on the throttle queue.

Signed-off-by: Joseph Qi <joseph.qi@xxxxxxxxxxxxxxxxx>
---
Review note: throtl_bio_end_io() has to bail out when BIO_THROTL_STATED
is *not* set.  throtl_bio_stats_start() sets the flag on every bio it
accounts, so testing for the flag being set skipped the statistics
update for every bio and never dropped the blkg reference taken by
blkg_get().

 block/bio.c               |   4 ++
 block/blk-throttle.c      | 130 +++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/blk_types.h |  34 ++++++++++++
 3 files changed, 167 insertions(+), 1 deletion(-)

diff --git a/block/bio.c b/block/bio.c
index 299a0e7..3206462 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1826,6 +1826,10 @@ void bio_endio(struct bio *bio)
 	blk_throtl_bio_endio(bio);
 	/* release cgroup info */
 	bio_uninit(bio);
+#ifdef CONFIG_BLK_DEV_THROTTLING
+	if (bio->bi_tg_end_io)
+		bio->bi_tg_end_io(bio);
+#endif
 	if (bio->bi_end_io)
 		bio->bi_end_io(bio);
 }
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 8ab6c81..a5880f0 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -176,6 +176,11 @@ struct throtl_grp {
 	unsigned int bio_cnt; /* total bios */
 	unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
 	unsigned long bio_cnt_reset_time;
+
+	/* total time spent on lower layer: scheduler, device and others */
+	struct blkg_rwstat service_time;
+	/* total time spent on block throttle */
+	struct blkg_rwstat wait_time;
 };
 
 /* We measure latency for request size from <= 4k to >= 1M */
@@ -487,6 +492,10 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
 	if (!tg)
 		return NULL;
 
+	if (blkg_rwstat_init(&tg->service_time, gfp) ||
+	    blkg_rwstat_init(&tg->wait_time, gfp))
+		goto err;
+
 	throtl_service_queue_init(&tg->service_queue);
 
 	for (rw = READ; rw <= WRITE; rw++) {
@@ -511,6 +520,12 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
 	tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD;
 
 	return &tg->pd;
+
+err:
+	blkg_rwstat_exit(&tg->service_time);
+	blkg_rwstat_exit(&tg->wait_time);
+	kfree(tg);
+	return NULL;
 }
 
 static void throtl_pd_init(struct blkg_policy_data *pd)
@@ -592,6 +607,8 @@ static void blk_throtl_update_limit_valid(struct throtl_data *td)
 static void throtl_pd_offline(struct blkg_policy_data *pd)
 {
 	struct throtl_grp *tg = pd_to_tg(pd);
+	struct blkcg_gq *blkg = pd_to_blkg(pd);
+	struct blkcg_gq *parent = blkg->parent;
 
 	tg->bps[READ][LIMIT_LOW] = 0;
 	tg->bps[WRITE][LIMIT_LOW] = 0;
@@ -602,6 +619,12 @@ static void throtl_pd_offline(struct blkg_policy_data *pd)
 
 	if (!tg->td->limit_valid[tg->td->limit_index])
 		throtl_upgrade_state(tg->td);
+	if (parent) {
+		blkg_rwstat_add_aux(&blkg_to_tg(parent)->service_time,
+				    &tg->service_time);
+		blkg_rwstat_add_aux(&blkg_to_tg(parent)->wait_time,
+				    &tg->wait_time);
+	}
 }
 
 static void throtl_pd_free(struct blkg_policy_data *pd)
@@ -609,9 +632,19 @@ static void throtl_pd_free(struct blkg_policy_data *pd)
 	struct throtl_grp *tg = pd_to_tg(pd);
 
 	del_timer_sync(&tg->service_queue.pending_timer);
+	blkg_rwstat_exit(&tg->service_time);
+	blkg_rwstat_exit(&tg->wait_time);
 	kfree(tg);
 }
 
+static void throtl_pd_reset(struct blkg_policy_data *pd)
+{
+	struct throtl_grp *tg = pd_to_tg(pd);
+
+	blkg_rwstat_reset(&tg->service_time);
+	blkg_rwstat_reset(&tg->wait_time);
+}
+
 static struct throtl_grp *
 throtl_rb_first(struct throtl_service_queue *parent_sq)
 {
@@ -1019,6 +1052,64 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 	return false;
 }
 
+static void throtl_stats_update_completion(struct throtl_grp *tg,
+					   uint64_t start_time,
+					   uint64_t io_start_time,
+					   int op)
+{
+	unsigned long flags;
+	uint64_t now = sched_clock();
+
+	local_irq_save(flags);
+	if (time_after64(now, io_start_time))
+		blkg_rwstat_add(&tg->service_time, op, now - io_start_time);
+	if (time_after64(io_start_time, start_time))
+		blkg_rwstat_add(&tg->wait_time, op, io_start_time - start_time);
+	local_irq_restore(flags);
+}
+
+static void throtl_bio_end_io(struct bio *bio)
+{
+	struct throtl_grp *tg;
+
+	rcu_read_lock();
+	/* account once per bio; see comments in throtl_bio_stats_start() */
+	if (!bio_flagged(bio, BIO_THROTL_STATED))
+		goto out;
+
+	tg = (struct throtl_grp *)bio->bi_tg_private;
+	if (!tg)
+		goto out;
+
+	throtl_stats_update_completion(tg, bio_start_time_ns(bio),
+				       bio_io_start_time_ns(bio),
+				       bio_op(bio));
+	blkg_put(tg_to_blkg(tg));
+	bio_clear_flag(bio, BIO_THROTL_STATED);
+out:
+	rcu_read_unlock();
+}
+
+static inline void throtl_bio_stats_start(struct bio *bio, struct throtl_grp *tg)
+{
+	int op = bio_op(bio);
+
+	/*
+	 * It may happen that end_io will be called twice like dm-thin,
+	 * which will save origin end_io first, and call its overwrite
+	 * end_io and then the saved end_io. We use bio flag
+	 * BIO_THROTL_STATED to do only once statistics.
+	 */
+	if ((op == REQ_OP_READ || op == REQ_OP_WRITE) &&
+	    !bio_flagged(bio, BIO_THROTL_STATED)) {
+		blkg_get(tg_to_blkg(tg));
+		bio_set_flag(bio, BIO_THROTL_STATED);
+		bio->bi_tg_end_io = throtl_bio_end_io;
+		bio->bi_tg_private = tg;
+		bio_set_start_time_ns(bio);
+	}
+}
+
 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 {
 	bool rw = bio_data_dir(bio);
@@ -1462,6 +1553,25 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
 	return tg_set_conf(of, buf, nbytes, off, false);
 }
 
+static u64 tg_prfill_rwstat_field(struct seq_file *sf,
+				  struct blkg_policy_data *pd,
+				  int off)
+{
+	struct throtl_grp *tg = pd_to_tg(pd);
+	struct blkg_rwstat_sample rwstat = { };
+
+	blkg_rwstat_read((void *)tg + off, &rwstat);
+	return __blkg_prfill_rwstat(sf, pd, &rwstat);
+}
+
+static int tg_print_rwstat(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  tg_prfill_rwstat_field, &blkcg_policy_throtl,
+			  seq_cft(sf)->private, true);
+	return 0;
+}
+
 static struct cftype throtl_legacy_files[] = {
 	{
 		.name = "throttle.read_bps_device",
@@ -1507,6 +1617,16 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
 		.private = (unsigned long)&blkcg_policy_throtl,
 		.seq_show = blkg_print_stat_ios_recursive,
 	},
+	{
+		.name = "throttle.io_service_time",
+		.private = offsetof(struct throtl_grp, service_time),
+		.seq_show = tg_print_rwstat,
+	},
+	{
+		.name = "throttle.io_wait_time",
+		.private = offsetof(struct throtl_grp, wait_time),
+		.seq_show = tg_print_rwstat,
+	},
 	{ }	/* terminate */
 };
 
@@ -1732,6 +1852,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
 	.pd_online_fn		= throtl_pd_online,
 	.pd_offline_fn		= throtl_pd_offline,
 	.pd_free_fn		= throtl_pd_free,
+	.pd_reset_stats_fn	= throtl_pd_reset,
 };
 
 static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
@@ -2125,7 +2246,12 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 	WARN_ON_ONCE(!rcu_read_lock_held());
 
 	/* see throtl_charge_bio() */
-	if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw])
+	if (bio_flagged(bio, BIO_THROTTLED))
+		goto out;
+
+	throtl_bio_stats_start(bio, tg);
+
+	if (!tg->has_rules[rw])
 		goto out;
 
 	spin_lock_irq(&q->queue_lock);
@@ -2212,6 +2338,8 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 out_unlock:
 	spin_unlock_irq(&q->queue_lock);
 out:
+	if (!throttled)
+		bio_set_io_start_time_ns(bio);
 	bio_set_flag(bio, BIO_THROTTLED);
 
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index feff3fe..6906bc6 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -9,6 +9,7 @@
 #include <linux/types.h>
 #include <linux/bvec.h>
 #include <linux/ktime.h>
+#include <linux/sched/clock.h>
 
 struct bio_set;
 struct bio;
@@ -169,6 +170,12 @@ struct bio {
 	 */
 	struct blkcg_gq		*bi_blkg;
 	struct bio_issue	bi_issue;
+#ifdef CONFIG_BLK_DEV_THROTTLING
+	unsigned long long	start_time_ns;	/* when passed to block throttle */
+	unsigned long long	io_start_time_ns;	/* when no more throttle */
+	bio_end_io_t		*bi_tg_end_io;
+	void			*bi_tg_private;
+#endif
 #endif
 	union {
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
@@ -218,6 +225,7 @@ enum {
 				 * of this bio. */
 	BIO_QUEUE_ENTERED,	/* can use blk_queue_enter_live() */
 	BIO_TRACKED,		/* set if bio goes through the rq_qos path */
+	BIO_THROTL_STATED,	/* bio already stated */
 	BIO_FLAG_LAST
 };
 
@@ -248,6 +256,32 @@ enum {
  */
 #define BIO_RESET_BITS	BVEC_POOL_OFFSET
 
+#ifdef CONFIG_BLK_DEV_THROTTLING
+static inline void bio_set_start_time_ns(struct bio *bio)
+{
+	preempt_disable();
+	bio->start_time_ns = sched_clock();
+	preempt_enable();
+}
+
+static inline void bio_set_io_start_time_ns(struct bio *bio)
+{
+	preempt_disable();
+	bio->io_start_time_ns = sched_clock();
+	preempt_enable();
+}
+
+static inline uint64_t bio_start_time_ns(struct bio *bio)
+{
+	return bio->start_time_ns;
+}
+
+static inline uint64_t bio_io_start_time_ns(struct bio *bio)
+{
+	return bio->io_start_time_ns;
+}
+#endif
+
 typedef __u32 __bitwise blk_mq_req_flags_t;
 
 /*
-- 
1.8.3.1