Ping... Thanks, Joseph On 19/7/25 11:03, Joseph Qi wrote: > Add blkio.throttle.io_service_time and blkio.throttle.io_wait_time to > get per-cgroup io delay statistics in the blk-throttle layer. > io_service_time represents the time from when an io leaves the > throttle layer until it completes, while io_wait_time represents the > time an io spends on the throttle queue. > > Signed-off-by: Joseph Qi <joseph.qi@xxxxxxxxxxxxxxxxx> > --- > block/bio.c | 4 ++ > block/blk-throttle.c | 130 +++++++++++++++++++++++++++++++++++++++++++++- > include/linux/blk_types.h | 34 ++++++++++++ > 3 files changed, 167 insertions(+), 1 deletion(-) > > diff --git a/block/bio.c b/block/bio.c > index 299a0e7..3206462 100644 > --- a/block/bio.c > +++ b/block/bio.c > @@ -1826,6 +1826,10 @@ void bio_endio(struct bio *bio) > blk_throtl_bio_endio(bio); > /* release cgroup info */ > bio_uninit(bio); > +#ifdef CONFIG_BLK_DEV_THROTTLING > + if (bio->bi_tg_end_io) > + bio->bi_tg_end_io(bio); > +#endif > if (bio->bi_end_io) > bio->bi_end_io(bio); > } > diff --git a/block/blk-throttle.c b/block/blk-throttle.c > index 8ab6c81..a5880f0 100644 > --- a/block/blk-throttle.c > +++ b/block/blk-throttle.c > @@ -176,6 +176,11 @@ struct throtl_grp { > unsigned int bio_cnt; /* total bios */ > unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ > unsigned long bio_cnt_reset_time; > + > + /* total time spent on lower layer: scheduler, device and others */ > + struct blkg_rwstat service_time; > + /* total time spent on block throttle */ > + struct blkg_rwstat wait_time; > }; > > /* We measure latency for request size from <= 4k to >= 1M */ > @@ -487,6 +492,10 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) > if (!tg) > return NULL; > > + if (blkg_rwstat_init(&tg->service_time, gfp) || > + blkg_rwstat_init(&tg->wait_time, gfp)) > + goto err; > + > throtl_service_queue_init(&tg->service_queue); > > for (rw = READ; rw <= WRITE; rw++) { > @@ -511,6 +520,12 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, 
int node) > tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD; > > return &tg->pd; > + > +err: > + blkg_rwstat_exit(&tg->service_time); > + blkg_rwstat_exit(&tg->wait_time); > + kfree(tg); > + return NULL; > } > > static void throtl_pd_init(struct blkg_policy_data *pd) > @@ -592,6 +607,8 @@ static void blk_throtl_update_limit_valid(struct throtl_data *td) > static void throtl_pd_offline(struct blkg_policy_data *pd) > { > struct throtl_grp *tg = pd_to_tg(pd); > + struct blkcg_gq *blkg = pd_to_blkg(pd); > + struct blkcg_gq *parent = blkg->parent; > > tg->bps[READ][LIMIT_LOW] = 0; > tg->bps[WRITE][LIMIT_LOW] = 0; > @@ -602,6 +619,12 @@ static void throtl_pd_offline(struct blkg_policy_data *pd) > > if (!tg->td->limit_valid[tg->td->limit_index]) > throtl_upgrade_state(tg->td); > + if (parent) { > + blkg_rwstat_add_aux(&blkg_to_tg(parent)->service_time, > + &tg->service_time); > + blkg_rwstat_add_aux(&blkg_to_tg(parent)->wait_time, > + &tg->wait_time); > + } > } > > static void throtl_pd_free(struct blkg_policy_data *pd) > @@ -609,9 +632,19 @@ static void throtl_pd_free(struct blkg_policy_data *pd) > struct throtl_grp *tg = pd_to_tg(pd); > > del_timer_sync(&tg->service_queue.pending_timer); > + blkg_rwstat_exit(&tg->service_time); > + blkg_rwstat_exit(&tg->wait_time); > kfree(tg); > } > > +static void throtl_pd_reset(struct blkg_policy_data *pd) > +{ > + struct throtl_grp *tg = pd_to_tg(pd); > + > + blkg_rwstat_reset(&tg->service_time); > + blkg_rwstat_reset(&tg->wait_time); > +} > + > static struct throtl_grp * > throtl_rb_first(struct throtl_service_queue *parent_sq) > { > @@ -1019,6 +1052,64 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, > return false; > } > > +static void throtl_stats_update_completion(struct throtl_grp *tg, > + uint64_t start_time, > + uint64_t io_start_time, > + int op) > +{ > + unsigned long flags; > + uint64_t now = sched_clock(); > + > + local_irq_save(flags); > + if (time_after64(now, io_start_time)) > + 
blkg_rwstat_add(&tg->service_time, op, now - io_start_time); > + if (time_after64(io_start_time, start_time)) > + blkg_rwstat_add(&tg->wait_time, op, io_start_time - start_time); > + local_irq_restore(flags); > +} > + > +static void throtl_bio_end_io(struct bio *bio) > +{ > + struct throtl_grp *tg; > + > + rcu_read_lock(); > + /* see comments in throtl_bio_stats_start() */ > + if (bio_flagged(bio, BIO_THROTL_STATED)) > + goto out; > + > + tg = (struct throtl_grp *)bio->bi_tg_private; > + if (!tg) > + goto out; > + > + throtl_stats_update_completion(tg, bio_start_time_ns(bio), > + bio_io_start_time_ns(bio), > + bio_op(bio)); > + blkg_put(tg_to_blkg(tg)); > + bio_clear_flag(bio, BIO_THROTL_STATED); > +out: > + rcu_read_unlock(); > +} > + > +static inline void throtl_bio_stats_start(struct bio *bio, struct throtl_grp *tg) > +{ > + int op = bio_op(bio); > + > + /* > + * It may happen that end_io will be called twice like dm-thin, > + * which will save origin end_io first, and call its overwrite > + * end_io and then the saved end_io. We use bio flag > + * BIO_THROTL_STATED to do only once statistics. 
> + */ > + if ((op == REQ_OP_READ || op == REQ_OP_WRITE) && > + !bio_flagged(bio, BIO_THROTL_STATED)) { > + blkg_get(tg_to_blkg(tg)); > + bio_set_flag(bio, BIO_THROTL_STATED); > + bio->bi_tg_end_io = throtl_bio_end_io; > + bio->bi_tg_private = tg; > + bio_set_start_time_ns(bio); > + } > +} > + > static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) > { > bool rw = bio_data_dir(bio); > @@ -1462,6 +1553,25 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, > return tg_set_conf(of, buf, nbytes, off, false); > } > > +static u64 tg_prfill_rwstat_field(struct seq_file *sf, > + struct blkg_policy_data *pd, > + int off) > +{ > + struct throtl_grp *tg = pd_to_tg(pd); > + struct blkg_rwstat_sample rwstat = { }; > + > + blkg_rwstat_read((void *)tg + off, &rwstat); > + return __blkg_prfill_rwstat(sf, pd, &rwstat); > +} > + > +static int tg_print_rwstat(struct seq_file *sf, void *v) > +{ > + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), > + tg_prfill_rwstat_field, &blkcg_policy_throtl, > + seq_cft(sf)->private, true); > + return 0; > +} > + > static struct cftype throtl_legacy_files[] = { > { > .name = "throttle.read_bps_device", > @@ -1507,6 +1617,16 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, > .private = (unsigned long)&blkcg_policy_throtl, > .seq_show = blkg_print_stat_ios_recursive, > }, > + { > + .name = "throttle.io_service_time", > + .private = offsetof(struct throtl_grp, service_time), > + .seq_show = tg_print_rwstat, > + }, > + { > + .name = "throttle.io_wait_time", > + .private = offsetof(struct throtl_grp, wait_time), > + .seq_show = tg_print_rwstat, > + }, > { } /* terminate */ > }; > > @@ -1732,6 +1852,7 @@ static void throtl_shutdown_wq(struct request_queue *q) > .pd_online_fn = throtl_pd_online, > .pd_offline_fn = throtl_pd_offline, > .pd_free_fn = throtl_pd_free, > + .pd_reset_stats_fn = throtl_pd_reset, > }; > > static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg) > @@ -2125,7 +2246,12 @@ 
bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, > WARN_ON_ONCE(!rcu_read_lock_held()); > > /* see throtl_charge_bio() */ > - if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw]) > + if (bio_flagged(bio, BIO_THROTTLED)) > + goto out; > + > + throtl_bio_stats_start(bio, tg); > + > + if (!tg->has_rules[rw]) > goto out; > > spin_lock_irq(&q->queue_lock); > @@ -2212,6 +2338,8 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, > out_unlock: > spin_unlock_irq(&q->queue_lock); > out: > + if (!throttled) > + bio_set_io_start_time_ns(bio); > bio_set_flag(bio, BIO_THROTTLED); > > #ifdef CONFIG_BLK_DEV_THROTTLING_LOW > diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h > index feff3fe..6906bc6 100644 > --- a/include/linux/blk_types.h > +++ b/include/linux/blk_types.h > @@ -9,6 +9,7 @@ > #include <linux/types.h> > #include <linux/bvec.h> > #include <linux/ktime.h> > +#include <linux/sched/clock.h> > > struct bio_set; > struct bio; > @@ -169,6 +170,12 @@ struct bio { > */ > struct blkcg_gq *bi_blkg; > struct bio_issue bi_issue; > +#ifdef CONFIG_BLK_DEV_THROTTLING > + unsigned long long start_time_ns; /* when passed to block throttle */ > + unsigned long long io_start_time_ns; /* when no more throttle */ > + bio_end_io_t *bi_tg_end_io; > + void *bi_tg_private; > +#endif > #endif > union { > #if defined(CONFIG_BLK_DEV_INTEGRITY) > @@ -218,6 +225,7 @@ enum { > * of this bio. 
*/ > BIO_QUEUE_ENTERED, /* can use blk_queue_enter_live() */ > BIO_TRACKED, /* set if bio goes through the rq_qos path */ > + BIO_THROTL_STATED, /* bio already stated */ > BIO_FLAG_LAST > }; > > @@ -248,6 +256,32 @@ enum { > */ > #define BIO_RESET_BITS BVEC_POOL_OFFSET > > +#ifdef CONFIG_BLK_DEV_THROTTLING > +static inline void bio_set_start_time_ns(struct bio *bio) > +{ > + preempt_disable(); > + bio->start_time_ns = sched_clock(); > + preempt_enable(); > +} > + > +static inline void bio_set_io_start_time_ns(struct bio *bio) > +{ > + preempt_disable(); > + bio->io_start_time_ns = sched_clock(); > + preempt_enable(); > +} > + > +static inline uint64_t bio_start_time_ns(struct bio *bio) > +{ > + return bio->start_time_ns; > +} > + > +static inline uint64_t bio_io_start_time_ns(struct bio *bio) > +{ > + return bio->io_start_time_ns; > +} > +#endif > + > typedef __u32 __bitwise blk_mq_req_flags_t; > > /* >