Re: [PATCH 1/3] blk-throttle: support io delay stats

Ping...

Thanks,
Joseph

On 19/7/25 11:03, Joseph Qi wrote:
> Add blkio.throttle.io_service_time and blkio.throttle.io_wait_time to
> expose per-cgroup I/O delay statistics from the blk-throttle layer.
> io_service_time is the time spent from leaving the throttle layer until
> I/O completion, while io_wait_time is the time spent waiting on the
> throttle queue.
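> 
> For example, reading the new files might look like this (illustrative
> values in nanoseconds for a hypothetical cgroup "test", assuming the
> blkio controller is mounted at /sys/fs/cgroup/blkio; output follows the
> usual per-device blkg_rwstat format):
> 
>   # cat /sys/fs/cgroup/blkio/test/blkio.throttle.io_service_time
>   254:48 Read 163498720
>   254:48 Write 80213456
>   254:48 Sync 163498720
>   254:48 Async 80213456
>   254:48 Total 243712176
>   Total 243712176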
> 
> Signed-off-by: Joseph Qi <joseph.qi@xxxxxxxxxxxxxxxxx>
> ---
>  block/bio.c               |   4 ++
>  block/blk-throttle.c      | 130 +++++++++++++++++++++++++++++++++++++++++++++-
>  include/linux/blk_types.h |  34 ++++++++++++
>  3 files changed, 167 insertions(+), 1 deletion(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index 299a0e7..3206462 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -1826,6 +1826,10 @@ void bio_endio(struct bio *bio)
>  	blk_throtl_bio_endio(bio);
>  	/* release cgroup info */
>  	bio_uninit(bio);
> +#ifdef CONFIG_BLK_DEV_THROTTLING
> +	if (bio->bi_tg_end_io)
> +		bio->bi_tg_end_io(bio);
> +#endif
>  	if (bio->bi_end_io)
>  		bio->bi_end_io(bio);
>  }
> diff --git a/block/blk-throttle.c b/block/blk-throttle.c
> index 8ab6c81..a5880f0 100644
> --- a/block/blk-throttle.c
> +++ b/block/blk-throttle.c
> @@ -176,6 +176,11 @@ struct throtl_grp {
>  	unsigned int bio_cnt; /* total bios */
>  	unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
>  	unsigned long bio_cnt_reset_time;
> +
> +	/* total time spent in lower layers: scheduler, device, etc. */
> +	struct blkg_rwstat service_time;
> +	/* total time spent waiting in blk-throttle */
> +	struct blkg_rwstat wait_time;
>  };
>  
>  /* We measure latency for request size from <= 4k to >= 1M */
> @@ -487,6 +492,10 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
>  	if (!tg)
>  		return NULL;
>  
> +	if (blkg_rwstat_init(&tg->service_time, gfp) ||
> +	    blkg_rwstat_init(&tg->wait_time, gfp))
> +		goto err;
> +
>  	throtl_service_queue_init(&tg->service_queue);
>  
>  	for (rw = READ; rw <= WRITE; rw++) {
> @@ -511,6 +520,12 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
>  	tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD;
>  
>  	return &tg->pd;
> +
> +err:
> +	blkg_rwstat_exit(&tg->service_time);
> +	blkg_rwstat_exit(&tg->wait_time);
> +	kfree(tg);
> +	return NULL;
>  }
>  
>  static void throtl_pd_init(struct blkg_policy_data *pd)
> @@ -592,6 +607,8 @@ static void blk_throtl_update_limit_valid(struct throtl_data *td)
>  static void throtl_pd_offline(struct blkg_policy_data *pd)
>  {
>  	struct throtl_grp *tg = pd_to_tg(pd);
> +	struct blkcg_gq *blkg = pd_to_blkg(pd);
> +	struct blkcg_gq *parent = blkg->parent;
>  
>  	tg->bps[READ][LIMIT_LOW] = 0;
>  	tg->bps[WRITE][LIMIT_LOW] = 0;
> @@ -602,6 +619,12 @@ static void throtl_pd_offline(struct blkg_policy_data *pd)
>  
>  	if (!tg->td->limit_valid[tg->td->limit_index])
>  		throtl_upgrade_state(tg->td);
> +	if (parent) {
> +		blkg_rwstat_add_aux(&blkg_to_tg(parent)->service_time,
> +				    &tg->service_time);
> +		blkg_rwstat_add_aux(&blkg_to_tg(parent)->wait_time,
> +				    &tg->wait_time);
> +	}
>  }
>  
>  static void throtl_pd_free(struct blkg_policy_data *pd)
> @@ -609,9 +632,19 @@ static void throtl_pd_free(struct blkg_policy_data *pd)
>  	struct throtl_grp *tg = pd_to_tg(pd);
>  
>  	del_timer_sync(&tg->service_queue.pending_timer);
> +	blkg_rwstat_exit(&tg->service_time);
> +	blkg_rwstat_exit(&tg->wait_time);
>  	kfree(tg);
>  }
>  
> +static void throtl_pd_reset(struct blkg_policy_data *pd)
> +{
> +	struct throtl_grp *tg = pd_to_tg(pd);
> +
> +	blkg_rwstat_reset(&tg->service_time);
> +	blkg_rwstat_reset(&tg->wait_time);
> +}
> +
>  static struct throtl_grp *
>  throtl_rb_first(struct throtl_service_queue *parent_sq)
>  {
> @@ -1019,6 +1052,64 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
>  	return false;
>  }
>  
> +static void throtl_stats_update_completion(struct throtl_grp *tg,
> +					   uint64_t start_time,
> +					   uint64_t io_start_time,
> +					   int op)
> +{
> +	unsigned long flags;
> +	uint64_t now = sched_clock();
> +
> +	local_irq_save(flags);
> +	if (time_after64(now, io_start_time))
> +		blkg_rwstat_add(&tg->service_time, op, now - io_start_time);
> +	if (time_after64(io_start_time, start_time))
> +		blkg_rwstat_add(&tg->wait_time, op, io_start_time - start_time);
> +	local_irq_restore(flags);
> +}
> +
> +static void throtl_bio_end_io(struct bio *bio)
> +{
> +	struct throtl_grp *tg;
> +
> +	rcu_read_lock();
> +	/* see comments in throtl_bio_stats_start() */
> +	if (!bio_flagged(bio, BIO_THROTL_STATED))
> +		goto out;
> +
> +	tg = (struct throtl_grp *)bio->bi_tg_private;
> +	if (!tg)
> +		goto out;
> +
> +	throtl_stats_update_completion(tg, bio_start_time_ns(bio),
> +				       bio_io_start_time_ns(bio),
> +				       bio_op(bio));
> +	blkg_put(tg_to_blkg(tg));
> +	bio_clear_flag(bio, BIO_THROTL_STATED);
> +out:
> +	rcu_read_unlock();
> +}
> +
> +static inline void throtl_bio_stats_start(struct bio *bio, struct throtl_grp *tg)
> +{
> +	int op = bio_op(bio);
> +
> +	/*
> +	 * end_io may be called twice for stacked drivers such as dm-thin,
> +	 * which saves the original end_io, calls its own end_io first and
> +	 * then the saved one. Use the BIO_THROTL_STATED flag to make sure
> +	 * the statistics are accounted only once.
> +	 */
> +	if ((op == REQ_OP_READ || op == REQ_OP_WRITE) &&
> +	    !bio_flagged(bio, BIO_THROTL_STATED)) {
> +		blkg_get(tg_to_blkg(tg));
> +		bio_set_flag(bio, BIO_THROTL_STATED);
> +		bio->bi_tg_end_io = throtl_bio_end_io;
> +		bio->bi_tg_private = tg;
> +		bio_set_start_time_ns(bio);
> +	}
> +}
> +
>  static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
>  {
>  	bool rw = bio_data_dir(bio);
> @@ -1462,6 +1553,25 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
>  	return tg_set_conf(of, buf, nbytes, off, false);
>  }
>  
> +static u64 tg_prfill_rwstat_field(struct seq_file *sf,
> +				  struct blkg_policy_data *pd,
> +				  int off)
> +{
> +	struct throtl_grp *tg = pd_to_tg(pd);
> +	struct blkg_rwstat_sample rwstat = { };
> +
> +	blkg_rwstat_read((void *)tg + off, &rwstat);
> +	return __blkg_prfill_rwstat(sf, pd, &rwstat);
> +}
> +
> +static int tg_print_rwstat(struct seq_file *sf, void *v)
> +{
> +	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
> +			  tg_prfill_rwstat_field, &blkcg_policy_throtl,
> +			  seq_cft(sf)->private, true);
> +	return 0;
> +}
> +
>  static struct cftype throtl_legacy_files[] = {
>  	{
>  		.name = "throttle.read_bps_device",
> @@ -1507,6 +1617,16 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
>  		.private = (unsigned long)&blkcg_policy_throtl,
>  		.seq_show = blkg_print_stat_ios_recursive,
>  	},
> +	{
> +		.name = "throttle.io_service_time",
> +		.private = offsetof(struct throtl_grp, service_time),
> +		.seq_show = tg_print_rwstat,
> +	},
> +	{
> +		.name = "throttle.io_wait_time",
> +		.private = offsetof(struct throtl_grp, wait_time),
> +		.seq_show = tg_print_rwstat,
> +	},
>  	{ }	/* terminate */
>  };
>  
> @@ -1732,6 +1852,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
>  	.pd_online_fn		= throtl_pd_online,
>  	.pd_offline_fn		= throtl_pd_offline,
>  	.pd_free_fn		= throtl_pd_free,
> +	.pd_reset_stats_fn	= throtl_pd_reset,
>  };
>  
>  static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
> @@ -2125,7 +2246,12 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
>  	WARN_ON_ONCE(!rcu_read_lock_held());
>  
>  	/* see throtl_charge_bio() */
> -	if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw])
> +	if (bio_flagged(bio, BIO_THROTTLED))
> +		goto out;
> +
> +	throtl_bio_stats_start(bio, tg);
> +
> +	if (!tg->has_rules[rw])
>  		goto out;
>  
>  	spin_lock_irq(&q->queue_lock);
> @@ -2212,6 +2338,8 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
>  out_unlock:
>  	spin_unlock_irq(&q->queue_lock);
>  out:
> +	if (!throttled)
> +		bio_set_io_start_time_ns(bio);
>  	bio_set_flag(bio, BIO_THROTTLED);
>  
>  #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index feff3fe..6906bc6 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -9,6 +9,7 @@
>  #include <linux/types.h>
>  #include <linux/bvec.h>
>  #include <linux/ktime.h>
> +#include <linux/sched/clock.h>
>  
>  struct bio_set;
>  struct bio;
> @@ -169,6 +170,12 @@ struct bio {
>  	 */
>  	struct blkcg_gq		*bi_blkg;
>  	struct bio_issue	bi_issue;
> +#ifdef CONFIG_BLK_DEV_THROTTLING
> +	unsigned long long	start_time_ns;	/* when the bio entered blk-throttle */
> +	unsigned long long	io_start_time_ns;	/* when throttling of the bio finished */
> +	bio_end_io_t		*bi_tg_end_io;
> +	void			*bi_tg_private;
> +#endif
>  #endif
>  	union {
>  #if defined(CONFIG_BLK_DEV_INTEGRITY)
> @@ -218,6 +225,7 @@ enum {
>  				 * of this bio. */
>  	BIO_QUEUE_ENTERED,	/* can use blk_queue_enter_live() */
>  	BIO_TRACKED,		/* set if bio goes through the rq_qos path */
> +	BIO_THROTL_STATED,	/* bio already accounted in throttle stats */
>  	BIO_FLAG_LAST
>  };
>  
> @@ -248,6 +256,32 @@ enum {
>   */
>  #define BIO_RESET_BITS	BVEC_POOL_OFFSET
>  
> +#ifdef CONFIG_BLK_DEV_THROTTLING
> +static inline void bio_set_start_time_ns(struct bio *bio)
> +{
> +	preempt_disable();
> +	bio->start_time_ns = sched_clock();
> +	preempt_enable();
> +}
> +
> +static inline void bio_set_io_start_time_ns(struct bio *bio)
> +{
> +	preempt_disable();
> +	bio->io_start_time_ns = sched_clock();
> +	preempt_enable();
> +}
> +
> +static inline uint64_t bio_start_time_ns(struct bio *bio)
> +{
> +	return bio->start_time_ns;
> +}
> +
> +static inline uint64_t bio_io_start_time_ns(struct bio *bio)
> +{
> +	return bio->io_start_time_ns;
> +}
> +#endif
> +
>  typedef __u32 __bitwise blk_mq_req_flags_t;
>  
>  /*
> 


