From: Yu Kuai <yukuai3@xxxxxxxxxx> wait_time is based on jiffies, and the calculation is: wait_time = extra_bytes * HZ / bps_limit; If wait time is not zero, the remainder is ignored, means wait time is short by at most 1 jiffes; On the other hand, if wait time is zero, it will be used as 1 jiffies, means it's excessive by at most 1 jiffes. This way causes blktests throtl/001 failure in case of CONFIG_HZ_100=y, fix the problem by recording remainder as debt, and repay the debt later. Reported-and-tested-by: Ming Lei <ming.lei@xxxxxxxxxx> Closes: https://lore.kernel.org/all/20250220111735.1187999-1-ming.lei@xxxxxxxxxx/ Signed-off-by: Yu Kuai <yukuai3@xxxxxxxxxx> --- block/blk-throttle.c | 24 +++++++++++++++++------- block/blk-throttle.h | 12 ++++++++---- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 0096e486b1e3..3828c6535605 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -703,9 +703,10 @@ static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio } static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, - u64 bps_limit) + u64 bps_limit, bool *has_debt) { bool rw = bio_data_dir(bio); + long long carryover_bytes; long long bytes_allowed; u64 extra_bytes; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; @@ -730,10 +731,16 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, /* Calc approx time to dispatch */ extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed; - jiffy_wait = div64_u64(extra_bytes * HZ, bps_limit); - - if (!jiffy_wait) - jiffy_wait = 1; + jiffy_wait = div64_u64_rem(extra_bytes * HZ, bps_limit, &carryover_bytes); + if (carryover_bytes) { + /* + * If extra_bytes is not divisible, the remainder is recorded as + * debt. Caller must ensure the current slice has at least 1 + * more jiffies to repay the debt. + */ + *has_debt = true; + tg->carryover_bytes[rw] -= div64_u64(carryover_bytes, HZ); + } /* * This wait time is without taking into consideration the rounding @@ -754,6 +761,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; u64 bps_limit = tg_bps_limit(tg, rw); u32 iops_limit = tg_iops_limit(tg, rw); + bool has_debt = false; /* * Currently whole state machine of group depends on first bio @@ -784,18 +792,20 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, else throtl_extend_slice(tg, rw, jiffies + tg->td->throtl_slice); - bps_wait = tg_within_bps_limit(tg, bio, bps_limit); + bps_wait = tg_within_bps_limit(tg, bio, bps_limit, &has_debt); iops_wait = tg_within_iops_limit(tg, bio, iops_limit); if (bps_wait + iops_wait == 0) { if (wait) *wait = 0; + if (has_debt) + throtl_extend_slice(tg, rw, jiffies + 1); return true; } max_wait = max(bps_wait, iops_wait); if (wait) *wait = max_wait; - throtl_extend_slice(tg, rw, jiffies + max_wait); + throtl_extend_slice(tg, rw, jiffies + max_wait + has_debt); return false; } diff --git a/block/blk-throttle.h b/block/blk-throttle.h index 1a36d1278eea..56dcb5ce412f 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -110,10 +110,14 @@ struct throtl_grp { unsigned int last_io_disp[2]; /* - * The following two fields are updated when new configuration is - * submitted while some bios are still throttled, they record how many - * bytes/ios are waited already in previous configuration, and they will - * be used to calculate wait time under new configuration. + * The following two fields are updated when: + * 1) new configuration is submitted while some bios are still + * throttled, they record how many bytes/ios are waited already in + * previous configuration; + * 2) IOs which may cause priority inversions are dispatched while tg is + * over limit, these IOs will be dispatched directly; + * 3) While calculating wait_time for IO, extra_bytes * HZ is not + * divisible by bps_limit, the remainder will be recorded; */ long long carryover_bytes[2]; int carryover_ios[2]; -- 2.39.2