We know the total bandwidth of a disk, and each cgroup's weight determines
what percentage of that bandwidth the cgroup should get, so we can calculate
each cgroup's bandwidth directly. For example, a cgroup with weight 100 under
a parent whose children's weights sum to 400 is entitled to a quarter of the
parent's share of the disk bandwidth.

Signed-off-by: Shaohua Li <shli@xxxxxx>
---
 block/blk-throttle.c | 160 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 156 insertions(+), 4 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 90f937f..fafe9c2 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2,6 +2,7 @@
  * Interface for controlling IO bandwidth on a request queue
  *
  * Copyright (C) 2010 Vivek Goyal <vgoyal@xxxxxxxxxx>
+ * Proportional throttle - Shaohua Li <shli@xxxxxxxxxx>
  */
 
 #include <linux/module.h>
@@ -12,6 +13,12 @@
 #include <linux/blk-cgroup.h>
 #include "blk.h"
 
+#define MAX_WEIGHT (10000)
+#define MIN_WEIGHT (1)
+#define DFT_WEIGHT (100)
+#define SHARE_SHIFT (14)
+#define MAX_SHARE (1 << SHARE_SHIFT)
+
 /* Max dispatch from a group in 1 round */
 static int throtl_grp_quantum = 8;
 
@@ -74,6 +81,10 @@ struct throtl_service_queue {
 	unsigned int		nr_pending;	/* # queued in the tree */
 	unsigned long		first_pending_disptime;	/* disptime of the first tg */
 	struct timer_list	pending_timer;	/* fires on first_pending_disptime */
+
+	unsigned int		weight; /* this queue's weight against siblings */
+	unsigned int		children_weight; /* children weight */
+	unsigned int		share; /* disk bandwidth share of the queue */
 };
 
 enum tg_state_flags {
@@ -139,6 +150,22 @@ struct throtl_grp {
 	unsigned long slice_end[2];
 };
 
+enum run_mode {
+	MODE_NONE = 0,
+	MODE_THROTTLE = 1, /* bandwidth/iops based throttle */
+	/* below are weight based */
+	MODE_WEIGHT_BANDWIDTH = 2,
+	MODE_WEIGHT_IOPS = 3,
+	MAX_MODE = 4,
+};
+
+static char *run_mode_name[MAX_MODE] = {
+	[MODE_NONE] = "none",
+	[MODE_THROTTLE] = "throttle",
+	[MODE_WEIGHT_BANDWIDTH] = "weight_bw",
+	[MODE_WEIGHT_IOPS] = "weight_iops",
+};
+
 struct throtl_data
 {
 	/* service tree for active throtl groups */
@@ -156,8 +183,14 @@ struct throtl_data
 
 	/* Work for dispatching throttled bios */
 	struct work_struct dispatch_work;
+	enum run_mode mode;
 };
 
+static bool td_weight_based(struct throtl_data *td)
+{
+	return td->mode > MODE_THROTTLE;
+}
+
 static void throtl_pending_timer_fn(unsigned long arg);
 
 static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
@@ -209,9 +242,33 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
 
 static inline int tg_data_index(struct throtl_grp *tg, bool rw)
 {
+	if (td_weight_based(tg->td))
+		return 0;
 	return rw;
 }
 
+static inline uint64_t queue_bandwidth(struct throtl_data *td)
+{
+	uint64_t bw;
+
+	bw = td->queue->disk_bw * 512;
+	/* can't estimate bandwidth, can't do proportional control */
+	if (bw == 0)
+		bw = -1;
+	return bw;
+}
+
+static inline uint64_t queue_iops(struct throtl_data *td)
+{
+	uint64_t iops;
+
+	iops = td->queue->disk_iops;
+	/* can't estimate iops, can't do proportional control */
+	if (iops == 0)
+		iops = -1;
+	return iops;
+}
+
 /**
  * throtl_log - log debug message via blktrace
  * @sq: the service_queue being reported
@@ -386,6 +443,8 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
 	sq->parent_sq = &td->service_queue;
 	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
 		sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
+	sq->weight = DFT_WEIGHT;
+	sq->parent_sq->children_weight += sq->weight;
 	tg->td = td;
 }
 
@@ -406,7 +465,8 @@ static void tg_update_has_rules(struct throtl_grp *tg)
 
 	for (i = READ; i <= WRITE; i++)
 		tg->has_rules[i] = (parent_tg && parent_tg->has_rules[i]) ||
-			io_cost_has_limit(tg, i);
+			io_cost_has_limit(tg, i) ||
+			(td_weight_based(tg->td));
 }
 
 static void throtl_pd_online(struct blkg_policy_data *pd)
@@ -421,6 +481,10 @@ static void throtl_pd_online(struct blkg_policy_data *pd)
 static void throtl_pd_free(struct blkg_policy_data *pd)
 {
 	struct throtl_grp *tg = pd_to_tg(pd);
+	struct throtl_service_queue *sq = &tg->service_queue;
+
+	if (sq->parent_sq)
+		sq->parent_sq->children_weight -= sq->weight;
 
 	del_timer_sync(&tg->service_queue.pending_timer);
 	kfree(tg);
@@ -812,6 +876,10 @@ static bool io_cost_with_in_limit(struct throtl_grp *tg, struct bio *bio,
 {
 	unsigned long bps_wait = 0, iops_wait = 0;
 
+	if (tg->td->mode == MODE_WEIGHT_BANDWIDTH)
+		return io_cost_with_in_bps_limit(tg, bio, wait);
+	if (tg->td->mode == MODE_WEIGHT_IOPS)
+		return io_cost_with_in_iops_limit(tg, bio, wait);
 	if (io_cost_with_in_bps_limit(tg, bio, &bps_wait) &&
 	    io_cost_with_in_iops_limit(tg, bio, &iops_wait)) {
 		*wait = 0;
@@ -967,6 +1035,77 @@ static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
 
 }
 
+static void tg_update_perf(struct throtl_grp *tg)
+{
+	struct throtl_service_queue *sq;
+	u64 new_bps, abs_bps = 0;
+	unsigned int new_iops, abs_iops = 0;
+
+	sq = &tg->service_queue;
+
+	/* '/' cgroup in cgroup2 */
+	if (!td_weight_based(tg->td) || !sq->parent_sq ||
+	    (cgroup_subsys_on_dfl(io_cgrp_subsys) && !tg_to_blkg(tg)->parent))
+		return;
+
+	if (tg->td->mode == MODE_WEIGHT_BANDWIDTH) {
+		new_bps = max_t(uint64_t,
+			(queue_bandwidth(tg->td) * sq->share) >> SHARE_SHIFT,
+			1024);
+		if (new_bps > tg->io_cost.bps[0])
+			abs_bps = new_bps - tg->io_cost.bps[0];
+		if (new_bps < tg->io_cost.bps[0])
+			abs_bps = tg->io_cost.bps[0] - new_bps;
+		if (abs_bps > (tg->io_cost.bps[0] >> 3))
+			throtl_start_new_slice(tg, 0);
+		tg->io_cost.bps[0] = new_bps;
+		tg->io_cost.iops[0] = -1;
+	} else {
+		new_iops = max_t(uint64_t,
+			(queue_iops(tg->td) * sq->share) >> SHARE_SHIFT,
+			1);
+		if (new_iops > tg->io_cost.iops[0])
+			abs_iops = new_iops - tg->io_cost.iops[0];
+		if (new_iops < tg->io_cost.iops[0])
+			abs_iops = tg->io_cost.iops[0] - new_iops;
+		if (abs_iops > (tg->io_cost.iops[0] >> 3))
+			throtl_start_new_slice(tg, 0);
+		tg->io_cost.iops[0] = new_iops;
+		tg->io_cost.bps[0] = -1;
+	}
+}
+
+/* update share of tg's siblings */
+static void tg_update_share(struct throtl_data *td, struct throtl_grp *tg)
+{
+	struct cgroup_subsys_state *pos_css;
+	struct blkcg_gq *blkg, *parent_blkg;
+	struct throtl_grp *child;
+
+	if (!td_weight_based(td))
+		return;
+	if (!tg || !tg->service_queue.parent_sq ||
+	    !tg->service_queue.parent_sq->parent_sq)
+		parent_blkg = td->queue->root_blkg;
+	else
+		parent_blkg = tg_to_blkg(sq_to_tg(tg->service_queue.parent_sq));
+
+	blkg_for_each_descendant_pre(blkg, pos_css, parent_blkg) {
+		struct throtl_service_queue *sq;
+
+		child = blkg_to_tg(blkg);
+		sq = &child->service_queue;
+
+		if (!sq->parent_sq)
+			continue;
+
+		sq->share = max_t(unsigned int,
+			sq->parent_sq->share * sq->weight /
+				sq->parent_sq->children_weight,
+			1);
+	}
+}
+
 static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
@@ -1014,11 +1153,18 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
 	unsigned int nr_reads = 0, nr_writes = 0;
-	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
-	unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
+	unsigned int max_nr_reads;
+	unsigned int max_nr_writes;
 	struct bio *bio;
 
-	/* Try to dispatch 75% READS and 25% WRITES */
+	if (td_weight_based(tg->td)) {
+		max_nr_reads = throtl_grp_quantum;
+		max_nr_writes = 0;
+	} else {
+		/* Try to dispatch 75% READS and 25% WRITES */
+		max_nr_reads = throtl_grp_quantum * 3 / 4;
+		max_nr_writes = throtl_grp_quantum - max_nr_reads;
+	}
 
 	while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
 	       tg_may_dispatch(tg, bio, NULL)) {
@@ -1039,6 +1185,9 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
 		if (nr_writes >= max_nr_writes)
 			break;
 	}
+	if (nr_reads + nr_writes) {
+		tg_update_perf(tg);
+	}
 
 	return nr_reads + nr_writes;
 }
@@ -1494,6 +1643,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 		/* throtl is FIFO - if bios are already queued, should queue */
 		if (sq->nr_queued[index])
 			break;
+		tg_update_perf(tg);
 
 		/* if above limits, break to queue */
 		if (!tg_may_dispatch(tg, bio, NULL))
@@ -1639,6 +1789,8 @@ int blk_throtl_init(struct request_queue *q)
 
 	INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
 	throtl_service_queue_init(&td->service_queue);
+	td->service_queue.share = MAX_SHARE;
+	td->mode = MODE_NONE;
 
 	q->td = td;
 	td->queue = q;
-- 
2.6.5
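
As a quick sanity check of the weight math (not part of the patch), here is a
minimal stand-alone user-space sketch that mirrors the share and bandwidth
formulas used by tg_update_share() and tg_update_perf() above. The two-group
layout, the weights 100 and 300, and the 400MB/s disk bandwidth are made-up
example values.

/*
 * Stand-alone illustration of the proportional throttle math:
 *   share = parent_share * weight / children_weight   (min 1)
 *   bps   = (disk_bw * share) >> SHARE_SHIFT          (min 1024)
 * SHARE_SHIFT and MAX_SHARE match the values added by this patch.
 */
#include <stdio.h>
#include <stdint.h>

#define SHARE_SHIFT	14
#define MAX_SHARE	(1 << SHARE_SHIFT)

/* share a child gets from its parent, as in tg_update_share() */
static unsigned int child_share(unsigned int parent_share, unsigned int weight,
				unsigned int children_weight)
{
	unsigned int share = parent_share * weight / children_weight;

	return share ? share : 1;
}

/* bandwidth limit derived from a share, as in tg_update_perf() */
static uint64_t share_to_bps(uint64_t disk_bw, unsigned int share)
{
	uint64_t bps = (disk_bw * share) >> SHARE_SHIFT;

	return bps > 1024 ? bps : 1024;
}

int main(void)
{
	uint64_t disk_bw = 400ULL << 20;	/* assume a 400MB/s disk */
	/* two siblings under the root with weights 100 and 300 */
	unsigned int children_weight = 100 + 300;
	unsigned int share_a = child_share(MAX_SHARE, 100, children_weight);
	unsigned int share_b = child_share(MAX_SHARE, 300, children_weight);

	printf("cgroup A: share %u/%u -> %llu MB/s\n", share_a, MAX_SHARE,
	       (unsigned long long)(share_to_bps(disk_bw, share_a) >> 20));
	printf("cgroup B: share %u/%u -> %llu MB/s\n", share_b, MAX_SHARE,
	       (unsigned long long)(share_to_bps(disk_bw, share_b) >> 20));
	return 0;
}

Compiled with gcc, this prints 100 MB/s for the weight-100 group and 300 MB/s
for the weight-300 group, i.e. the bandwidth limits follow the 1:3 weight
ratio.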