Weight-based blk-throttling can use the estimated performance to calculate
cgroup bandwidth/IOPS. That said, we can never get accurate disk performance
numbers under a complex workload, so a rough estimation is enough. One issue
is that the workload might not send enough IO to make the disk 100% busy, so
the calculation must compensate for disk utilization:

  bps  = (bytes * HZ / time) * (time / busy_time) = bytes * HZ / busy_time
  iops = (ios * HZ / time) * (time / busy_time)   = ios * HZ / busy_time

Signed-off-by: Shaohua Li <shli@xxxxxx>
---
 block/blk-core.c       | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++
 block/blk-sysfs.c      | 13 ++++++++++++
 include/linux/blkdev.h |  7 +++++++
 3 files changed, 76 insertions(+)
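As an illustration of how a consumer could use the new export, here is a
minimal userspace sketch (not part of the patch) that reads the new
queue/average_perf attribute; the device name "sda" and the sysfs path are
placeholders, and the two-field "<bytes/sec> <iops>" parsing matches the
format emitted by queue_avg_perf_show() below:

  #include <stdio.h>

  int main(void)
  {
  	unsigned long long bps, iops;
  	/* "sda" is a placeholder device name; adjust the path as needed */
  	FILE *f = fopen("/sys/block/sda/queue/average_perf", "r");

  	if (!f || fscanf(f, "%llu %llu", &bps, &iops) != 2) {
  		perror("average_perf");
  		return 1;
  	}
  	fclose(f);
  	/* first field is bandwidth in bytes/sec, second is IOPS */
  	printf("bw=%llu bytes/sec iops=%llu\n", bps, iops);
  	return 0;
  }

As a worked example of the compensation: with HZ=1000, if 100 MiB are
transferred in a 500 ms window during which io_ticks advances by only 250,
the exported bandwidth is 100 MiB * 1000 / 250 = 400 MiB/s, i.e. the estimate
is scaled up to what the disk could sustain at 100% utilization rather than
the raw 200 MiB/s actually observed.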
diff --git a/block/blk-core.c b/block/blk-core.c
index ab51685..4244f28 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1919,6 +1919,59 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
 	return 0;
 }
 
+#define UPDATE_TIME (HZ / 2)
+static void blk_update_perf(struct request_queue *q,
+	struct hd_struct *p)
+{
+	unsigned long now = jiffies;
+	unsigned long last = q->bw_timestamp;
+	sector_t read_sect, write_sect, tmp_sect;
+	unsigned long read_ios, write_ios, tmp_ios;
+	unsigned long current_ticks;
+	unsigned int busy_ticks;
+
+	if (time_before(now, last + UPDATE_TIME))
+		return;
+
+	if (cmpxchg(&q->bw_timestamp, last, now) != last)
+		return;
+
+	tmp_sect = part_stat_read(p, sectors[READ]);
+	read_sect = tmp_sect - q->last_sects[READ];
+	q->last_sects[READ] = tmp_sect;
+	tmp_sect = part_stat_read(p, sectors[WRITE]);
+	write_sect = tmp_sect - q->last_sects[WRITE];
+	q->last_sects[WRITE] = tmp_sect;
+
+	tmp_ios = part_stat_read(p, ios[READ]);
+	read_ios = tmp_ios - q->last_ios[READ];
+	q->last_ios[READ] = tmp_ios;
+	tmp_ios = part_stat_read(p, ios[WRITE]);
+	write_ios = tmp_ios - q->last_ios[WRITE];
+	q->last_ios[WRITE] = tmp_ios;
+
+	current_ticks = part_stat_read(p, io_ticks);
+	busy_ticks = current_ticks - q->last_ticks;
+	q->last_ticks = current_ticks;
+
+	/* Don't account for long idle */
+	if (now - last > UPDATE_TIME * 2)
+		return;
+	/* Disk load is too low or driver doesn't account io_ticks */
+	if (busy_ticks == 0)
+		return;
+
+	if (busy_ticks > now - last)
+		busy_ticks = now - last;
+
+	tmp_sect = (read_sect + write_sect) * HZ;
+	sector_div(tmp_sect, busy_ticks);
+	q->disk_bw = tmp_sect;
+
+	tmp_ios = (read_ios + write_ios) * HZ / busy_ticks;
+	q->disk_iops = tmp_ios;
+}
+
 static noinline_for_stack bool
 generic_make_request_checks(struct bio *bio)
 {
@@ -1991,6 +2044,9 @@ generic_make_request_checks(struct bio *bio)
 	 */
 	create_io_context(GFP_ATOMIC, q->node);
 
+	blk_update_perf(q, part->partno ?
+		&part_to_disk(part)->part0 : part);
+
 	if (!blkcg_bio_issue_check(q, bio))
 		return false;
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index e140cc4..b59461c 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -348,6 +348,13 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page,
 	return ret;
 }
 
+static ssize_t queue_avg_perf_show(struct request_queue *q, char *page)
+{
+	return sprintf(page, "%llu %llu\n",
+		(unsigned long long)q->disk_bw * 512,
+		(unsigned long long)q->disk_iops);
+}
+
 static struct queue_sysfs_entry queue_requests_entry = {
 	.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_requests_show,
@@ -479,6 +486,11 @@ static struct queue_sysfs_entry queue_poll_entry = {
 	.store = queue_poll_store,
 };
 
+static struct queue_sysfs_entry queue_avg_perf_entry = {
+	.attr = {.name = "average_perf", .mode = S_IRUGO },
+	.show = queue_avg_perf_show,
+};
+
 static struct attribute *default_attrs[] = {
 	&queue_requests_entry.attr,
 	&queue_ra_entry.attr,
@@ -504,6 +516,7 @@ static struct attribute *default_attrs[] = {
 	&queue_iostats_entry.attr,
 	&queue_random_entry.attr,
 	&queue_poll_entry.attr,
+	&queue_avg_perf_entry.attr,
 	NULL,
 };
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 29189ae..d2d6a7b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -466,6 +466,13 @@ struct request_queue {
 	struct bio_set		*bio_split;
 
 	bool			mq_sysfs_init_done;
+
+	unsigned long		bw_timestamp;
+	unsigned long		last_ticks;
+	sector_t		last_sects[2];
+	unsigned long		last_ios[2];
+	sector_t		disk_bw;
+	unsigned long		disk_iops;
 };
 
 #define QUEUE_FLAG_QUEUED	1	/* uses generic tag queueing */
-- 
2.6.5
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html