Device mapper was converted to percpu inflight counters. In order to display the correct values in the "inflight" sysfs file and in /proc/diskstats, we need a custom callback that sums the percpu counters. The function part_round_stats calculates the number of in-flight I/Os every jiffy and uses this to calculate the counters time_in_queue and io_ticks. In order to avoid excessive memory traffic on systems with a high number of CPUs, this functionality is disabled when percpu inflight values are used, and the values time_in_queue and io_ticks are calculated differently; the result is less precise. We add the duration of an I/O to time_in_queue when the I/O finishes (the value is almost the same as previously, except that the time of I/Os that are still in flight is not counted until they finish). If an I/O starts or finishes and the "jiffies" value has changed since the last update, we add one to io_ticks. If the I/Os take less than a jiffy, the value is as exact as with the previous method. If the I/Os take more than a jiffy, the value may lag behind the value that the previous method would report. 
Signed-off-by: Mikulas Patocka <mpatocka@xxxxxxxxxx> --- block/blk-core.c | 7 ++++++- block/blk-settings.c | 6 ++++++ block/genhd.c | 12 ++++++++++++ drivers/md/dm.c | 37 +++++++++++++++++++++++++++++++++++-- include/linux/blkdev.h | 3 +++ 5 files changed, 62 insertions(+), 3 deletions(-) Index: linux-dm/block/genhd.c =================================================================== --- linux-dm.orig/block/genhd.c 2018-11-15 22:11:51.000000000 +0100 +++ linux-dm/block/genhd.c 2018-11-15 22:11:51.000000000 +0100 @@ -68,6 +68,13 @@ void part_dec_in_flight(struct request_q void part_in_flight(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]) { + if (q->get_inflight_fn) { + q->get_inflight_fn(q, inflight); + inflight[0] += inflight[1]; + inflight[1] = 0; + return; + } + if (q->mq_ops) { blk_mq_in_flight(q, part, inflight); return; @@ -85,6 +92,11 @@ void part_in_flight(struct request_queue void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]) { + if (q->get_inflight_fn) { + q->get_inflight_fn(q, inflight); + return; + } + if (q->mq_ops) { blk_mq_in_flight_rw(q, part, inflight); return; Index: linux-dm/include/linux/blkdev.h =================================================================== --- linux-dm.orig/include/linux/blkdev.h 2018-11-15 22:11:51.000000000 +0100 +++ linux-dm/include/linux/blkdev.h 2018-11-15 22:11:51.000000000 +0100 @@ -286,6 +286,7 @@ struct blk_queue_ctx; typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio); typedef bool (poll_q_fn) (struct request_queue *q, blk_qc_t); +typedef void (get_inflight_fn)(struct request_queue *, unsigned int [2]); struct bio_vec; typedef int (dma_drain_needed_fn)(struct request *); @@ -405,6 +406,7 @@ struct request_queue { make_request_fn *make_request_fn; poll_q_fn *poll_fn; dma_drain_needed_fn *dma_drain_needed; + get_inflight_fn *get_inflight_fn; const struct blk_mq_ops *mq_ops; @@ -1099,6 +1101,7 @@ extern void 
blk_queue_update_dma_alignme extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable); extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); +extern void blk_queue_get_inflight(struct request_queue *, get_inflight_fn *); /* * Number of physical segments as sent to the device. Index: linux-dm/block/blk-settings.c =================================================================== --- linux-dm.orig/block/blk-settings.c 2018-11-15 22:11:51.000000000 +0100 +++ linux-dm/block/blk-settings.c 2018-11-15 22:11:51.000000000 +0100 @@ -849,6 +849,12 @@ void blk_queue_write_cache(struct reques } EXPORT_SYMBOL_GPL(blk_queue_write_cache); +void blk_queue_get_inflight(struct request_queue *q, get_inflight_fn *fn) +{ + q->get_inflight_fn = fn; +} +EXPORT_SYMBOL_GPL(blk_queue_get_inflight); + static int __init blk_settings_init(void) { blk_max_low_pfn = max_low_pfn - 1; Index: linux-dm/drivers/md/dm.c =================================================================== --- linux-dm.orig/drivers/md/dm.c 2018-11-15 22:11:51.000000000 +0100 +++ linux-dm/drivers/md/dm.c 2018-11-15 22:18:44.000000000 +0100 @@ -657,18 +657,30 @@ int md_in_flight(struct mapped_device *m return (int)sum; } +static void test_io_ticks(int cpu, struct hd_struct *part, unsigned long now) +{ + unsigned long stamp = READ_ONCE(part->stamp); + if (unlikely(stamp != now)) { + if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) { + __part_stat_add(cpu, part, io_ticks, 1); + } + } +} + static void start_io_acct(struct dm_io *io) { struct mapped_device *md = io->md; struct bio *bio = io->orig_bio; + unsigned long now = jiffies; struct hd_struct *part; int sgrp, cpu; - io->start_time = jiffies; + io->start_time = now; part = &dm_disk(md)->part0; sgrp = op_stat_group(bio_op(bio)); cpu = part_stat_lock(); + test_io_ticks(cpu, part, now); __part_stat_add(cpu, part, ios[sgrp], 1); 
__part_stat_add(cpu, part, sectors[sgrp], bio_sectors(bio)); part_stat_unlock(); @@ -685,7 +697,8 @@ static void end_io_acct(struct dm_io *io { struct mapped_device *md = io->md; struct bio *bio = io->orig_bio; - unsigned long duration = jiffies - io->start_time; + unsigned long now = jiffies; + unsigned long duration = now - io->start_time; struct hd_struct *part; int sgrp, cpu; @@ -697,7 +710,9 @@ static void end_io_acct(struct dm_io *io part = &dm_disk(md)->part0; sgrp = op_stat_group(bio_op(bio)); cpu = part_stat_lock(); + test_io_ticks(cpu, part, now); __part_stat_add(cpu, part, nsecs[sgrp], jiffies_to_nsecs(duration)); + __part_stat_add(cpu, part, time_in_queue, duration); part_stat_unlock(); smp_wmb(); @@ -711,6 +726,23 @@ static void end_io_acct(struct dm_io *io } } +static void dm_get_inflight(struct request_queue *q, unsigned int inflight[2]) +{ + struct mapped_device *md = q->queuedata; + int cpu; + + inflight[READ] = inflight[WRITE] = 0; + for_each_possible_cpu(cpu) { + struct dm_percpu *p = per_cpu_ptr(md->counters, cpu); + inflight[READ] += p->inflight[READ]; + inflight[WRITE] += p->inflight[WRITE]; + } + if ((int)inflight[READ] < 0) + inflight[READ] = 0; + if ((int)inflight[WRITE] < 0) + inflight[WRITE] = 0; +} + /* * Add the bio to the list of deferred io. 
*/ @@ -2224,6 +2256,7 @@ int dm_setup_md_queue(struct mapped_devi case DM_TYPE_NVME_BIO_BASED: dm_init_normal_md_queue(md); blk_queue_make_request(md->queue, dm_make_request); + blk_queue_get_inflight(md->queue, dm_get_inflight); break; case DM_TYPE_NONE: WARN_ON_ONCE(true); Index: linux-dm/block/blk-core.c =================================================================== --- linux-dm.orig/block/blk-core.c 2018-11-15 22:11:51.000000000 +0100 +++ linux-dm/block/blk-core.c 2018-11-15 22:11:51.000000000 +0100 @@ -695,10 +695,15 @@ static void part_round_stats_single(stru void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part) { struct hd_struct *part2 = NULL; - unsigned long now = jiffies; + unsigned long now; unsigned int inflight[2]; int stats = 0; + if (q->get_inflight_fn) + return; + + now = jiffies; + if (part->stamp != now) stats |= 1;