Problem Desc
============
The "iostat" user-space utility was showing %util as 100% for disks
whose IO latencies are below a millisecond, i.e. latencies in the
range of microseconds and below.

Root Cause
==========
IO accounting in the block layer currently updates io_ticks in
jiffies, which has millisecond granularity. As a result, for devices
with IO latencies below a millisecond, the latency is accounted as
1 millisecond even though it is in the microsecond range. This caused
the iostat command to show %util as 100%, which is incorrect.

Recreating the issue
====================
Setup
-----
Devices: 24 NVMe devices
Model number: 4610 (Intel)

fio
---
[global]
bs=4K
iodepth=1
direct=1
ioengine=libaio
group_reporting
time_based
runtime=100
thinktime=1ms
numjobs=1
name=raw-write
rw=randrw
ignore_error=EIO:EIO
[job1]
filename=/dev/nvme0n1

iostat o/p
----------
Device   %wrqm  r_await  w_await  aqu-sz  rareq-sz  wareq-sz  svctm   %util
nvme3n1   0.00     0.05     0.00   75.38      0.50      0.00   0.00  100.00

Device   %wrqm  r_await  w_await  aqu-sz  rareq-sz  wareq-sz  svctm   %util
nvme3n1   0.00     0.05     0.00   74.74      0.50      0.00   0.00  100.00

Solution
========
Use ktime_get_ns() to update the disk_stats io_ticks so that io_ticks
are updated with nanosecond resolution at every IO start and end.

Issues using ktime
==================
Using ktime_get_ns() has a performance overhead, since ktime reads the
clock every time it is called. Jens Axboe ran t/io_uring in the
following test environment and observed a large performance drop:

Devices
-------
SSDs: P5800X
No of devices: 24

io_uring config
---------------
Polled IO
iostats: Enabled
Reads: Random
Block Size: 512
QDepth: 128
Batch Submit: 32
Batch Complete: 32
No of Threads: 24

With the above environment and the ktime patch, performance dropped by
~25% relative to iostats disabled, and by ~19% relative to the current
iostats-enabled mode. This performance drop is too high.
Suggestion from Jens Axboe
==========================
Jens Axboe suggested splitting the bigger patch into two:
1. In the first patch, change all the types from unsigned long to u64;
   that can be done while retaining jiffies.
2. In the second patch, add an iostats == 2 setting, which enables the
   higher resolution mode using ktime. We'd still default to 1, the
   lower granularity iostats-enabled mode.

Fix details
===========
1. Use ktime_get_ns() to get the current nanoseconds when updating
   io_ticks for start and end time stamps in block layer io accounting.
2. Create a new setting '2' in sysfs for the iostats variable, i.e.
   /sys/block/<device-name>/queue/iostats, to enable io accounting
   with nanosecond (ktime) granularity. This setting should be enabled
   only when high resolution iostats are needed, as it carries a high
   performance cost.
3. The previously available settings were 0 and 1, to disable and
   enable io accounting with millisecond (jiffies) granularity.

Testing
=======
Ran the t/io_uring command with the following setup:

Devices
-------
SSDs: P4610
No of devices: 8

io_uring config
---------------
Polled IO
iostats: Enabled
Reads: Random
Block Size: 512
QDepth: 128
Batch Submit: 32
Batch Complete: 32
No of Threads: 24

io_uring o/p
------------
iostat=0, with patch: Maximum IOPS=10.09M
iostat=1, with patch: Maximum IOPS=9.84M
iostat=2, with patch: Maximum IOPS=9.48M

Changes from V2 to V3
=====================
1. Changed all the required variables' data-type to u64 as a first patch
2. Created a new setting '2' for iostats in sysfs in this patch
3. Changed the code to get the ktime values when iostat=2, in this patch

Signed-off-by: Gulam Mohamed <gulam.mohamed@xxxxxxxxxx>
---
 block/blk-core.c                  | 26 +++++++++++++++++----
 block/blk-mq.c                    |  4 ++--
 block/blk-sysfs.c                 | 39 ++++++++++++++++++++++++++++++-
 block/genhd.c                     | 18 ++++++++++----
 drivers/block/drbd/drbd_debugfs.c | 12 ++++++++--
 drivers/block/zram/zram_drv.c     |  3 ++-
 drivers/md/dm.c                   | 13 +++++++++--
 include/linux/blkdev.h            |  4 ++++
 8 files changed, 103 insertions(+), 16 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 5670032fe932..0b5e4eb909a5 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -927,6 +927,18 @@ int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
 }
 EXPORT_SYMBOL_GPL(iocb_bio_iopoll);
 
+/*
+ * Get the time based upon the available granularity for io accounting.
+ * If the resolution mode is set to precise (2), i.e.
+ * /sys/block/<device>/queue/iostats = 2, then this will return time
+ * in nano-seconds, else it will return time in jiffies.
+ */
+u64 blk_get_iostat_ticks(struct request_queue *q)
+{
+	return (blk_queue_precise_io_stat(q) ?
+		ktime_get_ns() : jiffies);
+}
+EXPORT_SYMBOL_GPL(blk_get_iostat_ticks);
 void update_io_ticks(struct block_device *part, u64 now, bool end)
 {
 	u64 stamp;
@@ -968,20 +980,26 @@ EXPORT_SYMBOL(bdev_start_io_acct);
 u64 bio_start_io_acct(struct bio *bio)
 {
 	return bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio),
-				  bio_op(bio), jiffies);
+				  bio_op(bio),
+				  blk_get_iostat_ticks(bio->bi_bdev->bd_queue));
 }
 EXPORT_SYMBOL_GPL(bio_start_io_acct);
 
 void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
 		      u64 start_time)
 {
+	u64 now;
+	u64 duration;
+	struct request_queue *q = bdev_get_queue(bdev);
 	const int sgrp = op_stat_group(op);
-	u64 now = READ_ONCE(jiffies);
-	u64 duration = (unsigned long)now -(unsigned long) start_time;
 
+	now = blk_get_iostat_ticks(q);
+	duration = now - start_time;
 	part_stat_lock();
 	update_io_ticks(bdev, now, true);
-	part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration));
+	part_stat_add(bdev, nsecs[sgrp],
+		      (blk_queue_precise_io_stat(q) ? duration :
+		       jiffies_to_nsecs(duration)));
 	part_stat_local_dec(bdev, in_flight[op_is_write(op)]);
 	part_stat_unlock();
 }
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4e6b3ccd4989..5318bf87f17e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -975,7 +975,7 @@ static void __blk_account_io_done(struct request *req, u64 now)
 		const int sgrp = op_stat_group(req_op(req));
 
 		part_stat_lock();
-		update_io_ticks(req->part, jiffies, true);
+		update_io_ticks(req->part, blk_get_iostat_ticks(req->q), true);
 		part_stat_inc(req->part, ios[sgrp]);
 		part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
 		part_stat_unlock();
@@ -1007,7 +1007,7 @@ static void __blk_account_io_start(struct request *rq)
 			rq->part = rq->q->disk->part0;
 
 		part_stat_lock();
-		update_io_ticks(rq->part, jiffies, false);
+		update_io_ticks(rq->part, blk_get_iostat_ticks(rq->q), false);
 		part_stat_unlock();
 	}
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 93d9e9c9a6ea..e7959782ce59 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -307,7 +307,6 @@ queue_##name##_store(struct request_queue *q, const char *page, size_t count) \
 
 QUEUE_SYSFS_BIT_FNS(nonrot, NONROT, 1);
 QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0);
-QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0);
 QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0);
 #undef QUEUE_SYSFS_BIT_FNS
 
@@ -363,6 +362,44 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
 	return ret;
 }
 
+static ssize_t queue_iostats_show(struct request_queue *q, char *page)
+{
+	bool iostat = blk_queue_io_stat(q);
+	bool precise_iostat = blk_queue_precise_io_stat(q);
+
+	return queue_var_show(iostat << precise_iostat, page);
+}
+
+static ssize_t queue_iostats_store(struct request_queue *q, const char *page,
+				   size_t count)
+{
+	unsigned long val;
+	ssize_t ret = -EINVAL;
+
+	ret = queue_var_store(&val, page, count);
+
+	if (ret < 0)
+		return ret;
+
+	if (val == 2) {
+		blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q);
+		blk_queue_flag_set(QUEUE_FLAG_PRECISE_IO_STAT, q);
+		q->disk->part0->bd_stamp = 0;
+	}
+	else if (val == 1) {
+		blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q);
+		blk_queue_flag_clear(QUEUE_FLAG_PRECISE_IO_STAT, q);
+		q->disk->part0->bd_stamp = 0;
+	}
+	else if (val == 0) {
+		blk_queue_flag_clear(QUEUE_FLAG_IO_STAT, q);
+		blk_queue_flag_clear(QUEUE_FLAG_PRECISE_IO_STAT, q);
+		q->disk->part0->bd_stamp = 0;
+	}
+
+	return ret;
+}
+
 static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page)
 {
 	bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags);
diff --git a/block/genhd.c b/block/genhd.c
index 03a96d6473e1..d034219a4683 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -951,6 +951,7 @@ ssize_t part_stat_show(struct device *dev,
 	struct request_queue *q = bdev_get_queue(bdev);
 	struct disk_stats stat;
 	unsigned int inflight;
+	u64 stat_ioticks;
 
 	if (queue_is_mq(q))
 		inflight = blk_mq_in_flight(q, bdev);
@@ -959,10 +960,13 @@ ssize_t part_stat_show(struct device *dev,
 
 	if (inflight) {
 		part_stat_lock();
-		update_io_ticks(bdev, jiffies, true);
+		update_io_ticks(bdev, blk_get_iostat_ticks(q), true);
 		part_stat_unlock();
 	}
 	part_stat_read_all(bdev, &stat);
+	stat_ioticks = (blk_queue_precise_io_stat(q) ?
+			div_u64(stat.io_ticks, NSEC_PER_MSEC) :
+			jiffies_to_msecs(stat.io_ticks));
 	return sprintf(buf,
 		"%8lu %8lu %8llu %8u "
 		"%8lu %8lu %8llu %8u "
@@ -979,7 +983,7 @@ ssize_t part_stat_show(struct device *dev,
 		(unsigned long long)stat.sectors[STAT_WRITE],
 		(unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC),
 		inflight,
-		jiffies_to_msecs(stat.io_ticks),
+		(unsigned int)stat_ioticks,
 		(unsigned int)div_u64(stat.nsecs[STAT_READ] +
				stat.nsecs[STAT_WRITE] +
				stat.nsecs[STAT_DISCARD] +
@@ -1217,6 +1221,8 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 	unsigned int inflight;
 	struct disk_stats stat;
 	unsigned long idx;
+	struct request_queue *q;
+	u64 stat_ioticks;
 
 	/*
 	if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
@@ -1235,12 +1241,16 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 		else
 			inflight = part_in_flight(hd);
 
+		q = bdev_get_queue(hd);
 		if (inflight) {
 			part_stat_lock();
-			update_io_ticks(hd, jiffies, true);
+			update_io_ticks(hd, blk_get_iostat_ticks(q), true);
 			part_stat_unlock();
 		}
 		part_stat_read_all(hd, &stat);
+		stat_ioticks = (blk_queue_precise_io_stat(q) ?
+				div_u64(stat.io_ticks, NSEC_PER_MSEC) :
+				jiffies_to_msecs(stat.io_ticks));
 		seq_printf(seqf, "%4d %7d %pg "
 			   "%lu %lu %lu %u "
 			   "%lu %lu %lu %u "
@@ -1260,7 +1270,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 			   (unsigned int)div_u64(stat.nsecs[STAT_WRITE],
						 NSEC_PER_MSEC),
 			   inflight,
-			   jiffies_to_msecs(stat.io_ticks),
+			   (unsigned int)stat_ioticks,
 			   (unsigned int)div_u64(stat.nsecs[STAT_READ] +
						 stat.nsecs[STAT_WRITE] +
						 stat.nsecs[STAT_DISCARD] +
diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index a72c096aa5b1..af193bcc4f3a 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -97,6 +97,12 @@ static void seq_print_one_request(struct seq_file *m, struct drbd_request *req,
 {
 	/* change anything here, fixup header below! */
 	unsigned int s = req->rq_state;
+	unsigned long start_time;
+	struct request_queue *q = req->device->rq_queue;
+
+	start_time = (blk_queue_precise_io_stat(q) ?
+		      nsecs_to_jiffies(req->start_jif) :
+		      req->start_jif);
 
 #define RQ_HDR_1 "epoch\tsector\tsize\trw"
 	seq_printf(m, "0x%x\t%llu\t%u\t%s",
@@ -105,7 +111,7 @@ static void seq_print_one_request(struct seq_file *m, struct drbd_request *req,
 		   (s & RQ_WRITE) ? "W" : "R");
 
 #define RQ_HDR_2 "\tstart\tin AL\tsubmit"
-	seq_printf(m, "\t%d", jiffies_to_msecs(now - req->start_jif));
+	seq_printf(m, "\t%d", jiffies_to_msecs(now - start_time));
 	seq_print_age_or_dash(m, s & RQ_IN_ACT_LOG, now - req->in_actlog_jif);
 	seq_print_age_or_dash(m, s & RQ_LOCAL_PENDING, now - req->pre_submit_jif);
 
@@ -171,7 +177,9 @@ static void seq_print_waiting_for_AL(struct seq_file *m, struct drbd_resource *r
 		/* if the oldest request does not wait for the activity log
 		 * it is not interesting for us here */
 		if (req && !(req->rq_state & RQ_IN_ACT_LOG))
-			jif = req->start_jif;
+			jif = (blk_queue_precise_io_stat(device->rq_queue) ?
+			       nsecs_to_jiffies(req->start_jif) :
+			       req->start_jif);
 		else
 			req = NULL;
 		spin_unlock_irq(&device->resource->req_lock);
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index da28eb83e6ed..8e22487de7de 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1663,6 +1663,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
 	struct zram *zram;
 	struct bio_vec bv;
 	u64 start_time;
+	struct request_queue *q = bdev_get_queue(bdev);
 
 	if (PageTransHuge(page))
 		return -ENOTSUPP;
@@ -1682,7 +1683,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
 	bv.bv_offset = 0;
 
 	start_time = bdev_start_io_acct(bdev->bd_disk->part0,
-			SECTORS_PER_PAGE, op, jiffies);
+			SECTORS_PER_PAGE, op, blk_get_iostat_ticks(q));
 	ret = zram_bvec_rw(zram, &bv, index, offset, op, NULL);
 	bdev_end_io_acct(bdev->bd_disk->part0, op, start_time);
 out:
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 011a85ea40da..1bb58a0b8cd1 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -482,7 +482,11 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 
 u64 dm_start_time_ns_from_clone(struct bio *bio)
 {
-	return jiffies_to_nsecs(clone_to_tio(bio)->io->start_time);
+	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+	u64 start_time_ns = (blk_queue_precise_io_stat(q) ?
+			     clone_to_tio(bio)->io->start_time :
+			     jiffies_to_nsecs(clone_to_tio(bio)->io->start_time));
+	return start_time_ns;
 }
 EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
 
@@ -498,6 +502,11 @@ static void dm_io_acct(struct dm_io *io, bool end)
 	struct mapped_device *md = io->md;
 	struct bio *bio = io->orig_bio;
 	unsigned int sectors;
+	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+
+	u64 start_time = (blk_queue_precise_io_stat(q) ?
+			  nsecs_to_jiffies(io->start_time) :
+			  io->start_time);
 
 	/*
 	 * If REQ_PREFLUSH set, don't account payload, it will be
@@ -589,7 +598,7 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
 	io->orig_bio = bio;
 	io->md = md;
 	spin_lock_init(&io->lock);
-	io->start_time = jiffies;
+	io->start_time = blk_get_iostat_ticks(bio->bi_bdev->bd_queue);
 	io->flags = 0;
 
 	if (static_branch_unlikely(&stats_enabled))
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ca94d690d292..0543a536c6e5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -570,6 +570,7 @@ struct request_queue {
 #define QUEUE_FLAG_NOWAIT	29	/* device supports NOWAIT */
 #define QUEUE_FLAG_SQ_SCHED	30	/* single queue style io dispatch */
 #define QUEUE_FLAG_SKIP_TAGSET_QUIESCE	31 /* quiesce_tagset skip the queue*/
+#define QUEUE_FLAG_PRECISE_IO_STAT	32 /* To enable precise io accounting */
 
 #define QUEUE_FLAG_MQ_DEFAULT	((1UL << QUEUE_FLAG_IO_STAT) |		\
				 (1UL << QUEUE_FLAG_SAME_COMP) |	\
@@ -578,6 +579,7 @@ struct request_queue {
 void blk_queue_flag_set(unsigned int flag, struct request_queue *q);
 void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
 bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
+u64 blk_get_iostat_ticks(struct request_queue *q);
 
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 #define blk_queue_dying(q)	test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)
@@ -589,6 +591,8 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
 #define blk_queue_stable_writes(q) \
	test_bit(QUEUE_FLAG_STABLE_WRITES, &(q)->queue_flags)
 #define blk_queue_io_stat(q)	test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
+#define blk_queue_precise_io_stat(q) \
+	test_bit(QUEUE_FLAG_PRECISE_IO_STAT, &(q)->queue_flags)
 #define blk_queue_add_random(q)	test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
 #define blk_queue_zone_resetall(q)	\
	test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags)
-- 
2.31.1
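
For reviewers trying the patch, a usage sketch of the proposed sysfs
knob follows (the device name is an example, and the '2' setting only
exists with this series applied):

```shell
# read the current accounting mode (0=disabled, 1=jiffies, 2=ktime)
cat /sys/block/nvme0n1/queue/iostats

# enable high-resolution (ktime) io accounting; note the performance cost
echo 2 > /sys/block/nvme0n1/queue/iostats

# revert to the default millisecond-granularity (jiffies) accounting
echo 1 > /sys/block/nvme0n1/queue/iostats
```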