Patch "block: support to account io_ticks precisely" has been added to the 6.6-stable tree

Sasha Levin <sashal@xxxxxxxxxx> · Sun, 26 May 2024 15:33:43 -0400

This is a note to let you know that I've just added the patch titled

    block: support to account io_ticks precisely

to the 6.6-stable tree which can be found at:
    http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary

The filename of the patch is:
     block-support-to-account-io_ticks-precisely.patch
and it can be found in the queue-6.6 subdirectory.

If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@xxxxxxxxxxxxxxx> know about it.



commit 61644276c0ea74bd3d523afff2f74971306ef6ef
Author: Yu Kuai <yukuai3@xxxxxxxxxx>
Date:   Thu May 9 20:37:16 2024 +0800

    block: support to account io_ticks precisely
    
    [ Upstream commit 99dc422335d8b2bd4d105797241d3e715bae90e9 ]
    
    Currently, io_ticks is accounted based on sampling: specifically,
    update_io_ticks() will always account io_ticks by 1 jiffies from
    bdev_start_io_acct()/blk_account_io_start(), and the result can be
    inaccurate, for example (HZ is 250):
    
    Test script:
    fio -filename=/dev/sda -bs=4k -rw=write -direct=1 -name=test -thinktime=4ms
    
    Test result: util is about 90%, while the disk is really idle.
    
    This behaviour was introduced by commit 5b18b5a73760 ("block: delete
    part_round_stats and switch to less precise counting"). However, a key
    point was missed: that commit also improved performance a lot:
    
    Before the commit:
    part_round_stats:
      if (part->stamp != now)
       stats |= 1;
    
      part_in_flight()
      -> there can be lots of tasks here in 1 jiffies.
      part_round_stats_single()
       __part_stat_add()
      part->stamp = now;
    
    After the commit:
    update_io_ticks:
      stamp = part->bd_stamp;
      if (time_after(now, stamp))
       if (try_cmpxchg())
        __part_stat_add()
        -> only one task can reach here in 1 jiffies.
    
    Hence, in order to account io_ticks precisely, we only need to check
    whether there is IO inflight at most once in one jiffies. Note that for
    rq-based devices, iterating tags should not be used here because
    'tags->lock' is grabbed in blk_mq_find_and_get_req(); hence
    part_stat_local_inc/dec() and part_in_flight() are used to track
    inflight IO. The additional overhead is quite small:
    
     - per cpu add/dec for each IO for rq-based device;
     - per cpu sum for each jiffies;
    
    And it's verified with null-blk that there is no performance degradation
    under heavy IO pressure.
    
    Fixes: 5b18b5a73760 ("block: delete part_round_stats and switch to less precise counting")
    Signed-off-by: Yu Kuai <yukuai3@xxxxxxxxxx>
    Link: https://lore.kernel.org/r/20240509123717.3223892-2-yukuai1@xxxxxxxxxxxxxxx
    Signed-off-by: Jens Axboe <axboe@xxxxxxxxx>
    Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx>

diff --git a/block/blk-core.c b/block/blk-core.c
index a3726d8cf8738..bf058cea9016a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -950,10 +950,11 @@ void update_io_ticks(struct block_device *part, unsigned long now, bool end)
 	unsigned long stamp;
 again:
 	stamp = READ_ONCE(part->bd_stamp);
-	if (unlikely(time_after(now, stamp))) {
-		if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now)))
-			__part_stat_add(part, io_ticks, end ? now - stamp : 1);
-	}
+	if (unlikely(time_after(now, stamp)) &&
+	    likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) &&
+	    (end || part_in_flight(part)))
+		__part_stat_add(part, io_ticks, now - stamp);
+
 	if (part->bd_partno) {
 		part = bdev_whole(part);
 		goto again;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 65e75efa9bd36..07bf758c523a9 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -783,6 +783,8 @@ static void blk_account_io_merge_request(struct request *req)
 	if (blk_do_io_stat(req)) {
 		part_stat_lock();
 		part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
+		part_stat_local_dec(req->part,
+				    in_flight[op_is_write(req_op(req))]);
 		part_stat_unlock();
 	}
 }
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 257b0addd47e5..4c91889affa7c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -994,6 +994,8 @@ static inline void blk_account_io_done(struct request *req, u64 now)
 		update_io_ticks(req->part, jiffies, true);
 		part_stat_inc(req->part, ios[sgrp]);
 		part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
+		part_stat_local_dec(req->part,
+				    in_flight[op_is_write(req_op(req))]);
 		part_stat_unlock();
 	}
 }
@@ -1016,6 +1018,8 @@ static inline void blk_account_io_start(struct request *req)
 
 		part_stat_lock();
 		update_io_ticks(req->part, jiffies, false);
+		part_stat_local_inc(req->part,
+				    in_flight[op_is_write(req_op(req))]);
 		part_stat_unlock();
 	}
 }
diff --git a/block/blk.h b/block/blk.h
index 08a358bc0919e..67915b04b3c17 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -344,6 +344,7 @@ static inline bool blk_do_io_stat(struct request *rq)
 }
 
 void update_io_ticks(struct block_device *part, unsigned long now, bool end);
+unsigned int part_in_flight(struct block_device *part);
 
 static inline void req_set_nomerge(struct request_queue *q, struct request *req)
 {
diff --git a/block/genhd.c b/block/genhd.c
index 2ef1e08d70ecd..33b1ebf6ef82d 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -118,7 +118,7 @@ static void part_stat_read_all(struct block_device *part,
 	}
 }
 
-static unsigned int part_in_flight(struct block_device *part)
+unsigned int part_in_flight(struct block_device *part)
 {
 	unsigned int inflight = 0;
 	int cpu;