[RFC PATCH 1/2] bdev: add support for CPU latency PM QoS tuning

Tero Kristo <tero.kristo@xxxxxxxxxxxxxxx> · Thu, 29 Aug 2024 10:18:19 +0300

Add support for limiting CPU latency while block IO is running. When a
block IO is started, it will add a user configurable CPU latency limit
in place (if any.) The limit is removed with a configurable timeout mechanism.

Signed-off-by: Tero Kristo <tero.kristo@xxxxxxxxxxxxxxx>
---
 block/bdev.c              | 51 +++++++++++++++++++++++++++++++++++++++
 block/bio.c               |  2 ++
 block/blk.h               |  2 ++
 include/linux/blk_types.h | 12 +++++++++
 4 files changed, 67 insertions(+)

diff --git a/block/bdev.c b/block/bdev.c
index 353677ac49b3..8f20681a4ea6 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -405,10 +405,18 @@ void __init bdev_cache_init(void)
 	blockdev_superblock = blockdev_mnt->mnt_sb;   /* For writeback */
 }
 
+static void bdev_pm_qos_work(struct work_struct *work)
+{
+	struct bdev_cpu_latency_qos *qos =
+		container_of(work, struct bdev_cpu_latency_qos, work.work);
+	dev_pm_qos_remove_request(&qos->req);
+}
+
 struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
 {
 	struct block_device *bdev;
 	struct inode *inode;
+	int cpu;
 
 	inode = new_inode(blockdev_superblock);
 	if (!inode)
@@ -433,6 +441,16 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
 		return NULL;
 	}
 	bdev->bd_disk = disk;
+	bdev->bd_pm_qos = alloc_percpu(struct bdev_cpu_latency_qos);
+	if (!bdev->bd_pm_qos) {
+		free_percpu(bdev->bd_stats);
+		iput(inode);
+		return NULL;
+	}
+	for_each_possible_cpu(cpu)
+		INIT_DELAYED_WORK(per_cpu_ptr(&bdev->bd_pm_qos->work, cpu),
+				  bdev_pm_qos_work);
+	bdev->cpu_lat_limit = -1;
 	return bdev;
 }
 
@@ -462,6 +480,19 @@ void bdev_unhash(struct block_device *bdev)
 
 void bdev_drop(struct block_device *bdev)
 {
+	int cpu;
+	struct bdev_cpu_latency_qos *qos;
+
+	for_each_possible_cpu(cpu) {
+		qos = per_cpu_ptr(bdev->bd_pm_qos, cpu);
+		if (dev_pm_qos_request_active(&qos->req)) {
+			cancel_delayed_work(&qos->work);
+			dev_pm_qos_remove_request(&qos->req);
+		}
+	}
+
+	free_percpu(bdev->bd_pm_qos);
+
 	iput(BD_INODE(bdev));
 }
 
@@ -1281,6 +1312,26 @@ void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
 	blkdev_put_no_open(bdev);
 }
 
+void bdev_update_cpu_latency_pm_qos(struct block_device *bdev)
+{
+	int cpu;
+	struct bdev_cpu_latency_qos *qos;
+
+	if (!bdev || bdev->cpu_lat_limit < 0)
+		return;
+
+	cpu = raw_smp_processor_id();
+	qos = per_cpu_ptr(bdev->bd_pm_qos, cpu);
+
+	if (!dev_pm_qos_request_active(&qos->req))
+		dev_pm_qos_add_request(get_cpu_device(cpu), &qos->req,
+				       DEV_PM_QOS_RESUME_LATENCY,
+				       bdev->cpu_lat_limit);
+
+	mod_delayed_work(system_wq, &qos->work,
+			 msecs_to_jiffies(bdev->cpu_lat_timeout));
+}
+
 bool disk_live(struct gendisk *disk)
 {
 	return !inode_unhashed(BD_INODE(disk->part0));
diff --git a/block/bio.c b/block/bio.c
index e9e809a63c59..6c46d75345d7 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -282,6 +282,8 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
 	bio->bi_max_vecs = max_vecs;
 	bio->bi_io_vec = table;
 	bio->bi_pool = NULL;
+
+	bdev_update_cpu_latency_pm_qos(bio->bi_bdev);
 }
 EXPORT_SYMBOL(bio_init);
 
diff --git a/block/blk.h b/block/blk.h
index 189bc25beb50..dda2a188984b 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -516,6 +516,8 @@ void drop_partition(struct block_device *part);
 
 void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors);
 
+void bdev_update_cpu_latency_pm_qos(struct block_device *bdev);
+
 struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
 		struct lock_class_key *lkclass);
 
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 781c4500491b..0ed29603eaa9 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -11,6 +11,7 @@
 #include <linux/device.h>
 #include <linux/ktime.h>
 #include <linux/rw_hint.h>
+#include <linux/pm_qos.h>
 
 struct bio_set;
 struct bio;
@@ -38,6 +39,11 @@ struct bio_crypt_ctx;
 #define PAGE_SECTORS		(1 << PAGE_SECTORS_SHIFT)
 #define SECTOR_MASK		(PAGE_SECTORS - 1)
 
+struct bdev_cpu_latency_qos {
+	struct dev_pm_qos_request	req;
+	struct delayed_work		work;
+};
+
 struct block_device {
 	sector_t		bd_start_sect;
 	sector_t		bd_nr_sectors;
@@ -71,6 +77,12 @@ struct block_device {
 
 	struct partition_meta_info *bd_meta_info;
 	int			bd_writers;
+
+	/* For preventing deep idle during block I/O */
+	struct bdev_cpu_latency_qos __percpu	*bd_pm_qos;
+	int cpu_lat_timeout;
+	int cpu_lat_limit;
+
 	/*
 	 * keep this out-of-line as it's both big and not needed in the fast
 	 * path
-- 
2.43.1