Re: [PATCH 0/1] improve brd performance with blk-mq

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Jens,

On 2023-02-16 05:08, Jens Axboe wrote:

> I think your numbers are skewed because brd isn't flagging nowait, can you
> try with this?
> 
> I ran some quick testing here, using the current tree:
> 
> 		without patch		with patch
> io_uring	~430K IOPS		~3.4M IOPS
> libaio		~895K IOPS		~895K IOPS
> 
> which is a pretty substantial difference...
> 

I rebased my blk-mq changes on top of your nowait patches, but I still see a
regression with blk-mq. When I traced it and ran perf, nothing odd stood out
apart from the normal blk-mq overhead.

Could you try it in your setup and see whether you notice a similar trend?
I ask because the numbers you shared yesterday did not show this regression.

The fio command I ran for the benchmark:

$ fio --name=<workload>  --rw=<workload>  --ramp_time=5s --size=1G
--io_size=10G --loop=4 --cpus_allowed=1 --filename=/dev/ram0 --direct=1
--iodepth=128 --ioengine=<engine>

+-----------+-----------+--------+--------+
| io_uring  | bio(base) | blk-mq | delta  |
+-----------+-----------+--------+--------+
|   read    |    577    |  446   | -22.7  |
| randread  |    504    |  416   | -17.46 |
|   write   |    554    |  424   | -23.47 |
| randwrite |    484    |  381   | -21.28 |
+-----------+-----------+--------+--------+

+-----------+-----------+--------+--------+
|  libaio   | bio(base) | blk-mq | delta  |
+-----------+-----------+--------+--------+
|   read    |    412    |  341   | -17.23 |
| randread  |    389    |  335   | -13.88 |
|   write   |    401    |  329   | -17.96 |
| randwrite |    351    |  304   | -13.39 |
+-----------+-----------+--------+--------+

My rebased blk-mq diff:

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 34177f1bd97d..726c4b94c7b6 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -16,6 +16,7 @@
 #include <linux/major.h>
 #include <linux/blkdev.h>
 #include <linux/bio.h>
+#include <linux/blk-mq.h>
 #include <linux/highmem.h>
 #include <linux/mutex.h>
 #include <linux/pagemap.h>
@@ -46,6 +47,7 @@ struct brd_device {
 	spinlock_t		brd_lock;
 	struct radix_tree_root	brd_pages;
 	u64			brd_nr_pages;
+	struct blk_mq_tag_set tag_set;
 };

 /*
@@ -284,40 +286,48 @@ static int brd_do_bvec(struct brd_device *brd, struct page *page,
 	return err;
 }

-static void brd_submit_bio(struct bio *bio)
+static blk_status_t brd_queue_rq(struct blk_mq_hw_ctx *hctx,
+				 const struct blk_mq_queue_data *bd)
 {
-	struct brd_device *brd = bio->bi_bdev->bd_disk->private_data;
-	sector_t sector = bio->bi_iter.bi_sector;
+	struct request *rq = bd->rq;
+	struct brd_device *brd = hctx->queue->queuedata;
+	sector_t sector = blk_rq_pos(rq);
 	struct bio_vec bvec;
-	struct bvec_iter iter;
+	struct req_iterator iter;
+	blk_status_t err = BLK_STS_OK;

-	bio_for_each_segment(bvec, bio, iter) {
+	blk_mq_start_request(bd->rq);
+	rq_for_each_segment(bvec, rq, iter) {
 		unsigned int len = bvec.bv_len;
-		int err;
+		int ret;

 		/* Don't support un-aligned buffer */
 		WARN_ON_ONCE((bvec.bv_offset & (SECTOR_SIZE - 1)) ||
-				(len & (SECTOR_SIZE - 1)));
+			     (len & (SECTOR_SIZE - 1)));

-		err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
-				  bio->bi_opf, sector);
-		if (err) {
-			if (err == -ENOMEM && bio->bi_opf & REQ_NOWAIT) {
-				bio_wouldblock_error(bio);
-				return;
-			}
-			bio_io_error(bio);
-			return;
+		ret = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
+				  rq->cmd_flags, sector);
+		if (ret) {
+			if (ret == -ENOMEM && rq->cmd_flags & REQ_NOWAIT)
+				err = BLK_STS_AGAIN;
+			else
+				err = BLK_STS_IOERR;
+			goto end_request;
 		}
 		sector += len >> SECTOR_SHIFT;
 	}

-	bio_endio(bio);
+end_request:
+	blk_mq_end_request(bd->rq, err);
+	return BLK_STS_OK;
 }

+static const struct blk_mq_ops brd_mq_ops = {
+	.queue_rq = brd_queue_rq,
+};
+
 static const struct block_device_operations brd_fops = {
 	.owner =		THIS_MODULE,
-	.submit_bio =		brd_submit_bio,
 };

 /*
@@ -361,7 +371,7 @@ static int brd_alloc(int i)
 	struct brd_device *brd;
 	struct gendisk *disk;
 	char buf[DISK_NAME_LEN];
-	int err = -ENOMEM;
+	int err = 0;

 	list_for_each_entry(brd, &brd_devices, brd_list)
 		if (brd->brd_number == i)
@@ -370,6 +380,15 @@ static int brd_alloc(int i)
 	if (!brd)
 		return -ENOMEM;
 	brd->brd_number		= i;
+	brd->tag_set.ops = &brd_mq_ops;
+	brd->tag_set.queue_depth = 128;
+	brd->tag_set.numa_node = NUMA_NO_NODE;
+	brd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING |
+			     BLK_MQ_F_NO_SCHED_BY_DEFAULT;
+	brd->tag_set.cmd_size = 0;
+	brd->tag_set.driver_data = brd;
+	brd->tag_set.nr_hw_queues = 1;
+
 	list_add_tail(&brd->brd_list, &brd_devices);

 	spin_lock_init(&brd->brd_lock);
@@ -380,9 +399,17 @@ static int brd_alloc(int i)
 		debugfs_create_u64(buf, 0444, brd_debugfs_dir,
 				&brd->brd_nr_pages);

-	disk = brd->brd_disk = blk_alloc_disk(NUMA_NO_NODE);
-	if (!disk)
+	err = blk_mq_alloc_tag_set(&brd->tag_set);
+	if (err) {
+		err = -ENOMEM;
 		goto out_free_dev;
+	}
+
+	disk = brd->brd_disk = blk_mq_alloc_disk(&brd->tag_set, brd);
+	if (IS_ERR(disk)) {
+		err = PTR_ERR(disk);
+		goto out_free_tags;
+	}

 	disk->major		= RAMDISK_MAJOR;
 	disk->first_minor	= i * max_part;
@@ -414,6 +441,8 @@ static int brd_alloc(int i)

 out_cleanup_disk:
 	put_disk(disk);
+out_free_tags:
+	blk_mq_free_tag_set(&brd->tag_set);
 out_free_dev:
 	list_del(&brd->brd_list);
 	kfree(brd);
-- 
2.39.1



[Index of Archives]     [Linux RAID]     [Linux SCSI]     [Linux ATA RAID]     [IDE]     [Linux Wireless]     [Linux Kernel]     [ATH6KL]     [Linux Bluetooth]     [Linux Netdev]     [Kernel Newbies]     [Security]     [Git]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Device Mapper]

  Powered by Linux