Hi Jens,

On 2023-02-16 05:08, Jens Axboe wrote:
> I think your numbers are skewed because brd isn't flagged nowait, can you
> try with this?
>
> I ran some quick testing here, using the current tree:
>
>             without patch   with patch
> io_uring    ~430K IOPS      ~3.4M IOPS
> libaio      ~895K IOPS      ~895K IOPS
>
> which is a pretty substantial difference...
>

I rebased my blk-mq changes on top of your nowait patches, but I still see a regression with blk-mq. When I tried to trace and run perf, nothing odd stood out, except for the normal blk-mq overhead.

Could you try it in your setup and see if you notice a similar trend? I ask because the numbers you shared yesterday did not show this regression.

The fio command I ran for the benchmark:

$ fio --name=<workload> --rw=<workload> --ramp_time=5s --size=1G --io_size=10G --loop=4 --cpus_allowed=1 --filename=/dev/ram0 --direct=1 --iodepth=128 --ioengine=<engine>

+-----------+-----------+--------+--------+
| io_uring  | bio(base) | blk-mq | delta  |
+-----------+-----------+--------+--------+
| read      | 577       | 446    | -22.7  |
| randread  | 504       | 416    | -17.46 |
| write     | 554       | 424    | -23.47 |
| randwrite | 484       | 381    | -21.28 |
+-----------+-----------+--------+--------+

+-----------+-----------+--------+--------+
| libaio    | bio(base) | blk-mq | delta  |
+-----------+-----------+--------+--------+
| read      | 412       | 341    | -17.23 |
| randread  | 389       | 335    | -13.88 |
| write     | 401       | 329    | -17.96 |
| randwrite | 351       | 304    | -13.39 |
+-----------+-----------+--------+--------+

My rebased blk-mq diff:

diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 34177f1bd97d..726c4b94c7b6 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -16,6 +16,7 @@ #include <linux/major.h> #include <linux/blkdev.h> #include <linux/bio.h> +#include <linux/blk-mq.h> #include <linux/highmem.h> #include <linux/mutex.h> #include <linux/pagemap.h> @@ -46,6 +47,7 @@ struct brd_device { spinlock_t brd_lock; struct radix_tree_root brd_pages; u64 brd_nr_pages; + struct 
blk_mq_tag_set tag_set; }; /* @@ -284,40 +286,48 @@ static int brd_do_bvec(struct brd_device *brd, struct page *page, return err; } -static void brd_submit_bio(struct bio *bio) +static blk_status_t brd_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { - struct brd_device *brd = bio->bi_bdev->bd_disk->private_data; - sector_t sector = bio->bi_iter.bi_sector; + struct request *rq = bd->rq; + struct brd_device *brd = hctx->queue->queuedata; + sector_t sector = blk_rq_pos(rq); struct bio_vec bvec; - struct bvec_iter iter; + struct req_iterator iter; + blk_status_t err = BLK_STS_OK; - bio_for_each_segment(bvec, bio, iter) { + blk_mq_start_request(bd->rq); + rq_for_each_segment(bvec, rq, iter) { unsigned int len = bvec.bv_len; - int err; + int ret; /* Don't support un-aligned buffer */ WARN_ON_ONCE((bvec.bv_offset & (SECTOR_SIZE - 1)) || - (len & (SECTOR_SIZE - 1))); + (len & (SECTOR_SIZE - 1))); - err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset, - bio->bi_opf, sector); - if (err) { - if (err == -ENOMEM && bio->bi_opf & REQ_NOWAIT) { - bio_wouldblock_error(bio); - return; - } - bio_io_error(bio); - return; + ret = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset, + rq->cmd_flags, sector); + if (ret) { + if (ret == -ENOMEM && rq->cmd_flags & REQ_NOWAIT) + err = BLK_STS_AGAIN; + else + err = BLK_STS_IOERR; + goto end_request; } sector += len >> SECTOR_SHIFT; } - bio_endio(bio); +end_request: + blk_mq_end_request(bd->rq, err); + return BLK_STS_OK; } +static const struct blk_mq_ops brd_mq_ops = { + .queue_rq = brd_queue_rq, +}; + static const struct block_device_operations brd_fops = { .owner = THIS_MODULE, - .submit_bio = brd_submit_bio, }; /* @@ -361,7 +371,7 @@ static int brd_alloc(int i) struct brd_device *brd; struct gendisk *disk; char buf[DISK_NAME_LEN]; - int err = -ENOMEM; + int err = 0; list_for_each_entry(brd, &brd_devices, brd_list) if (brd->brd_number == i) @@ -370,6 +380,15 @@ static int brd_alloc(int i) if (!brd) return 
-ENOMEM; brd->brd_number = i; + brd->tag_set.ops = &brd_mq_ops; + brd->tag_set.queue_depth = 128; + brd->tag_set.numa_node = NUMA_NO_NODE; + brd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING | + BLK_MQ_F_NO_SCHED_BY_DEFAULT; + brd->tag_set.cmd_size = 0; + brd->tag_set.driver_data = brd; + brd->tag_set.nr_hw_queues = 1; + list_add_tail(&brd->brd_list, &brd_devices); spin_lock_init(&brd->brd_lock); @@ -380,9 +399,17 @@ static int brd_alloc(int i) debugfs_create_u64(buf, 0444, brd_debugfs_dir, &brd->brd_nr_pages); - disk = brd->brd_disk = blk_alloc_disk(NUMA_NO_NODE); - if (!disk) + err = blk_mq_alloc_tag_set(&brd->tag_set); + if (err) { + err = -ENOMEM; goto out_free_dev; + } + + disk = brd->brd_disk = blk_mq_alloc_disk(&brd->tag_set, brd); + if (IS_ERR(disk)) { + err = PTR_ERR(disk); + goto out_free_tags; + } disk->major = RAMDISK_MAJOR; disk->first_minor = i * max_part; @@ -414,6 +441,8 @@ static int brd_alloc(int i) out_cleanup_disk: put_disk(disk); +out_free_tags: + blk_mq_free_tag_set(&brd->tag_set); out_free_dev: list_del(&brd->brd_list); kfree(brd); -- 2.39.1