On Wed, Aug 03, 2022 at 02:52:11PM -0600, Jens Axboe wrote:
> I ran this on my test box to see how we'd do. First the bad news:
> smaller block size IO seems slower. I ran with QD=8 and used 24 drives,
> and using t/io_uring (with registered buffers, polled, etc) and a 512b
> block size I get:
> 
> IOPS=44.36M, BW=21.66GiB/s, IOS/call=1/1
> IOPS=44.64M, BW=21.80GiB/s, IOS/call=2/2
> IOPS=44.69M, BW=21.82GiB/s, IOS/call=1/1
> IOPS=44.55M, BW=21.75GiB/s, IOS/call=2/2
> IOPS=44.93M, BW=21.94GiB/s, IOS/call=1/1
> IOPS=44.79M, BW=21.87GiB/s, IOS/call=1/2
> 
> and adding -D1 I get:
> 
> IOPS=43.74M, BW=21.36GiB/s, IOS/call=1/1
> IOPS=44.04M, BW=21.50GiB/s, IOS/call=1/1
> IOPS=43.63M, BW=21.30GiB/s, IOS/call=2/2
> IOPS=43.67M, BW=21.32GiB/s, IOS/call=1/1
> IOPS=43.57M, BW=21.28GiB/s, IOS/call=1/2
> IOPS=43.53M, BW=21.25GiB/s, IOS/call=2/1
> 
> which does regress that workload.

Bummer, I would expect -D1 to be no worse. My test isn't nearly as
consistent as yours, so I'm having some trouble measuring. I'm only
coming up with a few micro-optimizations that might help. A diff is
below on top of this series. I also created a branch with everything
folded in here:

  git://git.kernel.org/pub/scm/linux/kernel/git/kbusch/linux.git io_uring/dma-register
  https://git.kernel.org/pub/scm/linux/kernel/git/kbusch/linux.git/log/?h=io_uring/dma-register

-- >8 --
diff --git a/block/bio.c b/block/bio.c
index 3b7accae8996..c1e97dff5e40 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1154,7 +1154,7 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
         bio_set_flag(bio, BIO_CLONED);
 }
 
-static void bio_iov_dma_tag_set(struct bio *bio, struct iov_iter *iter)
+void bio_iov_dma_tag_set(struct bio *bio, struct iov_iter *iter)
 {
         size_t size = iov_iter_count(iter);
 
@@ -1165,8 +1165,6 @@ static void bio_iov_dma_tag_set(struct bio *bio, struct iov_iter *iter)
         bio->bi_opf |= REQ_NOMERGE;
         bio_set_flag(bio, BIO_NO_PAGE_REF);
         bio_set_flag(bio, BIO_DMA_TAGGED);
-
-        iov_iter_advance(iter, bio->bi_iter.bi_size);
 }
 
 static int bio_iov_add_page(struct bio *bio, struct page *page,
@@ -1307,6 +1305,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 
         if (iov_iter_is_dma_tag(iter)) {
                 bio_iov_dma_tag_set(bio, iter);
+                iov_iter_advance(iter, bio->bi_iter.bi_size);
                 return 0;
         }
 
diff --git a/block/fops.c b/block/fops.c
index db2d1e848f4b..1b3649c7eb17 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -325,7 +325,9 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
                  * bio_iov_iter_get_pages() and set the bvec directly.
                  */
                 bio_iov_bvec_set(bio, iter);
-        } else {
+        } else if (iov_iter_is_dma_tag(iter)) {
+                bio_iov_dma_tag_set(bio, iter);
+        } else {
                 ret = bio_iov_iter_get_pages(bio, iter);
                 if (unlikely(ret)) {
                         bio_put(bio);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index dbf73ab0877e..511cae2b7ce9 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -113,7 +113,8 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
 struct nvme_dma_mapping {
         int nr_pages;
         u16 offset;
-        u8 rsvd[2];
+        bool needs_sync;
+        u8 rsvd;
         dma_addr_t prp_dma_addr;
         __le64 *prps;
 };
@@ -556,16 +557,9 @@ static void nvme_sync_dma(struct nvme_dev *dev, struct request *req,
                           struct nvme_dma_mapping *mapping)
 {
         int offset, i, j, length, nprps;
-        bool needs_sync;
 
         offset = blk_rq_dma_offset(req) + mapping->offset;
         i = offset >> NVME_CTRL_PAGE_SHIFT;
-        needs_sync = rq_data_dir(req) == READ &&
-                dma_need_sync(dev->dev, le64_to_cpu(mapping->prps[i]));
-
-        if (!needs_sync)
-                return;
-
         offset = offset & (NVME_CTRL_PAGE_SIZE - 1);
         length = blk_rq_payload_bytes(req) - (NVME_CTRL_PAGE_SIZE - offset);
         nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
@@ -643,7 +637,8 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
         struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 
         if (mapping) {
-                nvme_sync_dma(dev, req, mapping);
+                if (mapping->needs_sync)
+                        nvme_sync_dma(dev, req, mapping);
                 if (iod->npages >= 0)
                         nvme_free_prp_chain(dev, req, iod);
                 return;
@@ -894,16 +889,13 @@ static blk_status_t nvme_premapped(struct nvme_dev *dev, struct request *req,
         int i, offset, j, length, nprps, nprps_left;
         struct dma_pool *pool;
         __le64 *prp_list;
-        bool needs_sync;
         void **list;
 
         offset = blk_rq_dma_offset(req) + mapping->offset;
         i = offset >> NVME_CTRL_PAGE_SHIFT;
         offset = offset & (NVME_CTRL_PAGE_SIZE - 1);
 
-        needs_sync = rq_data_dir(req) == WRITE &&
-                dma_need_sync(dev->dev, le64_to_cpu(mapping->prps[i]));
-        if (needs_sync) {
+        if (mapping->needs_sync) {
                 dma_sync_single_for_device(dev->dev,
                                 le64_to_cpu(mapping->prps[i]),
                                 NVME_CTRL_PAGE_SIZE - offset, DMA_TO_DEVICE);
@@ -916,7 +908,7 @@ static blk_status_t nvme_premapped(struct nvme_dev *dev, struct request *req,
                 return BLK_STS_OK;
 
         if (length <= NVME_CTRL_PAGE_SIZE) {
-                if (needs_sync)
+                if (mapping->needs_sync)
                         dma_sync_single_for_device(dev->dev,
                                         le64_to_cpu(mapping->prps[i]),
                                         NVME_CTRL_PAGE_SIZE, DMA_TO_DEVICE);
@@ -983,7 +975,7 @@ static blk_status_t nvme_premapped(struct nvme_dev *dev, struct request *req,
         cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
 
 sync:
-        if (!needs_sync)
+        if (!mapping->needs_sync)
                 return BLK_STS_OK;
 
         i = offset >> NVME_CTRL_PAGE_SHIFT;
@@ -1931,6 +1923,7 @@ static void *nvme_pci_dma_map(struct request_queue *q,
         if (!mapping->prps)
                 goto free_mapping;
 
+        mapping->needs_sync = false;
         for (i = 0, k = 0; i < nr_vecs; i++) {
                 struct bio_vec *bv = bvec + i;
                 dma_addr_t dma_addr;
@@ -1959,6 +1952,9 @@ static void *nvme_pci_dma_map(struct request_queue *q,
                 if (i == 0)
                         dma_addr -= mapping->offset;
 
+                if (dma_need_sync(dev->dev, dma_addr))
+                        mapping->needs_sync = true;
+
                 for (j = 0; j < ppv; j++)
                         mapping->prps[k++] = cpu_to_le64(dma_addr +
                                                 j * NVME_CTRL_PAGE_SIZE);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 649348bc03c2..b5277ec189e0 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -474,6 +474,7 @@ void __bio_add_page(struct bio *bio, struct page *page,
                 unsigned int len, unsigned int off);
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
 void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter);
+void bio_iov_dma_tag_set(struct bio *bio, struct iov_iter *iter);
 void __bio_release_pages(struct bio *bio, bool mark_dirty);
 extern void bio_set_pages_dirty(struct bio *bio);
 extern void bio_check_pages_dirty(struct bio *bio);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index d370b45d7f1b..ebdf81473526 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1070,6 +1070,9 @@ void iov_iter_advance(struct iov_iter *i, size_t size)
                 iov_iter_iovec_advance(i, size);
         } else if (iov_iter_is_bvec(i)) {
                 iov_iter_bvec_advance(i, size);
+        } else if (iov_iter_is_dma_tag(i)) {
+                i->iov_offset += size;
+                i->count -= size;
         } else if (iov_iter_is_pipe(i)) {
                 pipe_advance(i, size);
         } else if (unlikely(iov_iter_is_xarray(i))) {
@@ -1077,9 +1080,6 @@ void iov_iter_advance(struct iov_iter *i, size_t size)
                 i->count -= size;
         } else if (iov_iter_is_discard(i)) {
                 i->count -= size;
-        } else if (iov_iter_is_dma_tag(i)) {
-                i->iov_offset += size;
-                i->count -= size;
         }
 }
 EXPORT_SYMBOL(iov_iter_advance);
@@ -1353,6 +1353,9 @@ bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
         if (iov_iter_is_bvec(i))
                 return iov_iter_aligned_bvec(i, addr_mask, len_mask);
 
+        if (iov_iter_is_dma_tag(i))
+                return !(i->iov_offset & addr_mask);
+
         if (iov_iter_is_pipe(i)) {
                 unsigned int p_mask = i->pipe->ring_size - 1;
                 size_t size = i->count;
-- 8< --
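
If grabbing the branch is easier than applying the diff by hand, something
along these lines should work (just a usage sketch with plain git, nothing
special assumed; the local branch name is arbitrary):

  git fetch git://git.kernel.org/pub/scm/linux/kernel/git/kbusch/linux.git io_uring/dma-register
  git checkout -b io_uring/dma-register FETCH_HEAD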