[PATCH V2 2/2] block: try to make aligned bio in case of big chunk IO

In case of big-chunk sequential IO, the bio size is often not aligned
with the queue's max IO size because of multipage bvecs, and bio split
can then produce unaligned and small bios, which hurts sequential IO
performance. So try to align the bio with the max IO size to avoid
this issue.
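
For illustration only (this snippet is not part of the patch): a minimal
userspace sketch of the split arithmetic, assuming a hypothetical 512KB
queue max IO size and a bio of 1280KB built from a fragmented buffer; the
small trailing bio left over after the split is what drags sequential
throughput down.

	/* Illustration only: how an unaligned bio gets split (made-up numbers) */
	#include <stdio.h>

	int main(void)
	{
		unsigned int max_io = 512 << 10;	/* assumed queue max IO size: 512KB */
		unsigned int bio_size = 1280 << 10;	/* bio built from a fragmented 1280KB buffer */

		/* the block layer splits the bio into max_io sized pieces ... */
		while (bio_size > max_io) {
			printf("submit %u KB (aligned)\n", max_io >> 10);
			bio_size -= max_io;
		}
		/* ... plus one small, unaligned remainder that hurts sequential IO */
		printf("submit %u KB (unaligned remainder)\n", bio_size >> 10);
		return 0;
	}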

Provide a 'max_size' hint to iov_iter_extract_pages() when the bio is
close to full, and try to keep the bio aligned with the max IO size,
so that bio & iov_iter reverts are minimized. In my 1GB IO test over a
VM with 2GB of RAM, when memory becomes highly fragmented, the revert
ratio (reverted bytes / buffer size) can be kept as small as 0.5% with
this algorithm.
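
A minimal sketch (not part of the patch) of the hint arithmetic described
above, using made-up numbers and assuming the max IO size is a power of
two as the patch does; it only shows how the "bytes needed to reach the
next max-IO-size boundary" value is derived.

	/* Illustration only: deriving the max_size hint (made-up numbers) */
	#include <stdio.h>

	int main(void)
	{
		unsigned int max_bytes = 512 << 10;	/* assumed queue max IO size (power of two) */
		unsigned int size = 700 << 10;		/* bytes already added to the bio */

		/* distance past the last max_bytes boundary */
		unsigned int remainder = size & (max_bytes - 1);
		/* asking iov_iter_extract_pages() for at most this many bytes keeps
		 * the bio a multiple of max_bytes, so little or nothing is reverted
		 */
		unsigned int hint = max_bytes - remainder;

		printf("remainder %u KB, hint %u KB\n", remainder >> 10, hint >> 10);
		return 0;
	}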

Ed Tsai reported that this change improves 64MB read/write by 15%~25%
in the Antutu V10 Storage Test.

Reported-by: Ed Tsai <ed.tsai@xxxxxxxxxxxx>
Closes: https://lore.kernel.org/linux-block/20231025092255.27930-1-ed.tsai@xxxxxxxxxxxx/
Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx>
---
 block/bio.c            | 116 +++++++++++++++++++++++++++++++++++++++--
 include/linux/blkdev.h |   5 ++
 2 files changed, 118 insertions(+), 3 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 09a5e71a0372..e360ac052764 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1210,6 +1210,57 @@ static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page,
 	return 0;
 }
 
+/*
+ * Figure out the max_size hint for iov_iter_extract_pages(), minimizing
+ * bio & iov_iter reverts so that the bio can be aligned with the max IO size.
+ */
+static unsigned int bio_get_buffer_size_hint(const struct bio *bio,
+		unsigned int left)
+{
+	unsigned int nr_bvecs = bio->bi_max_vecs - bio->bi_vcnt;
+	unsigned int size, predicted_space, max_bytes;
+	unsigned int space = nr_bvecs << PAGE_SHIFT;
+	unsigned int align_deviation;
+
+	size = bio->bi_iter.bi_size;
+
+	/* If we really have enough space, just try to get all pages */
+	if (!bio->bi_bdev || nr_bvecs >= (bio->bi_max_vecs / 4) ||
+			!bio->bi_vcnt || left <= space)
+		return UINT_MAX - size;
+
+	max_bytes = bdev_max_io_bytes(bio->bi_bdev);
+
+	/*
+	 * With multipage bvecs, one bvec can hold physically contiguous
+	 * page frames and so may cover quite a few bytes, so predict the
+	 * available space by averaging the bytes held by the current bvecs
+	 */
+	predicted_space = size * nr_bvecs / bio->bi_vcnt;
+	/*
+	 * If the predicted space is bigger than the max IO bytes and more
+	 * than two vectors are left, ask for all pages
+	 */
+	if (predicted_space > max_bytes && nr_bvecs > 2)
+		return UINT_MAX - size;
+
+	/*
+	 * This bio is close to full: stop adding pages if it is already
+	 * basically aligned, otherwise only get & add pages if the bio
+	 * can be kept aligned, so that bio & iov_iter reverts are
+	 * minimized
+	 */
+	align_deviation = max_t(unsigned int, 16U * 1024, max_bytes / 16);
+	if ((size & (max_bytes - 1)) > align_deviation) {
+		unsigned aligned_bytes = max_bytes - (size & (max_bytes - 1));
+
+		/* try to keep bio aligned if we have enough data and space */
+		if (aligned_bytes <= left && aligned_bytes <= predicted_space)
+			return aligned_bytes;
+	}
+
+	return 0;
+}
+
 #define PAGE_PTRS_PER_BVEC     (sizeof(struct bio_vec) / sizeof(struct page *))
 
 /**
@@ -1229,7 +1280,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
 	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
 	struct page **pages = (struct page **)bv;
-	ssize_t size, left;
+	ssize_t size, left, max_size;
 	unsigned len, i = 0;
 	size_t offset;
 	int ret = 0;
@@ -1245,6 +1296,10 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue))
 		extraction_flags |= ITER_ALLOW_P2PDMA;
 
+	max_size = bio_get_buffer_size_hint(bio, iov_iter_count(iter));
+	if (!max_size)
+		return -E2BIG;
+
 	/*
 	 * Each segment in the iov is required to be a block size multiple.
 	 * However, we may not be able to get the entire segment if it spans
@@ -1252,8 +1307,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	 * result to ensure the bio's total size is correct. The remainder of
 	 * the iov data will be picked up in the next bio iteration.
 	 */
-	size = iov_iter_extract_pages(iter, &pages,
-				      UINT_MAX - bio->bi_iter.bi_size,
+	size = iov_iter_extract_pages(iter, &pages, max_size,
 				      nr_pages, extraction_flags, &offset);
 	if (unlikely(size <= 0))
 		return size ? size : -EFAULT;
@@ -1298,6 +1352,46 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	return ret;
 }
 
+/* should only be called before submission */
+static void bio_shrink(struct bio *bio, unsigned bytes)
+{
+	unsigned int size = bio->bi_iter.bi_size;
+	int idx;
+
+	if (bytes >= size)
+		return;
+
+	WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
+
+	idx = bio->bi_vcnt - 1;
+	bio->bi_iter.bi_size -= bytes;
+	while (bytes > 0) {
+		struct bio_vec *bv = &bio->bi_io_vec[idx];
+		unsigned int len = min_t(unsigned, bv->bv_len, bytes);
+
+		bytes -= len;
+		bv->bv_len -= len;
+		if (!bv->bv_len) {
+			bio_release_page(bio, bv->bv_page);
+			idx--;
+		}
+	}
+	WARN_ON_ONCE(idx < 0);
+	bio->bi_vcnt = idx + 1;
+}
+
+static unsigned bio_align_with_io_size(struct bio *bio)
+{
+	unsigned int size = bio->bi_iter.bi_size;
+	unsigned int trim = size & (bdev_max_io_bytes(bio->bi_bdev) - 1);
+
+	if (trim && trim != size) {
+		bio_shrink(bio, trim);
+		return trim;
+	}
+
+	return 0;
+}
+
 /**
  * bio_iov_iter_get_pages - add user or kernel pages to a bio
  * @bio: bio to add pages to
@@ -1337,6 +1431,22 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 		ret = __bio_iov_iter_get_pages(bio, iter);
 	} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
 
+
+	/*
+	 * If we still have data and the bio is full, the bio size may not
+	 * be aligned with the max IO size, and split could then produce
+	 * small bios; avoid that by aligning the bio with the max IO size.
+	 *
+	 * Big-chunk sequential IO workloads benefit from this.
+	 */
+	if (!ret && iov_iter_count(iter) && bio->bi_bdev &&
+			bio_op(bio) != REQ_OP_ZONE_APPEND) {
+		unsigned trim = bio_align_with_io_size(bio);
+
+		if (trim)
+			iov_iter_revert(iter, trim);
+	}
+
 	return bio->bi_vcnt ? 0 : ret;
 }
 EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index eef450f25982..2d275cdc39d8 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1151,6 +1151,11 @@ static inline unsigned queue_logical_block_size(const struct request_queue *q)
 	return retval;
 }
 
+static inline unsigned int bdev_max_io_bytes(struct block_device *bdev)
+{
+	return queue_max_bytes(bdev_get_queue(bdev));
+}
+
 static inline unsigned int bdev_logical_block_size(struct block_device *bdev)
 {
 	return queue_logical_block_size(bdev_get_queue(bdev));
-- 
2.41.0