[PATCH] block: add split_alignment for request queue

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This feature allows the user to control the alignment at which request
queue is allowed to split bios. Google CloudSQL's 16k user space
application expects that direct io writes aligned at 16k boundary in
the user-space are not split by kernel at non-16k boundaries. More
details about this feature can be found in CloudSQL's Cloud Next 2018
presentation[1]. The underlying block device is capable of performing
16k aligned writes atomically. Thus, this allows the user-space SQL
application to avoid double-writes (to protect against partial
failures) which are very costly provided that these writes are not
split at non-16k boundary by any underlying layers.

We make use of Ext4's bigalloc feature to ensure that writes issued by
Ext4 are 16k aligned. But, 16K aligned data writes may get merged with
contiguous non-16k aligned Ext4 metadata writes. Such a write request
would be broken by the kernel only guaranteeing that the individually
split requests are physical block size aligned.

We started observing a significant increase in 16k unaligned splits in
5.4. Bisect points to commit 07173c3ec276cbb18dc0e0687d37d310e98a1480
("block: enable multipage bvecs"). This patch enables multipage bvecs
resulting in multiple 16k aligned writes issued by the user-space to
be merged into one big IO at first. Later, __blk_queue_split() splits
these IOs while trying to align individual split IOs to be physical
block size.

Newly added split_alignment parameter is the alignment at which the
request queue is allowed to split an IO request. By default this
alignment is turned off and current behavior is unchanged.

[1] CloudNext'18 "Optimizing performance for Cloud SQL for MySQL"
    https://www.youtube.com/watch?v=gIeuiGg-_iw

Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@xxxxxxxxx>
---
 Documentation/block/queue-sysfs.rst |  8 ++++
 block/blk-merge.c                   | 64 ++++++++++++++++++++++-------
 block/blk-sysfs.c                   | 30 ++++++++++++++
 include/linux/blkdev.h              |  1 +
 4 files changed, 88 insertions(+), 15 deletions(-)

diff --git a/Documentation/block/queue-sysfs.rst b/Documentation/block/queue-sysfs.rst
index 6a8513af9201..c3eaba149415 100644
--- a/Documentation/block/queue-sysfs.rst
+++ b/Documentation/block/queue-sysfs.rst
@@ -251,4 +251,12 @@ devices are described in the ZBC (Zoned Block Commands) and ZAC
 do not support zone commands, they will be treated as regular block devices
 and zoned will report "none".
 
+split_alignment (RW)
+----------------------
+This is the alignment in bytes at which the request queue is allowed
+to split IO requests. Once this value is set, the request queue
+splits IOs such that the individual IOs are aligned to
+split_alignment. The value of 0 indicates that an IO request can be
+split anywhere. This value must be a power of 2.
+
 Jens Axboe <jens.axboe@xxxxxxxxxx>, February 2009
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 1534ed736363..cdf337c74b83 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -105,15 +105,18 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
 static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
 		struct bio *bio, struct bio_set *bs, unsigned *nsegs)
 {
+	sector_t split;
+
 	*nsegs = 0;
 
-	if (!q->limits.max_write_zeroes_sectors)
-		return NULL;
+	split = q->limits.max_write_zeroes_sectors;
+	if (split && q->split_alignment >> 9)
+		split = round_down(split, q->split_alignment >> 9);
 
-	if (bio_sectors(bio) <= q->limits.max_write_zeroes_sectors)
+	if (!split || bio_sectors(bio) <= split)
 		return NULL;
 
-	return bio_split(bio, q->limits.max_write_zeroes_sectors, GFP_NOIO, bs);
+	return bio_split(bio, split, GFP_NOIO, bs);
 }
 
 static struct bio *blk_bio_write_same_split(struct request_queue *q,
@@ -121,15 +124,18 @@ static struct bio *blk_bio_write_same_split(struct request_queue *q,
 					    struct bio_set *bs,
 					    unsigned *nsegs)
 {
+	sector_t split;
+
 	*nsegs = 1;
 
-	if (!q->limits.max_write_same_sectors)
-		return NULL;
+	split = q->limits.max_write_same_sectors;
+	if (split && q->split_alignment >> 9)
+		split = round_down(split, q->split_alignment >> 9);
 
-	if (bio_sectors(bio) <= q->limits.max_write_same_sectors)
+	if (!split || bio_sectors(bio) <= split)
 		return NULL;
 
-	return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs);
+	return bio_split(bio, split, GFP_NOIO, bs);
 }
 
 /*
@@ -248,7 +254,10 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 {
 	struct bio_vec bv, bvprv, *bvprvp = NULL;
 	struct bvec_iter iter;
-	unsigned nsegs = 0, sectors = 0;
+	unsigned int nsegs = 0, nsegs_aligned = 0;
+	unsigned int sectors = 0, sectors_aligned = 0, before = 0, after = 0;
+	unsigned int sector_alignment =
+		q->split_alignment ? (q->split_alignment >> 9) : 0;
 	const unsigned max_sectors = get_max_io_size(q, bio);
 	const unsigned max_segs = queue_max_segments(q);
 
@@ -264,12 +273,31 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 		    sectors + (bv.bv_len >> 9) <= max_sectors &&
 		    bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
 			nsegs++;
-			sectors += bv.bv_len >> 9;
-		} else if (bvec_split_segs(q, &bv, &nsegs, &sectors, max_segs,
-					 max_sectors)) {
-			goto split;
+			before = round_down(sectors, sector_alignment);
+			sectors += (bv.bv_len >> 9);
+			after = round_down(sectors, sector_alignment);
+			if (sector_alignment && before != after) {
+				/* This is a valid split point */
+				nsegs_aligned = nsegs;
+				sectors_aligned = after;
+			}
+			goto next;
 		}
-
+		if (sector_alignment) {
+			before = round_down(sectors, sector_alignment);
+			after = round_down(sectors + (bv.bv_len >> 9),
+					  sector_alignment);
+			if ((nsegs < max_segs) && before != after &&
+			    ((after - before) << 9) + bv.bv_offset <=  PAGE_SIZE
+			    && after <= max_sectors) {
+				sectors_aligned = after;
+				nsegs_aligned = nsegs + 1;
+			}
+		}
+		if (bvec_split_segs(q, &bv, &nsegs, &sectors, max_segs,
+				    max_sectors))
+			goto split;
+next:
 		bvprv = bv;
 		bvprvp = &bvprv;
 	}
@@ -278,7 +306,13 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 	return NULL;
 split:
 	*segs = nsegs;
-	return bio_split(bio, sectors, GFP_NOIO, bs);
+	if (sector_alignment && sectors_aligned == 0)
+		return NULL;
+
+	*segs = sector_alignment ? nsegs_aligned : nsegs;
+
+	return bio_split(bio, sector_alignment ? sectors_aligned : sectors,
+			 GFP_NOIO, bs);
 }
 
 /**
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index fca9b158f4a0..f045c7a79a74 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -529,6 +529,29 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page)
 	return queue_var_show(blk_queue_dax(q), page);
 }
 
+static ssize_t queue_split_alignment_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(q->split_alignment, page);
+}
+
+static ssize_t queue_split_alignment_store(struct request_queue *q, const char *page,
+						size_t count)
+{
+	unsigned long split_alignment;
+	int ret;
+
+	ret = queue_var_store(&split_alignment, page, count);
+	if (ret < 0)
+		return ret;
+
+	/* split_alignment can only be a power of 2 */
+	if (split_alignment & (split_alignment - 1))
+		return -EINVAL;
+
+	q->split_alignment = split_alignment;
+	return count;
+}
+
 static struct queue_sysfs_entry queue_requests_entry = {
 	.attr = {.name = "nr_requests", .mode = 0644 },
 	.show = queue_requests_show,
@@ -727,6 +750,12 @@ static struct queue_sysfs_entry throtl_sample_time_entry = {
 };
 #endif
 
+static struct queue_sysfs_entry queue_split_alignment = {
+	.attr = {.name = "split_alignment", .mode = 0644 },
+	.show = queue_split_alignment_show,
+	.store = queue_split_alignment_store,
+};
+
 static struct attribute *queue_attrs[] = {
 	&queue_requests_entry.attr,
 	&queue_ra_entry.attr,
@@ -766,6 +795,7 @@ static struct attribute *queue_attrs[] = {
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 	&throtl_sample_time_entry.attr,
 #endif
+	&queue_split_alignment.attr,
 	NULL,
 };
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 32868fbedc9e..e8feb43f6fdd 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -473,6 +473,7 @@ struct request_queue {
 	void			*dma_drain_buffer;
 	unsigned int		dma_pad_mask;
 	unsigned int		dma_alignment;
+	unsigned int		split_alignment;
 
 	unsigned int		rq_timeout;
 	int			poll_nsec;
-- 
2.27.0.290.gba653c62da-goog




[Index of Archives]     [Linux RAID]     [Linux SCSI]     [Linux ATA RAID]     [IDE]     [Linux Wireless]     [Linux Kernel]     [ATH6KL]     [Linux Bluetooth]     [Linux Netdev]     [Kernel Newbies]     [Security]     [Git]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Device Mapper]

  Powered by Linux