blk_bio_segment_split() is very heavy, but the current fast path covers only single-segment bios under PAGE_SIZE. Add another fast path by estimating an upper bound on the number of sectors a bio can contain: each segment contributes at most two partially-filled sectors, and the remaining bytes fall into full sectors, so bi_vcnt * 2 + (bi_size >> 9) never underestimates the real count.

One restricting factor here is queue_max_segment_size(), which is compared against the full iter size so we don't have to dig into the bvecs. By default it is 64KB, so the new path only covers requests under 64KB, but for bios that satisfy the conditions it is much faster.

Signed-off-by: Pavel Begunkov <asml.silence@xxxxxxxxx>
---
 block/blk-merge.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 84b9635b5d57..15d75f3ffc30 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -226,12 +226,12 @@ static bool bvec_split_segs(const struct request_queue *q,
 
 static struct bio *__blk_bio_segment_split(struct request_queue *q,
 					    struct bio *bio,
 					    struct bio_set *bs,
-					    unsigned *segs)
+					    unsigned *segs,
+					    const unsigned max_sectors)
 {
 	struct bio_vec bv, bvprv, *bvprvp = NULL;
 	struct bvec_iter iter;
 	unsigned nsegs = 0, sectors = 0;
-	const unsigned max_sectors = get_max_io_size(q, bio);
 	const unsigned max_segs = queue_max_segments(q);
 
 	bio_for_each_bvec(bv, bio, iter) {
@@ -295,6 +295,9 @@ static inline struct bio *blk_bio_segment_split(struct request_queue *q,
 						struct bio_set *bs,
 						unsigned *nr_segs)
 {
+	unsigned int max_sectors, q_max_sectors;
+	unsigned int bio_segs = bio->bi_vcnt;
+
 	/*
 	 * All drivers must accept single-segments bios that are <=
 	 * PAGE_SIZE. This is a quick and dirty check that relies on
@@ -303,14 +306,32 @@ static inline struct bio *blk_bio_segment_split(struct request_queue *q,
 	 * are cloned, but compared to the performance impact of cloned
 	 * bios themselves the loop below doesn't matter anyway.
 	 */
-	if (!q->limits.chunk_sectors && bio->bi_vcnt == 1 &&
+	if (!q->limits.chunk_sectors && bio_segs == 1 &&
 	    (bio->bi_io_vec[0].bv_len +
 	     bio->bi_io_vec[0].bv_offset) <= PAGE_SIZE) {
 		*nr_segs = 1;
 		return NULL;
 	}
 
-	return __blk_bio_segment_split(q, bio, bs, nr_segs);
+	q_max_sectors = get_max_io_size(q, bio);
+	if (!queue_virt_boundary(q) && bio_segs < queue_max_segments(q) &&
+	    bio->bi_iter.bi_size <= queue_max_segment_size(q)) {
+		/*
+		 * Segments are contiguous, so only their ends may not be full.
+		 * An upper bound would be to assume each of them takes 1B
+		 * but adds a sector, and all the rest are just full sectors.
+		 * Note: it's ok to round the size down because any not-full
+		 * sectors are accounted for by the first term.
+		 */
+		max_sectors = bio_segs * 2;
+		max_sectors += bio->bi_iter.bi_size >> 9;
+
+		if (max_sectors < q_max_sectors) {
+			*nr_segs = bio_segs;
+			return NULL;
+		}
+	}
+	return __blk_bio_segment_split(q, bio, bs, nr_segs, q_max_sectors);
 }
 
 /**
-- 
2.24.0
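
For readers following the arithmetic, the sketch below (not part of the patch; the function name is illustrative, not a kernel symbol) restates the upper-bound estimate as a standalone C program: every segment may contribute at most two partially-filled sectors, one at each end, and everything else lands in full 512-byte sectors, so the sum can only overestimate.

#include <stdio.h>

/*
 * Worst-case sector count for a bio with nr_segs contiguous segments
 * and size_bytes total payload. Two sectors per segment cover any
 * partially-filled head/tail sectors; rounding size_bytes down is fine
 * because those partial sectors are already counted by the first term.
 */
static unsigned int bio_sectors_upper_bound(unsigned int nr_segs,
					    unsigned int size_bytes)
{
	return nr_segs * 2 + (size_bytes >> 9);
}

int main(void)
{
	/* e.g. a 3-segment, 16KB bio: bound is 3 * 2 + 32 = 38 sectors */
	printf("%u\n", bio_sectors_upper_bound(3, 16 * 1024));
	return 0;
}

If that bound already fits under the queue's max_sectors limit, the bio clearly needs no splitting and the per-bvec walk in __blk_bio_segment_split() can be skipped entirely.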