> On Thu, 2021-01-21 at 18:36 +0900, Changheun Lee wrote: > > > Please drop the "." at the end of the patch title. > > > > > > > bio size can grow up to 4GB when muli-page bvec is enabled. > > > > but sometimes it would lead to inefficient behaviors. > > > > in case of large chunk direct I/O, - 32MB chunk read in user space - > > > > all pages for 32MB would be merged to a bio structure if memory address > > > > is > > > > continued phsycally. it makes some delay to submit until merge complete. > > > > > > s/if memory address is continued phsycally/if the pages physical addresses > > > are > > > contiguous/ > > > > > > > bio max size should be limited as a proper size. > > > > > > s/as/to/ > > > > Thank you for advice. :) > > > > > > > > > > > > > When 32MB chunk read with direct I/O option is coming from userspace, > > > > kernel behavior is below now. it's timeline. > > > > > > > > | bio merge for 32MB. total 8,192 pages are merged. > > > > | total elapsed time is over 2ms. > > > > |------------------ ... ----------------------->| > > > > | 8,192 pages merged a > > > > bio. > > > > | at this time, first > > > > bio submit is done. > > > > | 1 bio is split to 32 > > > > read request and issue. > > > > |---------------> > > > > |---------------> > > > > |---------------> > > > > ...... > > > > |----- > > > > ----------> > > > > |---- > > > > ----------->| > > > > total 19ms elapsed to complete 32MB read done > > > > from device. | > > > > > > > > If bio max size is limited with 1MB, behavior is changed below. > > > > > > > > | bio merge for 1MB. 256 pages are merged for each bio. > > > > | total 32 bio will be made. > > > > | total elapsed time is over 2ms. it's same. > > > > | but, first bio submit timing is fast. about 100us. > > > > |--->|--->|--->|---> ... -->|--->|--->|--->|--->| > > > > | 256 pages merged a bio. > > > > | at this time, first bio submit is done. > > > > | and 1 read request is issued for 1 bio. 
> > > > |---------------> > > > > |---------------> > > > > |---------------> > > > > ...... > > > > |---------------> > > > > |--------------->| > > > > total 17ms elapsed to complete 32MB read done from device. | > > > > > > > > As a result, read request issue timing is faster if bio max size is > > > > limited. > > > > Current kernel behavior with multipage bvec, super large bio can be > > > > created. > > > > And it lead to delay first I/O request issue. > > > > > > > > Signed-off-by: Changheun Lee <nanich.lee@xxxxxxxxxxx> > > > > > > > > --- > > > > block/bio.c | 17 ++++++++++++++++- > > > > include/linux/bio.h | 13 +++---------- > > > > include/linux/blk_types.h | 1 + > > > > 3 files changed, 20 insertions(+), 11 deletions(-) > > > > > > > > diff --git a/block/bio.c b/block/bio.c > > > > index 1f2cc1fbe283..027503c2e2e7 100644 > > > > --- a/block/bio.c > > > > +++ b/block/bio.c > > > > @@ -284,9 +284,24 @@ void bio_init(struct bio *bio, struct bio_vec > > > > *table, > > > > > > > > bio->bi_io_vec = table; > > > > bio->bi_max_vecs = max_vecs; > > > > + bio->bi_max_size = UINT_MAX; > > > > } > > > > EXPORT_SYMBOL(bio_init); > > > > > > > > +void bio_set_dev(struct bio *bio, struct block_device *bdev) > > > > +{ > > > > + if (bio->bi_disk != bdev->bd_disk) > > > > + bio_clear_flag(bio, BIO_THROTTLED); > > > > + > > > > + bio->bi_disk = bdev->bd_disk; > > > > + bio->bi_partno = bdev->bd_partno; > > > > + bio->bi_max_size = blk_queue_get_max_sectors(bio->bi_disk- > > > > >queue, > > > > + bio_op(bio)) << SECTOR_SHIFT; > > > > + > > > > + bio_associate_blkg(bio); > > > > +} > > > > +EXPORT_SYMBOL(bio_set_dev); > > > > + > > > > /** > > > > * bio_reset - reinitialize a bio > > > > * @bio: bio to reset > > > > @@ -877,7 +892,7 @@ bool __bio_try_merge_page(struct bio *bio, struct > > > > page *page, > > > > struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; > > > > > > > > if (page_is_mergeable(bv, page, len, off, same_page)) { > > > > - if (bio->bi_iter.bi_size 
> UINT_MAX - len) { > > > > + if (bio->bi_iter.bi_size > bio->bi_max_size - > > > > len) > > > > *same_page = false; > > > > return false; > > > > } > > > > diff --git a/include/linux/bio.h b/include/linux/bio.h > > > > index 1edda614f7ce..b9803e80c259 100644 > > > > --- a/include/linux/bio.h > > > > +++ b/include/linux/bio.h > > > > @@ -113,7 +113,7 @@ static inline bool bio_full(struct bio *bio, unsigned > > > > len) > > > > if (bio->bi_vcnt >= bio->bi_max_vecs) > > > > return true; > > > > > > > > - if (bio->bi_iter.bi_size > UINT_MAX - len) > > > > + if (bio->bi_iter.bi_size > bio->bi_max_size - len) > > > > return true; > > > > > > > > return false; > > > > @@ -482,20 +482,13 @@ extern struct bio_vec *bvec_alloc(gfp_t, int, > > > > unsigned long *, mempool_t *); > > > > extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int); > > > > extern unsigned int bvec_nr_vecs(unsigned short idx); > > > > extern const char *bio_devname(struct bio *bio, char *buffer); > > > > - > > > > -#define bio_set_dev(bio, bdev) \ > > > > -do { \ > > > > - if ((bio)->bi_disk != (bdev)->bd_disk) \ > > > > - bio_clear_flag(bio, BIO_THROTTLED);\ > > > > - (bio)->bi_disk = (bdev)->bd_disk; \ > > > > - (bio)->bi_partno = (bdev)->bd_partno; \ > > > > - bio_associate_blkg(bio); \ > > > > -} while (0) > > > > +extern void bio_set_dev(struct bio *bio, struct block_device *bdev); > > > > > > > > #define bio_copy_dev(dst, src) \ > > > > do { \ > > > > (dst)->bi_disk = (src)->bi_disk; \ > > > > (dst)->bi_partno = (src)->bi_partno; \ > > > > + (dst)->bi_max_size = (src)->bi_max_size;\ > > > > bio_clone_blkg_association(dst, src); \ > > > > } while (0) > > > > > > > > diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h > > > > index 866f74261b3b..e5dd5b7d8fc1 100644 > > > > --- a/include/linux/blk_types.h > > > > +++ b/include/linux/blk_types.h > > > > @@ -270,6 +270,7 @@ struct bio { > > > > */ > > > > > > > > unsigned short bi_max_vecs; /* max bvl_vecs we can > > > > 
hold */ > > > > + unsigned int bi_max_size; /* max data size we can > > > > hold */ > > > > > > > > atomic_t __bi_cnt; /* pin count */ > > > > > > This modification comes at the cost of increasing the bio structure size to > > > simply tell the block layer "do not delay BIO splitting"... > > > > > > I think there is a much simpler approach. What about: > > > > > > 1) Use a request queue flag to indicate "limit BIO size" > > > 2) modify __bio_try_merge_page() to look at that flag to disallow page > > > merging > > > if the bio size exceeds blk_queue_get_max_sectors(), or more ideally a > > > version > > > of it that takes into account the bio start sector. > > > 3) Set the "limit bio size" queue flag in the driver of the device that > > > benefit > > > from this change. Eventually, that could also be controlled through sysfs. > > > > > > With such change, you will get the same result without having to increase > > > the > > > BIO structure size. > > > > I have a question. > > Is adding a new variable in bio not possible? > > It is possible, but since it is a critical kernel resource used a lot, keeping > it as small as possible for performance reasons is strongly desired. So if > there is a coding scheme that can avoid increasing struct bio size, it should > be explored first and discarded only with very good reasons. I see your point. I agree with you. It's an important thing. :) > > > Additional check for every page merge like as below is inefficient I think. > > For the general case of devices that do not care about limiting the bio size > (like now), this will add one boolean evaluation (queue flag test). That's it. > For your case, sure you now have 2 boolean evals instead of one. But that must > be put in perspective with the cost of increasing the bio size. > > > > > bool __bio_try_merge_page(struct bio *bio, struct page *page, > > unsigned int len, unsigned int off, bool *same_page) > > { > > ... 
> > if (page_is_mergeable(bv, page, len, off, same_page)) { > > if (bio->bi_iter.bi_size > UINT_MAX - len) { > > *same_page = false; > > return false; > > } > > > > + if (blk_queue_limit_bio_max_size(bio) && > > + (bio->bi_iter.bi_size > > > blk_queue_get_bio_max_size(bio) - len)) { > > + *same_page = false; > > + return false; > > + } > > > > bv->bv_len += len; > > bio->bi_iter.bi_size += len; > > return true; > > } > > ... > > } > > > > > > static inline bool bio_full(struct bio *bio, unsigned len) > > { > > ... > > if (bio->bi_iter.bi_size > UINT_MAX - len) > > return true; > > > > + if (blk_queue_limit_bio_max_size(bio) && > > + (bio->bi_iter.bi_size > blk_queue_get_bio_max_size(bio) - len)) > > + return true; > > ... > > } > > > > > > Page merge is CPU-bound job as you said. > > How about below with adding of bi_max_size in bio? > > I am not a fan of adding a bio field for using it only in one place. > This is only my opinion. I will let others comment about this, but personally > I would rather do something like this: > > #define blk_queue_limit_bio_merge_size(q) \ > test_bit(QUEUE_FLAG_LIMIT_MERGE, &(q)->queue_flags) > > static inline unsigned int bio_max_merge_size(struct bio *bio) > { > struct request_queue *q = bio->bi_disk->queue; > > if (blk_queue_limit_bio_merge_size(q)) > return blk_queue_get_max_sectors(q, bio_op(bio)) > << SECTOR_SHIFT; > return UINT_MAX; > } > > and use that helper in __bio_try_merge_page(), e.g.: > > if (bio->bi_iter.bi_size > bio_max_merge_size(bio) - len) { > *same_page = false; > return false; > } > > No need to change the bio struct. > > If you measure performance with and without this change on nullblk, you can > verify if it has any impact for regular devices. And for your use case, that > should give you the same performance. > OK. I'll wait for others' comments too for a few days. I'll prepare a v3 patch as you like if there is no feedback. :) The v2 patch already has a compile error caused by my mistyping. 
:( > > > > bool __bio_try_merge_page(struct bio *bio, struct page *page, > > unsigned int len, unsigned int off, bool *same_page) > > { > > ... > > if (page_is_mergeable(bv, page, len, off, same_page)) { > > - if (bio->bi_iter.bi_size > UINT_MAX - len) { > > + if (bio->bi_iter.bi_size > bio->bi_max_size - len) { > > *same_page = false; > > return false; > > } > > > > bv->bv_len += len; > > bio->bi_iter.bi_size += len; > > return true; > > } > > ... > > } > > > > > > static inline bool bio_full(struct bio *bio, unsigned len) > > { > > ... > > - if (bio->bi_iter.bi_size > UINT_MAX - len) > > + if (bio->bi_iter.bi_size > bio->bi_max_size - len) > > return true; > > ... > > } > > > > +void bio_set_dev(struct bio *bio, struct block_device *bdev) > > +{ > > + if (bio->bi_disk != bdev->bd_disk) > > + bio_clear_flag(bio, BIO_THROTTLED); > > + > > + bio->bi_disk = bdev->bd_disk; > > + bio->bi_partno = bdev->bd_partno; > > + if (blk_queue_limit_bio_max_size(bio)) > > + bio->bi_max_size = blk_queue_get_bio_max_size(bio); > > + > > + bio_associate_blkg(bio); > > +} > > +EXPORT_SYMBOL(bio_set_dev); > > > > > -- > > > Damien Le Moal > > > Western Digital Research > > > > --- > > Changheun Lee > > Samsung Electronics > > -- > Damien Le Moal > Western Digital > --- Changheun Lee Samsung Electronics