ps: This commit is verified based on the deadline io scheduler and two blkcg configured[2]. The modified fs files are listed in [2] [1] /dev/blkio # cat background/blkio.prio.class no-change dev/blkio # cat foreground/blkio.prio.class none-to-rt [2] diff --git a/fs/buffer.c b/fs/buffer.c index 12e9a71..ef91374 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -50,6 +50,7 @@ #include <linux/fscrypt.h> #include <linux/fsverity.h> #include <linux/sched/isolation.h> +#include <linux/act_ioprio.h> #include "internal.h" @@ -2823,6 +2824,8 @@ wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size); } + bio_set_active_ioprio(bio); + submit_bio(bio); } diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index a7e6847..53ae283 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -8,6 +8,7 @@ #include <linux/psi.h> #include <linux/cpuhotplug.h> #include <trace/events/erofs.h> +#include <linux/act_ioprio.h> #define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) #define Z_EROFS_INLINE_BVECS 2 @@ -1688,6 +1689,7 @@ if (bio && (cur != last_index + 1 || last_bdev != mdev.m_bdev)) { submit_bio_retry: + bio_set_active_ioprio(bio); submit_bio(bio); if (memstall) { psi_memstall_leave(&pflags); @@ -1729,6 +1731,7 @@ } while (owned_head != Z_EROFS_PCLUSTER_TAIL); if (bio) { + bio_set_active_ioprio(bio); submit_bio(bio); if (memstall) psi_memstall_leave(&pflags); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index dfdd7e5..400b6b5 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -25,6 +25,7 @@ #include <linux/slab.h> #include <linux/mm.h> #include <linux/sched/mm.h> +#include <linux/act_ioprio.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -375,6 +376,7 @@ if (bio) { if (io->io_wbc->sync_mode == WB_SYNC_ALL) io->io_bio->bi_opf |= REQ_SYNC; + bio_set_active_ioprio(io->io_bio); submit_bio(io->io_bio); } io->io_bio = NULL; diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 3e7d160..6374cf5a9 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -43,6 +43,7 @@ #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/pagevec.h> +#include <linux/act_ioprio.h> #include "ext4.h" @@ -351,6 +352,7 @@ if (bio && (last_block_in_bio != blocks[0] - 1 || !fscrypt_mergeable_bio(bio, inode, next_block))) { submit_and_realloc: + bio_set_active_ioprio(bio); submit_bio(bio); bio = NULL; } @@ -377,6 +379,7 @@ if (((map.m_flags & EXT4_MAP_BOUNDARY) && (relative_block == map.m_len)) || (first_hole != blocks_per_page)) { + bio_set_active_ioprio(bio); submit_bio(bio); bio = NULL; } else @@ -384,6 +387,7 @@ continue; confused: if (bio) { + bio_set_active_ioprio(bio); submit_bio(bio); bio = NULL; } @@ -394,8 +398,10 @@ next_page: ; /* A label shall be followed by a statement until C23 */ } - if (bio) + if (bio) { + bio_set_active_ioprio(bio); submit_bio(bio); + } return 0; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2893054..6613e104 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -21,6 +21,7 @@ #include <linux/sched/signal.h> #include <linux/fiemap.h> #include <linux/iomap.h> +#include <linux/act_ioprio.h> #include "f2fs.h" #include "node.h" @@ -525,6 +526,7 @@ trace_f2fs_submit_read_bio(sbi->sb, type, bio); iostat_update_submit_ctx(bio, type); + bio_set_active_ioprio(bio); submit_bio(bio); } @@ -575,6 +577,7 @@ trace_f2fs_submit_write_bio(sbi->sb, type, bio); iostat_update_submit_ctx(bio, type); + bio_set_active_ioprio(bio); submit_bio(bio); } diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 2bc0aa2..4a85690 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -17,6 +17,7 @@ #include <linux/bio.h> #include <linux/sched/signal.h> #include <linux/migrate.h> +#include <linux/act_ioprio.h> #include "trace.h" #include "../internal.h" @@ -369,8 +370,10 @@ gfp_t orig_gfp = gfp; unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE); - if (ctx->bio) + if (ctx->bio) { + bio_set_active_ioprio(ctx->bio); submit_bio(ctx->bio); + } if (ctx->rac) /* same as readahead_gfp_mask */ gfp |= __GFP_NORETRY | __GFP_NOWARN; @@ -423,6 +426,7 @@ folio_set_error(folio); if (ctx.bio) { + bio_set_active_ioprio(ctx.bio); submit_bio(ctx.bio); WARN_ON_ONCE(!ctx.cur_folio_in_bio); } else { @@ -495,8 +499,10 @@ while (iomap_iter(&iter, ops) > 0) iter.processed = iomap_readahead_iter(&iter, &ctx); - if (ctx.bio) + if (ctx.bio) { + bio_set_active_ioprio(ctx.bio); submit_bio(ctx.bio); + } if (ctx.cur_folio) { if (!ctx.cur_folio_in_bio) folio_unlock(ctx.cur_folio); @@ -624,6 +630,7 @@ bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ); bio.bi_iter.bi_sector = iomap_sector(iomap, block_start); bio_add_folio_nofail(&bio, folio, plen, poff); + bio_set_active_ioprio(&bio); return submit_bio_wait(&bio); } @@ -1648,7 +1655,7 @@ bio_endio(ioend->io_bio); return error; } - + bio_set_active_ioprio(ioend->io_bio); submit_bio(ioend->io_bio); return 0; } @@ -1697,6 +1704,7 @@ bio_chain(prev, new); bio_get(prev); /* for iomap_finish_ioend */ + bio_set_active_ioprio(prev); submit_bio(prev); return new; } diff --git a/fs/mpage.c b/fs/mpage.c index 242e213..bb1abe7 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -29,6 +29,7 @@ #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/pagevec.h> +#include <linux/act_ioprio.h> #include "internal.h" /* @@ -79,6 +80,7 @@ { bio->bi_end_io = mpage_read_end_io; guard_bio_eod(bio); + bio_set_active_ioprio(bio); submit_bio(bio); return NULL; } @@ -87,6 +89,7 @@ { bio->bi_end_io = mpage_write_end_io; guard_bio_eod(bio); + bio_set_active_ioprio(bio); submit_bio(bio); return NULL; } On Mon, Feb 5, 2024 at 1:59 PM zhaoyang.huang <zhaoyang.huang@xxxxxxxxxx> wrote: > > From: Zhaoyang Huang <zhaoyang.huang@xxxxxxxxxx> > > Currently, request's ioprio are set via task's schedule priority(when no > blkcg configured), which has high priority tasks possess the privilege on > both of CPU and IO scheduling. Furthermore, most of the write requestes > are launched asynchronosly from kworker which can't know the submitter's > priorities. > This commit works as a hint of original policy by promoting the request > ioprio based on the page/folio's activity. The original idea comes from > LRU_GEN which provides more precised folio activity than before. This > commit try to adjust the request's ioprio when certain part of its folios > are hot, which indicate that this request carry important contents and > need be scheduled ealier. > The filesystem should call bio_set_active_ioprio() before submit_bio on the > spot where they want(buffered read/write/sync etc). > > This commit is verified on a v6.6 6GB RAM android14 system via 4 test cases > by calling bio_set_active_ioprio in erofs, ext4, f2fs and blkdev(raw > partition of gendisk) > > Case 1: > script[a] which get significant improved fault time as expected[b]* > where dd's cost also shrink from 55s to 40s. > (1). fault_latency.bin is an ebpf based test tool which measure all task's > iowait latency during page fault when scheduled out/in. > (2). costmem generate page fault by mmaping a file and access the VA. > (3). dd generate concurrent vfs io. > > [a] > ./fault_latency.bin 1 5 > /data/dd_costmem & > costmem -c0 -a2048000 -b128000 -o0 1>/dev/null & > costmem -c0 -a2048000 -b128000 -o0 1>/dev/null & > costmem -c0 -a2048000 -b128000 -o0 1>/dev/null & > costmem -c0 -a2048000 -b128000 -o0 1>/dev/null & > dd if=/dev/block/sda of=/data/ddtest bs=1024 count=2048000 & > dd if=/dev/block/sda of=/data/ddtest1 bs=1024 count=2048000 & > dd if=/dev/block/sda of=/data/ddtest2 bs=1024 count=2048000 & > dd if=/dev/block/sda of=/data/ddtest3 bs=1024 count=2048000 > [b] > mainline commit > io wait 736us 523us > > * provide correct result for test case 1 in v7 which was compared between > EMMC and UFS wrongly. > > Case 2: > fio -filename=/dev/block/by-name/userdata -rw=randread -direct=0 -bs=4k -size=2000M -numjobs=8 -group_reporting -name=mytest > mainline: 513MiB/s > READ: bw=531MiB/s (557MB/s), 531MiB/s-531MiB/s (557MB/s-557MB/s), io=15.6GiB (16.8GB), run=30137-30137msec > READ: bw=543MiB/s (569MB/s), 543MiB/s-543MiB/s (569MB/s-569MB/s), io=15.6GiB (16.8GB), run=29469-29469msec > READ: bw=474MiB/s (497MB/s), 474MiB/s-474MiB/s (497MB/s-497MB/s), io=15.6GiB (16.8GB), run=33724-33724msec > READ: bw=535MiB/s (561MB/s), 535MiB/s-535MiB/s (561MB/s-561MB/s), io=15.6GiB (16.8GB), run=29928-29928msec > READ: bw=523MiB/s (548MB/s), 523MiB/s-523MiB/s (548MB/s-548MB/s), io=15.6GiB (16.8GB), run=30617-30617msec > READ: bw=492MiB/s (516MB/s), 492MiB/s-492MiB/s (516MB/s-516MB/s), io=15.6GiB (16.8GB), run=32518-32518msec > READ: bw=533MiB/s (559MB/s), 533MiB/s-533MiB/s (559MB/s-559MB/s), io=15.6GiB (16.8GB), run=29993-29993msec > READ: bw=524MiB/s (550MB/s), 524MiB/s-524MiB/s (550MB/s-550MB/s), io=15.6GiB (16.8GB), run=30526-30526msec > READ: bw=529MiB/s (554MB/s), 529MiB/s-529MiB/s (554MB/s-554MB/s), io=15.6GiB (16.8GB), run=30269-30269msec > READ: bw=449MiB/s (471MB/s), 449MiB/s-449MiB/s (471MB/s-471MB/s), io=15.6GiB (16.8GB), run=35629-35629msec > > commit: 633MiB/s > READ: bw=668MiB/s (700MB/s), 668MiB/s-668MiB/s (700MB/s-700MB/s), io=15.6GiB (16.8GB), run=23952-23952msec > READ: bw=589MiB/s (618MB/s), 589MiB/s-589MiB/s (618MB/s-618MB/s), io=15.6GiB (16.8GB), run=27164-27164msec > READ: bw=638MiB/s (669MB/s), 638MiB/s-638MiB/s (669MB/s-669MB/s), io=15.6GiB (16.8GB), run=25071-25071msec > READ: bw=714MiB/s (749MB/s), 714MiB/s-714MiB/s (749MB/s-749MB/s), io=15.6GiB (16.8GB), run=22409-22409msec > READ: bw=600MiB/s (629MB/s), 600MiB/s-600MiB/s (629MB/s-629MB/s), io=15.6GiB (16.8GB), run=26669-26669msec > READ: bw=592MiB/s (621MB/s), 592MiB/s-592MiB/s (621MB/s-621MB/s), io=15.6GiB (16.8GB), run=27036-27036msec > READ: bw=691MiB/s (725MB/s), 691MiB/s-691MiB/s (725MB/s-725MB/s), io=15.6GiB (16.8GB), run=23150-23150msec > READ: bw=569MiB/s (596MB/s), 569MiB/s-569MiB/s (596MB/s-596MB/s), io=15.6GiB (16.8GB), run=28142-28142msec > READ: bw=563MiB/s (590MB/s), 563MiB/s-563MiB/s (590MB/s-590MB/s), io=15.6GiB (16.8GB), run=28429-28429msec > READ: bw=712MiB/s (746MB/s), 712MiB/s-712MiB/s (746MB/s-746MB/s), io=15.6GiB (16.8GB), run=22478-22478msec > > Case 3: > This commit is also verified by the case of launching camera APP which is > usually considered as heavy working load on both of memory and IO, which > shows 12%-24% improvement. > > ttl = 0 ttl = 50 ttl = 100 > mainline 2267ms 2420ms 2316ms > commit 1992ms 1806ms 1998ms > > case 4: > androbench has no improvment as well as regression in RD/WR test item > while make a 3% improvement in sqlite items. > > Signed-off-by: Zhaoyang Huang <zhaoyang.huang@xxxxxxxxxx> > --- > change of v2: calculate page's activity via helper function > change of v3: solve layer violation by move API into mm > change of v4: keep block clean by removing the page related API > change of v5: introduce the macros of bio_add_folio/page for read dir. > change of v6: replace the macro of bio_add_xxx by submit_bio which > iterating the bio_vec before launching bio to block layer > change of v7: introduce the function bio_set_active_ioprio > provide updated test result > --- > --- > block/Kconfig | 15 ++++++++++++ > block/bio.c | 56 +++++++++++++++++++++++++++++++++++++++++++++ > include/linux/bio.h | 1 + > 3 files changed, 72 insertions(+) > > diff --git a/block/Kconfig b/block/Kconfig > index f1364d1c0d93..2dbcd8d92052 100644 > --- a/block/Kconfig > +++ b/block/Kconfig > @@ -228,6 +228,21 @@ config BLOCK_HOLDER_DEPRECATED > config BLK_MQ_STACKING > bool > > +config BLK_CONT_ACT_BASED_IOPRIO > + bool "Enable content activity based ioprio" > + depends on LRU_GEN > + default n > + help > + This item enable the feature of adjust bio's priority by > + calculating its content's activity. > + This feature works as a hint of original bio_set_ioprio > + which means rt task get no change of its bio->bi_ioprio > + while other tasks have the opportunity to raise the ioprio > + if the bio take certain numbers of active pages. > + The file system should use this by modifying their buffered > + read/write/sync function to raise the bio->bi_ioprio before > + calling submit_bio. > + > source "block/Kconfig.iosched" > > endif # BLOCK > diff --git a/block/bio.c b/block/bio.c > index 816d412c06e9..9e2d1e018c98 100644 > --- a/block/bio.c > +++ b/block/bio.c > @@ -1476,6 +1476,62 @@ void bio_set_pages_dirty(struct bio *bio) > } > EXPORT_SYMBOL_GPL(bio_set_pages_dirty); > > +/* > + * bio_set_active_ioprio() is helper function for fs to adjust the bio's ioprio via > + * calculating the content's activity which measured from MGLRU. > + * The file system should call this function before submit_bio for the buffered > + * read/write/sync. > + */ > +#ifdef CONFIG_BLK_CONT_ACT_BASED_IOPRIO > +void bio_set_active_ioprio(struct bio *bio) > +{ > + struct bio_vec bv; > + struct bvec_iter iter; > + struct page *page; > + int class, level, hint; > + int activity = 0; > + int cnt = 0; > + > + class = IOPRIO_PRIO_CLASS(bio->bi_ioprio); > + level = IOPRIO_PRIO_LEVEL(bio->bi_ioprio); > + hint = IOPRIO_PRIO_HINT(bio->bi_ioprio); > + /*apply legacy ioprio policy on RT task*/ > + if (task_is_realtime(current)) { > + bio->bi_ioprio = IOPRIO_PRIO_VALUE_HINT(IOPRIO_CLASS_RT, level, hint); > + return; > + } > + bio_for_each_bvec(bv, bio, iter) { > + page = bv.bv_page; > + activity += PageWorkingset(page) ? 1 : 0; > + cnt++; > + if (activity > bio->bi_vcnt / 2) { > + class = IOPRIO_CLASS_RT; > + break; > + } else if (activity > bio->bi_vcnt / 4) { > + /* > + * all itered pages are all active so far > + * then raise to RT directly > + */ > + if (activity == cnt) { > + class = IOPRIO_CLASS_RT; > + break; > + } else > + class = max(IOPRIO_PRIO_CLASS(get_current_ioprio()), > + IOPRIO_CLASS_BE); > + } > + } > + if (!class && activity > cnt / 2) > + class = IOPRIO_CLASS_RT; > + else if (!class && activity > cnt / 4) > + class = max(IOPRIO_PRIO_CLASS(get_current_ioprio()), IOPRIO_CLASS_BE); > + > + bio->bi_ioprio = IOPRIO_PRIO_VALUE_HINT(class, level, hint); > +} > +#else > +void bio_set_active_ioprio(struct bio *bio) {} > +#endif > +EXPORT_SYMBOL_GPL(bio_set_active_ioprio); > + > /* > * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. > * If they are, then fine. If, however, some pages are clean then they must > diff --git a/include/linux/bio.h b/include/linux/bio.h > index 41d417ee1349..58e33d8b0f5f 100644 > --- a/include/linux/bio.h > +++ b/include/linux/bio.h > @@ -487,6 +487,7 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter); > void __bio_release_pages(struct bio *bio, bool mark_dirty); > extern void bio_set_pages_dirty(struct bio *bio); > extern void bio_check_pages_dirty(struct bio *bio); > +extern void bio_set_active_ioprio(struct bio *bio); > > extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, > struct bio *src, struct bvec_iter *src_iter); > -- > 2.25.1 >