Re: [PATCHv7 1/1] block: introduce content activity based ioprio

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



ps: This commit was verified with the deadline io scheduler and the two
blkcgs configured as shown in [1]. The modified fs files are listed in [2].

[1]
/dev/blkio # cat background/blkio.prio.class
no-change
/dev/blkio # cat foreground/blkio.prio.class
none-to-rt

[2]

diff --git a/fs/buffer.c b/fs/buffer.c
index 12e9a71..ef91374 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -50,6 +50,7 @@
 #include <linux/fscrypt.h>
 #include <linux/fsverity.h>
 #include <linux/sched/isolation.h>
+#include <linux/act_ioprio.h>

 #include "internal.h"

@@ -2823,6 +2824,8 @@
  wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
  }

+ bio_set_active_ioprio(bio);
+
  submit_bio(bio);
 }

diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index a7e6847..53ae283 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -8,6 +8,7 @@
 #include <linux/psi.h>
 #include <linux/cpuhotplug.h>
 #include <trace/events/erofs.h>
+#include <linux/act_ioprio.h>

 #define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE)
 #define Z_EROFS_INLINE_BVECS 2
@@ -1688,6 +1689,7 @@
  if (bio && (cur != last_index + 1 ||
     last_bdev != mdev.m_bdev)) {
 submit_bio_retry:
+ bio_set_active_ioprio(bio);
  submit_bio(bio);
  if (memstall) {
  psi_memstall_leave(&pflags);
@@ -1729,6 +1731,7 @@
  } while (owned_head != Z_EROFS_PCLUSTER_TAIL);

  if (bio) {
+ bio_set_active_ioprio(bio);
  submit_bio(bio);
  if (memstall)
  psi_memstall_leave(&pflags);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index dfdd7e5..400b6b5 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -25,6 +25,7 @@
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/sched/mm.h>
+#include <linux/act_ioprio.h>

 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -375,6 +376,7 @@
  if (bio) {
  if (io->io_wbc->sync_mode == WB_SYNC_ALL)
  io->io_bio->bi_opf |= REQ_SYNC;
+ bio_set_active_ioprio(io->io_bio);
  submit_bio(io->io_bio);
  }
  io->io_bio = NULL;
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 3e7d160..6374cf5a9 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -43,6 +43,7 @@
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
+#include <linux/act_ioprio.h>

 #include "ext4.h"

@@ -351,6 +352,7 @@
  if (bio && (last_block_in_bio != blocks[0] - 1 ||
     !fscrypt_mergeable_bio(bio, inode, next_block))) {
  submit_and_realloc:
+ bio_set_active_ioprio(bio);
  submit_bio(bio);
  bio = NULL;
  }
@@ -377,6 +379,7 @@
  if (((map.m_flags & EXT4_MAP_BOUNDARY) &&
      (relative_block == map.m_len)) ||
     (first_hole != blocks_per_page)) {
+ bio_set_active_ioprio(bio);
  submit_bio(bio);
  bio = NULL;
  } else
@@ -384,6 +387,7 @@
  continue;
  confused:
  if (bio) {
+ bio_set_active_ioprio(bio);
  submit_bio(bio);
  bio = NULL;
  }
@@ -394,8 +398,10 @@
 next_page:
  ; /* A label shall be followed by a statement until C23 */
  }
- if (bio)
+ if (bio) {
+ bio_set_active_ioprio(bio);
  submit_bio(bio);
+ }
  return 0;
 }

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 2893054..6613e104 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -21,6 +21,7 @@
 #include <linux/sched/signal.h>
 #include <linux/fiemap.h>
 #include <linux/iomap.h>
+#include <linux/act_ioprio.h>

 #include "f2fs.h"
 #include "node.h"
@@ -525,6 +526,7 @@
  trace_f2fs_submit_read_bio(sbi->sb, type, bio);

  iostat_update_submit_ctx(bio, type);
+ bio_set_active_ioprio(bio);
  submit_bio(bio);
 }

@@ -575,6 +577,7 @@

  trace_f2fs_submit_write_bio(sbi->sb, type, bio);
  iostat_update_submit_ctx(bio, type);
+ bio_set_active_ioprio(bio);
  submit_bio(bio);
 }

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 2bc0aa2..4a85690 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -17,6 +17,7 @@
 #include <linux/bio.h>
 #include <linux/sched/signal.h>
 #include <linux/migrate.h>
+#include <linux/act_ioprio.h>
 #include "trace.h"

 #include "../internal.h"
@@ -369,8 +370,10 @@
  gfp_t orig_gfp = gfp;
  unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);

- if (ctx->bio)
+ if (ctx->bio) {
+ bio_set_active_ioprio(ctx->bio);
  submit_bio(ctx->bio);
+ }

  if (ctx->rac) /* same as readahead_gfp_mask */
  gfp |= __GFP_NORETRY | __GFP_NOWARN;
@@ -423,6 +426,7 @@
  folio_set_error(folio);

  if (ctx.bio) {
+ bio_set_active_ioprio(ctx.bio);
  submit_bio(ctx.bio);
  WARN_ON_ONCE(!ctx.cur_folio_in_bio);
  } else {
@@ -495,8 +499,10 @@
  while (iomap_iter(&iter, ops) > 0)
  iter.processed = iomap_readahead_iter(&iter, &ctx);

- if (ctx.bio)
+ if (ctx.bio) {
+ bio_set_active_ioprio(ctx.bio);
  submit_bio(ctx.bio);
+ }
  if (ctx.cur_folio) {
  if (!ctx.cur_folio_in_bio)
  folio_unlock(ctx.cur_folio);
@@ -624,6 +630,7 @@
  bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ);
  bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
  bio_add_folio_nofail(&bio, folio, plen, poff);
+ bio_set_active_ioprio(&bio);
  return submit_bio_wait(&bio);
 }

@@ -1648,7 +1655,7 @@
  bio_endio(ioend->io_bio);
  return error;
  }
-
+ bio_set_active_ioprio(ioend->io_bio);
  submit_bio(ioend->io_bio);
  return 0;
 }
@@ -1697,6 +1704,7 @@

  bio_chain(prev, new);
  bio_get(prev); /* for iomap_finish_ioend */
+ bio_set_active_ioprio(prev);
  submit_bio(prev);
  return new;
 }
diff --git a/fs/mpage.c b/fs/mpage.c
index 242e213..bb1abe7 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -29,6 +29,7 @@
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
+#include <linux/act_ioprio.h>
 #include "internal.h"

 /*
@@ -79,6 +80,7 @@
 {
  bio->bi_end_io = mpage_read_end_io;
  guard_bio_eod(bio);
+ bio_set_active_ioprio(bio);
  submit_bio(bio);
  return NULL;
 }
@@ -87,6 +89,7 @@
 {
  bio->bi_end_io = mpage_write_end_io;
  guard_bio_eod(bio);
+ bio_set_active_ioprio(bio);
  submit_bio(bio);
  return NULL;
 }

On Mon, Feb 5, 2024 at 1:59 PM zhaoyang.huang <zhaoyang.huang@xxxxxxxxxx> wrote:
>
> From: Zhaoyang Huang <zhaoyang.huang@xxxxxxxxxx>
>
> Currently, a request's ioprio is set via the task's scheduling priority (when
> no blkcg is configured), which gives high-priority tasks privilege in
> both CPU and IO scheduling. Furthermore, most write requests
> are launched asynchronously from kworkers, which cannot know the submitter's
> priority.
> This commit works as a hint on top of the original policy by promoting the
> request's ioprio based on the page/folio's activity. The original idea comes
> from LRU_GEN, which provides more precise folio activity than before. This
> commit tries to adjust the request's ioprio when a certain portion of its
> folios are hot, which indicates that the request carries important content
> and needs to be scheduled earlier.
> The filesystem should call bio_set_active_ioprio() before submit_bio() at the
> spots where it wants this (buffered read/write/sync etc.).
>
> This commit is verified on a v6.6 6GB RAM android14 system via 4 test cases
> by calling bio_set_active_ioprio in erofs, ext4, f2fs and blkdev(raw
> partition of gendisk)
>
> Case 1:
> Script [a] gets a significantly improved fault time, as expected [b]*,
> and dd's cost also shrinks from 55s to 40s.
> (1). fault_latency.bin is an eBPF-based test tool which measures every task's
>    iowait latency during page faults when scheduled out/in.
> (2). costmem generates page faults by mmapping a file and accessing the VA.
> (3). dd generates concurrent vfs io.
>
> [a]
> ./fault_latency.bin 1 5 > /data/dd_costmem &
> costmem -c0 -a2048000 -b128000 -o0 1>/dev/null &
> costmem -c0 -a2048000 -b128000 -o0 1>/dev/null &
> costmem -c0 -a2048000 -b128000 -o0 1>/dev/null &
> costmem -c0 -a2048000 -b128000 -o0 1>/dev/null &
> dd if=/dev/block/sda of=/data/ddtest bs=1024 count=2048000 &
> dd if=/dev/block/sda of=/data/ddtest1 bs=1024 count=2048000 &
> dd if=/dev/block/sda of=/data/ddtest2 bs=1024 count=2048000 &
> dd if=/dev/block/sda of=/data/ddtest3 bs=1024 count=2048000
> [b]
>                        mainline         commit
> io wait                736us            523us
>
> * v7 provides the correct result for test case 1, which was previously
> compared wrongly between EMMC and UFS.
>
> Case 2:
> fio -filename=/dev/block/by-name/userdata -rw=randread -direct=0 -bs=4k -size=2000M -numjobs=8 -group_reporting -name=mytest
> mainline: 513MiB/s
> READ: bw=531MiB/s (557MB/s), 531MiB/s-531MiB/s (557MB/s-557MB/s), io=15.6GiB (16.8GB), run=30137-30137msec
> READ: bw=543MiB/s (569MB/s), 543MiB/s-543MiB/s (569MB/s-569MB/s), io=15.6GiB (16.8GB), run=29469-29469msec
> READ: bw=474MiB/s (497MB/s), 474MiB/s-474MiB/s (497MB/s-497MB/s), io=15.6GiB (16.8GB), run=33724-33724msec
> READ: bw=535MiB/s (561MB/s), 535MiB/s-535MiB/s (561MB/s-561MB/s), io=15.6GiB (16.8GB), run=29928-29928msec
> READ: bw=523MiB/s (548MB/s), 523MiB/s-523MiB/s (548MB/s-548MB/s), io=15.6GiB (16.8GB), run=30617-30617msec
> READ: bw=492MiB/s (516MB/s), 492MiB/s-492MiB/s (516MB/s-516MB/s), io=15.6GiB (16.8GB), run=32518-32518msec
> READ: bw=533MiB/s (559MB/s), 533MiB/s-533MiB/s (559MB/s-559MB/s), io=15.6GiB (16.8GB), run=29993-29993msec
> READ: bw=524MiB/s (550MB/s), 524MiB/s-524MiB/s (550MB/s-550MB/s), io=15.6GiB (16.8GB), run=30526-30526msec
> READ: bw=529MiB/s (554MB/s), 529MiB/s-529MiB/s (554MB/s-554MB/s), io=15.6GiB (16.8GB), run=30269-30269msec
> READ: bw=449MiB/s (471MB/s), 449MiB/s-449MiB/s (471MB/s-471MB/s), io=15.6GiB (16.8GB), run=35629-35629msec
>
> commit: 633MiB/s
> READ: bw=668MiB/s (700MB/s), 668MiB/s-668MiB/s (700MB/s-700MB/s), io=15.6GiB (16.8GB), run=23952-23952msec
> READ: bw=589MiB/s (618MB/s), 589MiB/s-589MiB/s (618MB/s-618MB/s), io=15.6GiB (16.8GB), run=27164-27164msec
> READ: bw=638MiB/s (669MB/s), 638MiB/s-638MiB/s (669MB/s-669MB/s), io=15.6GiB (16.8GB), run=25071-25071msec
> READ: bw=714MiB/s (749MB/s), 714MiB/s-714MiB/s (749MB/s-749MB/s), io=15.6GiB (16.8GB), run=22409-22409msec
> READ: bw=600MiB/s (629MB/s), 600MiB/s-600MiB/s (629MB/s-629MB/s), io=15.6GiB (16.8GB), run=26669-26669msec
> READ: bw=592MiB/s (621MB/s), 592MiB/s-592MiB/s (621MB/s-621MB/s), io=15.6GiB (16.8GB), run=27036-27036msec
> READ: bw=691MiB/s (725MB/s), 691MiB/s-691MiB/s (725MB/s-725MB/s), io=15.6GiB (16.8GB), run=23150-23150msec
> READ: bw=569MiB/s (596MB/s), 569MiB/s-569MiB/s (596MB/s-596MB/s), io=15.6GiB (16.8GB), run=28142-28142msec
> READ: bw=563MiB/s (590MB/s), 563MiB/s-563MiB/s (590MB/s-590MB/s), io=15.6GiB (16.8GB), run=28429-28429msec
> READ: bw=712MiB/s (746MB/s), 712MiB/s-712MiB/s (746MB/s-746MB/s), io=15.6GiB (16.8GB), run=22478-22478msec
>
> Case 3:
> This commit is also verified by launching the camera APP, which is
> usually considered a heavy workload on both memory and IO, and which
> shows a 12%-24% improvement.
>
>                 ttl = 0         ttl = 50        ttl = 100
> mainline        2267ms          2420ms          2316ms
> commit          1992ms          1806ms          1998ms
>
> Case 4:
> androbench shows neither improvement nor regression in the RD/WR test items,
> while showing a 3% improvement in the sqlite items.
>
> Signed-off-by: Zhaoyang Huang <zhaoyang.huang@xxxxxxxxxx>
> ---
> change of v2: calculate page's activity via helper function
> change of v3: solve layer violation by move API into mm
> change of v4: keep block clean by removing the page related API
> change of v5: introduce the macros of bio_add_folio/page for read dir.
> change of v6: replace the bio_add_xxx macros by submit_bio, which iterates
>                 the bio_vec before launching the bio to the block layer
> change of v7: introduce the function bio_set_active_ioprio
>               provide updated test result
> ---
> ---
>  block/Kconfig       | 15 ++++++++++++
>  block/bio.c         | 56 +++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/bio.h |  1 +
>  3 files changed, 72 insertions(+)
>
> diff --git a/block/Kconfig b/block/Kconfig
> index f1364d1c0d93..2dbcd8d92052 100644
> --- a/block/Kconfig
> +++ b/block/Kconfig
> @@ -228,6 +228,21 @@ config BLOCK_HOLDER_DEPRECATED
>  config BLK_MQ_STACKING
>         bool
>
> +config BLK_CONT_ACT_BASED_IOPRIO
> +       bool "Enable content activity based ioprio"
> +       depends on LRU_GEN
> +       default n
> +       help
> +         This item enable the feature of adjust bio's priority by
> +         calculating its content's activity.
> +         This feature works as a hint of original bio_set_ioprio
> +         which means rt task get no change of its bio->bi_ioprio
> +         while other tasks have the opportunity to raise the ioprio
> +         if the bio take certain numbers of active pages.
> +         The file system should use this by modifying their buffered
> +         read/write/sync function to raise the bio->bi_ioprio before
> +         calling submit_bio.
> +
>  source "block/Kconfig.iosched"
>
>  endif # BLOCK
> diff --git a/block/bio.c b/block/bio.c
> index 816d412c06e9..9e2d1e018c98 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -1476,6 +1476,62 @@ void bio_set_pages_dirty(struct bio *bio)
>  }
>  EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
>
> +/*
> + * bio_set_active_ioprio() is helper function for fs to adjust the bio's ioprio via
> + * calculating the content's activity which measured from MGLRU.
> + * The file system should call this function before submit_bio for the buffered
> + * read/write/sync.
> + */
> +#ifdef CONFIG_BLK_CONT_ACT_BASED_IOPRIO
> +void bio_set_active_ioprio(struct bio *bio)
> +{
> +       struct bio_vec bv;
> +       struct bvec_iter iter;
> +       struct page *page;
> +       int class, level, hint;
> +       int activity = 0;
> +       int cnt = 0;
> +
> +       class = IOPRIO_PRIO_CLASS(bio->bi_ioprio);
> +       level = IOPRIO_PRIO_LEVEL(bio->bi_ioprio);
> +       hint = IOPRIO_PRIO_HINT(bio->bi_ioprio);
> +       /*apply legacy ioprio policy on RT task*/
> +       if (task_is_realtime(current)) {
> +               bio->bi_ioprio = IOPRIO_PRIO_VALUE_HINT(IOPRIO_CLASS_RT, level, hint);
> +               return;
> +       }
> +       bio_for_each_bvec(bv, bio, iter) {
> +               page = bv.bv_page;
> +               activity += PageWorkingset(page) ? 1 : 0;
> +               cnt++;
> +               if (activity > bio->bi_vcnt / 2) {
> +                       class = IOPRIO_CLASS_RT;
> +                       break;
> +               } else if (activity > bio->bi_vcnt / 4) {
> +                       /*
> +                        * all itered pages are all active so far
> +                        * then raise to RT directly
> +                        */
> +                       if (activity == cnt) {
> +                               class = IOPRIO_CLASS_RT;
> +                               break;
> +                       } else
> +                               class = max(IOPRIO_PRIO_CLASS(get_current_ioprio()),
> +                                               IOPRIO_CLASS_BE);
> +               }
> +       }
> +       if (!class && activity > cnt / 2)
> +               class = IOPRIO_CLASS_RT;
> +       else if (!class && activity > cnt / 4)
> +               class = max(IOPRIO_PRIO_CLASS(get_current_ioprio()), IOPRIO_CLASS_BE);
> +
> +       bio->bi_ioprio = IOPRIO_PRIO_VALUE_HINT(class, level, hint);
> +}
> +#else
> +void bio_set_active_ioprio(struct bio *bio) {}
> +#endif
> +EXPORT_SYMBOL_GPL(bio_set_active_ioprio);
> +
>  /*
>   * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
>   * If they are, then fine.  If, however, some pages are clean then they must
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index 41d417ee1349..58e33d8b0f5f 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -487,6 +487,7 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter);
>  void __bio_release_pages(struct bio *bio, bool mark_dirty);
>  extern void bio_set_pages_dirty(struct bio *bio);
>  extern void bio_check_pages_dirty(struct bio *bio);
> +extern void bio_set_active_ioprio(struct bio *bio);
>
>  extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
>                                struct bio *src, struct bvec_iter *src_iter);
> --
> 2.25.1
>





[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [NTFS 3]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [NTFS 3]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux