On Fri, Jan 26, 2024 at 5:36 PM Matthew Wilcox <willy@xxxxxxxxxxxxx> wrote: > > On Fri, Jan 26, 2024 at 05:28:58PM +0800, Zhaoyang Huang wrote: > > On Fri, Jan 26, 2024 at 4:55 PM Matthew Wilcox <willy@xxxxxxxxxxxxx> wrote: > > > > > > On Fri, Jan 26, 2024 at 03:59:48PM +0800, Zhaoyang Huang wrote: > > > > loop more mm and fs guys for more comments > > > > > > I agree with everything Damien said. But also ... > > ok, I will find a way to solve this problem. > > > > > > > > +bool BIO_ADD_FOLIO(struct bio *bio, struct folio *folio, size_t len, > > > > > + size_t off) > > > > > > You don't add any users of these functions. It's hard to assess whether > > > this is the right API when there are no example users. > > Actually, the code has been tested on ext4 and f2fs by patchv2 on a > > v6.6 6GB android system where I get the test result posted on the > > commit message. These APIs is to keep block layer clean and wrap > > things up for fs. > > well, where's patch v2? i don't see it in my inbox. i'm not going > to go hunting around the email lists for it. this is not good enough. > > > > why are BIO_ADD_PAGE and BIO_ADD_FOLIO so very different from each > > > other? > > These two API just repeat the same thing that bio_add_page and > > bio_add_folio do. > > what? > > here's the patch you sent. these two functions do wildly different > things: > > +bool BIO_ADD_FOLIO(struct bio *bio, struct folio *folio, size_t len, > + size_t off) > +{ > + int class, level, hint, activity; > + > + if (len > UINT_MAX || off > UINT_MAX) > + return false; > + > + class = IOPRIO_PRIO_CLASS(bio->bi_ioprio); > + level = IOPRIO_PRIO_LEVEL(bio->bi_ioprio); > + hint = IOPRIO_PRIO_HINT(bio->bi_ioprio); > + activity = IOPRIO_PRIO_ACTIVITY(bio->bi_ioprio); > + > + activity += (bio->bi_vcnt + 1 <= IOPRIO_NR_ACTIVITY && > + PageWorkingset(&folio->page)) ? 
1 : 0; > + if (activity >= bio->bi_vcnt / 2) > + class = IOPRIO_CLASS_RT; > + else if (activity >= bio->bi_vcnt / 4) > + class = max(IOPRIO_PRIO_CLASS(get_current_ioprio()), IOPRIO_CLASS_BE); > + > + bio->bi_ioprio = IOPRIO_PRIO_VALUE_ACTIVITY(class, level, hint, activity); > + > + return bio_add_page(bio, &folio->page, len, off) > 0; > +} > + > +int BIO_ADD_PAGE(struct bio *bio, struct page *page, > + unsigned int len, unsigned int offset) > +{ > + int class, level, hint, activity; > + > + if (bio_add_page(bio, page, len, offset) > 0) { > + class = IOPRIO_PRIO_CLASS(bio->bi_ioprio); > + level = IOPRIO_PRIO_LEVEL(bio->bi_ioprio); > + hint = IOPRIO_PRIO_HINT(bio->bi_ioprio); > + activity = IOPRIO_PRIO_ACTIVITY(bio->bi_ioprio); > + activity += (bio->bi_vcnt <= IOPRIO_NR_ACTIVITY && PageWorkingset(page)) ? 1 : 0; > + bio->bi_ioprio = IOPRIO_PRIO_VALUE_ACTIVITY(class, level, hint, activity); > + } > + > + return len; > +} > > did you change one and forget to change the other? Sorry for leaving you off the recipient list. Please find patch v2 below, where all of the activity calculation is done within __bio_add_page, which aims at avoiding iterating over the bio's bvecs before submit_bio. This was rejected by Jens as it introduces page operations in the block layer. block/Kconfig | 8 ++++++++ block/bio.c | 10 ++++++++++ block/blk-mq.c | 21 +++++++++++++++++++++ fs/buffer.c | 6 ++++++ include/linux/buffer_head.h | 1 + include/uapi/linux/ioprio.h | 20 +++++++++++++++----- 6 files changed, 61 insertions(+), 5 deletions(-) diff --git a/block/Kconfig b/block/Kconfig index f1364d1c0d93..8d6075575eae 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -228,6 +228,14 @@ config BLOCK_HOLDER_DEPRECATED config BLK_MQ_STACKING bool +config CONTENT_ACT_BASED_IOPRIO + bool "Enable content activity based ioprio" + depends on LRU_GEN + default y + help + This item enable the feature of adjust bio's priority by + calculating its content's activity. 
+ source "block/Kconfig.iosched" endif # BLOCK diff --git a/block/bio.c b/block/bio.c index 816d412c06e9..1228e2a4940f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -24,6 +24,7 @@ #include "blk.h" #include "blk-rq-qos.h" #include "blk-cgroup.h" +#include "blk-ioprio.h" #define ALLOC_CACHE_THRESHOLD 16 #define ALLOC_CACHE_MAX 256 @@ -1069,12 +1070,21 @@ EXPORT_SYMBOL_GPL(bio_add_zone_append_page); void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off) { + int class, level, hint, activity; + + class = IOPRIO_PRIO_CLASS(bio->bi_ioprio); + level = IOPRIO_PRIO_LEVEL(bio->bi_ioprio); + hint = IOPRIO_PRIO_HINT(bio->bi_ioprio); + activity = IOPRIO_PRIO_ACTIVITY(bio->bi_ioprio); + WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); WARN_ON_ONCE(bio_full(bio, len)); bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off); bio->bi_iter.bi_size += len; bio->bi_vcnt++; + activity += bio_page_if_active(bio, page, IOPRIO_NR_ACTIVITY); + bio->bi_ioprio = IOPRIO_PRIO_VALUE_ACTIVITY(class, level, hint, activity); } EXPORT_SYMBOL_GPL(__bio_add_page); diff --git a/block/blk-mq.c b/block/blk-mq.c index 1fafd54dce3c..05cdd3adde94 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2939,6 +2939,26 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q, return rq; } +#ifdef CONFIG_CONTENT_ACT_BASED_IOPRIO +static void bio_set_ioprio(struct bio *bio) +{ + int class, level, hint, activity; + + class = IOPRIO_PRIO_CLASS(bio->bi_ioprio); + level = IOPRIO_PRIO_LEVEL(bio->bi_ioprio); + hint = IOPRIO_PRIO_HINT(bio->bi_ioprio); + activity = IOPRIO_PRIO_ACTIVITY(bio->bi_ioprio); + + if (activity >= bio->bi_vcnt / 2) + class = IOPRIO_CLASS_RT; + else if (activity >= bio->bi_vcnt / 4) + class = max(IOPRIO_PRIO_CLASS(get_current_ioprio()), IOPRIO_CLASS_BE); + + bio->bi_ioprio = IOPRIO_PRIO_VALUE_ACTIVITY(class, level, hint, activity); + + blkcg_set_ioprio(bio); +} +#else static void bio_set_ioprio(struct bio *bio) { /* Nobody set ioprio 
so far? Initialize it based on task's nice value */ @@ -2946,6 +2966,7 @@ static void bio_set_ioprio(struct bio *bio) bio->bi_ioprio = get_current_ioprio(); blkcg_set_ioprio(bio); } +#endif /** * blk_mq_submit_bio - Create and send a request to block device. diff --git a/fs/buffer.c b/fs/buffer.c index 12e9a71c693d..b15bff481706 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2832,6 +2832,12 @@ void submit_bh(blk_opf_t opf, struct buffer_head *bh) } EXPORT_SYMBOL(submit_bh); +int bio_page_if_active(struct bio *bio, struct page *page, unsigned short limit) +{ + return (bio->bi_vcnt <= limit && PageWorkingset(page)) ? 1 : 0; +} +EXPORT_SYMBOL(bio_page_if_active); + void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags) { lock_buffer(bh); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 44e9de51eedf..9a374f5965ec 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -248,6 +248,7 @@ int bh_uptodate_or_lock(struct buffer_head *bh); int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait); void __bh_read_batch(int nr, struct buffer_head *bhs[], blk_opf_t op_flags, bool force_lock); +int bio_page_if_active(struct bio *bio, struct page *page, unsigned short limit); /* * Generic address_space_operations implementations for buffer_head-backed diff --git a/include/uapi/linux/ioprio.h b/include/uapi/linux/ioprio.h index bee2bdb0eedb..d1c6081e796b 100644 --- a/include/uapi/linux/ioprio.h +++ b/include/uapi/linux/ioprio.h @@ -71,12 +71,18 @@ enum { * class and level. 
*/ #define IOPRIO_HINT_SHIFT IOPRIO_LEVEL_NR_BITS -#define IOPRIO_HINT_NR_BITS 10 +#define IOPRIO_HINT_NR_BITS 3 #define IOPRIO_NR_HINTS (1 << IOPRIO_HINT_NR_BITS) #define IOPRIO_HINT_MASK (IOPRIO_NR_HINTS - 1) #define IOPRIO_PRIO_HINT(ioprio) \ (((ioprio) >> IOPRIO_HINT_SHIFT) & IOPRIO_HINT_MASK) +#define IOPRIO_ACTIVITY_SHIFT (IOPRIO_HINT_NR_BITS + IOPRIO_LEVEL_NR_BITS) +#define IOPRIO_ACTIVITY_NR_BITS 7 +#define IOPRIO_NR_ACTIVITY (1 << IOPRIO_ACTIVITY_NR_BITS) +#define IOPRIO_ACTIVITY_MASK (IOPRIO_NR_ACTIVITY - 1) +#define IOPRIO_PRIO_ACTIVITY(ioprio) \ + (((ioprio) >> IOPRIO_ACTIVITY_SHIFT) & IOPRIO_ACTIVITY_MASK) /* * I/O hints. */ @@ -108,20 +114,24 @@ enum { * Return an I/O priority value based on a class, a level and a hint. */ static __always_inline __u16 ioprio_value(int prioclass, int priolevel, - int priohint) + int priohint, int activity) { if (IOPRIO_BAD_VALUE(prioclass, IOPRIO_NR_CLASSES) || IOPRIO_BAD_VALUE(priolevel, IOPRIO_NR_LEVELS) || - IOPRIO_BAD_VALUE(priohint, IOPRIO_NR_HINTS)) + IOPRIO_BAD_VALUE(priohint, IOPRIO_NR_HINTS) || + IOPRIO_BAD_VALUE(activity, IOPRIO_NR_ACTIVITY)) return IOPRIO_CLASS_INVALID << IOPRIO_CLASS_SHIFT; return (prioclass << IOPRIO_CLASS_SHIFT) | + (activity << IOPRIO_ACTIVITY_SHIFT) | (priohint << IOPRIO_HINT_SHIFT) | priolevel; } #define IOPRIO_PRIO_VALUE(prioclass, priolevel) \ - ioprio_value(prioclass, priolevel, IOPRIO_HINT_NONE) + ioprio_value(prioclass, priolevel, IOPRIO_HINT_NONE, 0) #define IOPRIO_PRIO_VALUE_HINT(prioclass, priolevel, priohint) \ - ioprio_value(prioclass, priolevel, priohint) + ioprio_value(prioclass, priolevel, priohint, 0) +#define IOPRIO_PRIO_VALUE_ACTIVITY(prioclass, priolevel, priohint, activity) \ + ioprio_value(prioclass, priolevel, priohint, activity) #endif /* _UAPI_LINUX_IOPRIO_H */ > > > These white spaces are trimmed by vim, I will change them back in next version. > > vim doesn't do that by default. >