From: Kapil Karkra <kapil.karkra@xxxxxxxxx> Add the necessary plumbing to take the ioprio hints down to the block layer, from where they flow further down into libata. For reads or direct IO, bio_associate_ioprio (invoked from blk_throtl_bio) copies the ioprio from the current io context into the bio in the submit_bio context. For lazy writes, 3 bits from the page flags are used to record the ioprio in every page associated with a particular IO. Since page flags are scarce, we enable this only on 64-bit platforms. We take the ioprio from the current io context and store it into each page in the grab_cache_page_write_begin function. The bio_associate_ioprio function then walks through all pages and determines the overall best priority to be associated with the bio. The bio carries the io priority further down the IO stack. Signed-off-by: Kapil Karkra <kapil.karkra@xxxxxxxxx> Signed-off-by: Jason B. Akers <jason.b.akers@xxxxxxxxx> --- block/bio.c | 34 ++++++++++++++++++++++++++++++++++ block/blk-throttle.c | 5 +++++ include/linux/bio.h | 1 + include/linux/page-flags.h | 24 ++++++++++++++++++++++++ mm/debug.c | 5 +++++ mm/filemap.c | 18 ++++++++++++++++++ 6 files changed, 87 insertions(+) diff --git a/block/bio.c b/block/bio.c index b93ae04..cc5cc64 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1965,6 +1965,40 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) } EXPORT_SYMBOL(bioset_create); +int bio_associate_ioprio(struct bio *bio) +{ + struct io_context *ioc; + struct bio_vec bv; + struct bvec_iter iter; + int max_ioprio = 0; /* init max_ioprio to 0 (invalid) */ + int advice, ioprio; + + ioc = current->io_context; + if (!ioc) + return -ENOENT; + + /* scan the bio_vecs for this bio and get the highest + * ioprio to use for current + */ + bio_for_each_segment(bv, bio, iter) { + advice = PageGetAdvice(bv.bv_page); + ioprio = IOPRIO_ADVISE(0, 0, advice); + if (ioprio_advice_valid(ioprio)) + max_ioprio = ioprio_best(ioprio, max_ioprio); + } + 
/* set max priority found in all bio_vecs */ + bio_set_prio(bio, max_ioprio); + + /* acquire active ref on @ioc and associate + * also handles the read case + */ + bio_associate_ioc(bio,ioc); + bio_set_prio(bio, ioprio_best(ioc->ioprio, max_ioprio)); + + return 0; +} + /** * bioset_create_nobvec - Create a bio_set without bio_vec mempool * @pool_size: Number of bio to cache in the mempool diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 9273d09..abc33a5 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1484,6 +1484,11 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) struct blkcg *blkcg; bool throttled = false; + /* associate the best ioprio to the bio */ + spin_lock_irq(q->queue_lock); + bio_associate_ioprio(bio); + spin_unlock_irq(q->queue_lock); + /* see throtl_charge_bio() */ if (bio->bi_rw & REQ_THROTTLED) goto out; diff --git a/include/linux/bio.h b/include/linux/bio.h index 8419319..4747c78 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -470,6 +470,7 @@ extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *); extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int); extern unsigned int bvec_nr_vecs(unsigned short idx); +int bio_associate_ioprio(struct bio *bio); int bio_associate_current(struct bio *bio); void bio_disassociate_task(struct bio *bio); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e1f5fcd..8811234 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -109,6 +109,11 @@ enum pageflags { #ifdef CONFIG_TRANSPARENT_HUGEPAGE PG_compound_lock, #endif +#ifdef CONFIG_PAGEFLAGS_EXTENDED + PG_ioprio_advice_0, /* 3 flag bits store ioprio advice */ + PG_ioprio_advice_1, + PG_ioprio_advice_2, +#endif __NR_PAGEFLAGS, /* Filesystems */ @@ -370,6 +375,25 @@ static inline void ClearPageCompound(struct page *page) #define PG_head_mask ((1L << PG_head)) +/* + * ioprio advise is recorded here + */ +static inline void 
PageSetAdvice(struct page *page, unsigned int advice) +{ + page->flags = (page->flags | + ((((advice >> 0) & 1) << PG_ioprio_advice_0) | + (((advice >> 1) & 1) << PG_ioprio_advice_1) | + (((advice >> 2) & 1) << PG_ioprio_advice_2))); +} + +static inline int PageGetAdvice(struct page *page) +{ + unsigned int advice = (((page->flags >> PG_ioprio_advice_0) & 1) | + (((page->flags >> PG_ioprio_advice_1) & 1) << 1) | + (((page->flags >> PG_ioprio_advice_2) & 1) << 2)); + return advice; +} + #else /* * Reduce page flag use as much as possible by overlapping diff --git a/mm/debug.c b/mm/debug.c index 5ce45c9..c785b06 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -48,6 +48,11 @@ static const struct trace_print_flags pageflag_names[] = { #ifdef CONFIG_TRANSPARENT_HUGEPAGE {1UL << PG_compound_lock, "compound_lock" }, #endif +#ifdef CONFIG_PAGEFLAGS_EXTENDED + {1UL << PG_ioprio_advice_0, "ioprio_adv0" }, + {1UL << PG_ioprio_advice_1, "ioprio_adv1" }, + {1UL << PG_ioprio_advice_2, "ioprio_adv2" }, +#endif }; static void dump_flags(unsigned long flags, diff --git a/mm/filemap.c b/mm/filemap.c index 14b4642..f82529d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2438,6 +2438,9 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, { struct page *page; int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT; + struct io_context *ioc; + int advice; + int ioprio; if (flags & AOP_FLAG_NOFS) fgp_flags |= FGP_NOFS; @@ -2448,6 +2451,21 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, if (page) wait_for_stable_page(page); + /* store the ioprio into the page flags */ + if (current && current->io_context) { + ioc = current->io_context; + advice = PageGetAdvice(page); + ioprio = IOPRIO_ADVISE(0, 0, advice); + if (ioprio_advice_valid(ioc->ioprio)) { + if (ioprio_advice_valid(ioprio)) + ioprio = ioprio_best(ioprio, ioc->ioprio); + else + ioprio = ioc->ioprio; + + PageSetAdvice(page, IOPRIO_ADVICE(ioprio)); + } + } + return page; } 
EXPORT_SYMBOL(grab_cache_page_write_begin); -- To unsubscribe from this list: send the line "unsubscribe linux-ide" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html