On Tue, Apr 07 2009, Jens Axboe wrote: > BTW, with the increased number of sync IO and unplugging, it makes sense > to soon look into some finer granularity of plugging. If we didn't have > so many single page submission paths it would not be as big a problem, > but we do. And since they still persist so many years after we added > functionality to pass bigger IOs, it likely won't be much better in the > future either. > > So we can either look into doing per io context plugging, or doing > something similar to: > > plugctx = blk_get_plug_context(); > ... > submit_bio_plug(rw, bio, plugctx); > ... > submit_bio_plug(rw, bio, plugctx); > ... > blk_submit_plug_context(plugctx); > > and pass that down through wbc, perhaps. Dunno, just a thought. > Basically a work-around for not having a dedicated writepages() that > does the right thing (ext3 anyone?). Here's a quick mockup. It compiles, but that's about all the usage it has seen so far :-) diff --git a/block/blk-core.c b/block/blk-core.c index 43fdedc..5cf416c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1567,6 +1567,17 @@ void submit_bio(int rw, struct bio *bio) } EXPORT_SYMBOL(submit_bio); +void submit_bio_plug(int rw, struct bio *bio, struct blk_plug_ctx *ctx) +{ + if (ctx) { + bio->bi_rw |= rw; + bio->bi_next = ctx->bio_list; + ctx->bio_list = bio; + } else + submit_bio(rw, bio); +} +EXPORT_SYMBOL(submit_bio_plug); + /** * blk_rq_check_limits - Helper function to check a request for the queue limit * @q: the queue diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 012f065..e4313e3 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -101,6 +101,8 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); INIT_HLIST_HEAD(&ret->cic_list); ret->ioc_data = NULL; + ret->plug_ctx.bio_list = NULL; + ret->plug_ctx.state = 0; } return ret; @@ -171,6 +173,55 @@ void copy_io_context(struct io_context **pdst, struct io_context **psrc) } 
EXPORT_SYMBOL(copy_io_context); +struct blk_plug_ctx *blk_get_plug_context(void) +{ + struct io_context *ioc; + + ioc = current_io_context(GFP_ATOMIC, -1); + if (!ioc) + return NULL; + + if (!test_and_set_bit_lock(0, &ioc->plug_ctx.state)) /* bit 0 of state marks the context as claimed; released in blk_submit_plug_context() */ + return &ioc->plug_ctx; + + return NULL; /* context already claimed: callers fall back to unplugged submission */ +} + +static void __blk_submit_plug_context(struct blk_plug_ctx *ctx) +{ + struct block_device *bdev = NULL; /* last device whose bio asked for an unplug */ + struct bio *bio; + + while ((bio = ctx->bio_list) != NULL) { /* pop bios off the head of the list one at a time */ + ctx->bio_list = bio->bi_next; + bio->bi_next = NULL; + + if (bdev && bdev != bio->bi_bdev) /* moving on to another device: kick the previous one */ + blk_unplug(bdev_get_queue(bdev)); + + if (bio_unplug(bio)) + bdev = bio->bi_bdev; + + bio->bi_rw &= ~(1 << BIO_RW_UNPLUG); /* clear the unplug request in bi_rw, the word handed to submit_bio() below; the original cleared this bit in bi_flags instead */ + + submit_bio(bio->bi_rw, bio); + } /* NOTE(review): the device last remembered in bdev is not unplugged after the loop - confirm whether a final blk_unplug() is intended */ +} + +void blk_submit_plug_context(struct blk_plug_ctx *ctx) +{ + if (ctx) { + __blk_submit_plug_context(ctx); + clear_bit_unlock(0, &ctx->state); /* release the claim taken in blk_get_plug_context() */ + } +} + +void blk_flush_plug_context(struct blk_plug_ctx *ctx) +{ + if (ctx) + __blk_submit_plug_context(ctx); /* drain the queued bios but keep the context claimed */ +} + static int __init blk_ioc_init(void) { iocontext_cachep = kmem_cache_create("blkdev_ioc", diff --git a/fs/buffer.c b/fs/buffer.c index 6e35762..2ed21b8 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1698,7 +1698,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh(write_op, bh); + submit_bh_plug(write_op, bh, wbc->plug); nr_underway++; } bh = next; @@ -2884,8 +2884,10 @@ static void end_bio_bh_io_sync(struct bio *bio, int err) bio_put(bio); } -int submit_bh(int rw, struct buffer_head * bh) +static int __submit_bh(int rw, struct buffer_head * bh, + struct blk_plug_ctx *ctx) { + gfp_t gfp = ctx ? 
GFP_ATOMIC : GFP_NOIO; struct bio *bio; int ret = 0; @@ -2910,7 +2912,12 @@ int submit_bh(int rw, struct buffer_head * bh) * from here on down, it's all bio -- do the initial mapping, * submit_bio -> generic_make_request may further map this bio around */ - bio = bio_alloc(GFP_NOIO, 1); + bio = bio_alloc(gfp, 1); + if (!bio) { /* the GFP_ATOMIC attempt (plugged case) failed */ + blk_flush_plug_context(ctx); /* push out the bios already queued on the plug context */ + bio = bio_alloc(GFP_NOIO, 1); /* retry with GFP_NOIO and keep the result; the original dropped it and dereferenced a NULL bio below */ + ctx = NULL; /* submit this bio directly instead of queueing it */ + } bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; @@ -2926,7 +2933,8 @@ int submit_bh(int rw, struct buffer_head * bh) bio->bi_private = bh; bio_get(bio); - submit_bio(rw, bio); + + submit_bio_plug(rw, bio, ctx); /* NOTE(review): with a non-NULL ctx the bio is only queued here, so the BIO_EOPNOTSUPP check below runs before it has been submitted - confirm */ if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; @@ -2935,6 +2943,16 @@ int submit_bh(int rw, struct buffer_head * bh) return ret; } +int submit_bh(int rw, struct buffer_head *bh) +{ + return __submit_bh(rw, bh, NULL); /* no plug context: the bio goes straight to submit_bio() */ +} + +int submit_bh_plug(int rw, struct buffer_head *bh, struct blk_plug_ctx *ctx) +{ + return __submit_bh(rw, bh, ctx); /* ctx may be NULL, which behaves like submit_bh() */ +} + /** * ll_rw_block: low-level access to block devices (DEPRECATED) * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead) diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 7b73bb8..a8eec18 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -183,6 +183,7 @@ void __lock_buffer(struct buffer_head *bh); void ll_rw_block(int, int, struct buffer_head * bh[]); int sync_dirty_buffer(struct buffer_head *bh); int submit_bh(int, struct buffer_head *); +int submit_bh_plug(int, struct buffer_head *, struct blk_plug_ctx *); void write_boundary_block(struct block_device *bdev, sector_t bblock, unsigned blocksize); int bh_uptodate_or_lock(struct buffer_head *bh); diff --git a/include/linux/fs.h b/include/linux/fs.h index bce40a2..8a0c4b5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2117,7 +2117,9 @@ extern void file_move(struct file *f, struct list_head *list); extern void file_kill(struct file *f); #ifdef 
CONFIG_BLOCK struct bio; +struct blk_plug_ctx; extern void submit_bio(int, struct bio *); +extern void submit_bio_plug(int, struct bio *, struct blk_plug_ctx *); extern int bdev_read_only(struct block_device *); #endif extern int set_blocksize(struct block_device *, int); diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 08b987b..38c8a2c 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -3,6 +3,7 @@ #include <linux/radix-tree.h> #include <linux/rcupdate.h> +#include <linux/list.h> /* * This is the per-process anticipatory I/O scheduler state. @@ -59,6 +60,11 @@ struct cfq_io_context { struct rcu_head rcu_head; }; +struct blk_plug_ctx { + struct bio *bio_list; /* head of the singly linked list of queued bios, chained through bi_next by submit_bio_plug() */ + unsigned long state; /* bit 0 is set while the context is claimed through blk_get_plug_context() */ +}; + /* * I/O subsystem state of the associated processes. It is refcounted * and kmalloc'ed. These could be shared between processes. @@ -83,6 +89,8 @@ struct io_context { struct radix_tree_root radix_root; struct hlist_head cic_list; void *ioc_data; + + struct blk_plug_ctx plug_ctx; /* embedded plug context handed out by blk_get_plug_context() */ }; static inline struct io_context *ioc_task_link(struct io_context *ioc) @@ -105,7 +113,17 @@ void exit_io_context(void); struct io_context *get_io_context(gfp_t gfp_flags, int node); struct io_context *alloc_io_context(gfp_t gfp_flags, int node); void copy_io_context(struct io_context **pdst, struct io_context **psrc); +struct blk_plug_ctx *blk_get_plug_context(void); +void blk_submit_plug_context(struct blk_plug_ctx *); +void blk_flush_plug_context(struct blk_plug_ctx *); /* NOTE(review): no stub for this one in the #else branch below - confirm none is needed */ #else +static inline void blk_submit_plug_context(struct blk_plug_ctx *ctx) +{ +} +static inline struct blk_plug_ctx *blk_get_plug_context(void) +{ + return NULL; /* this branch never hands out a plug context */ +} static inline void exit_io_context(void) { } diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 9344547..8b5c14a 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -6,6 +6,7 @@ #include <linux/sched.h> #include <linux/fs.h> +#include <linux/iocontext.h> struct backing_dev_info; @@ 
-40,6 +41,7 @@ enum writeback_sync_modes { struct writeback_control { struct backing_dev_info *bdi; /* If !NULL, only write back this queue */ + struct blk_plug_ctx *plug; enum writeback_sync_modes sync_mode; unsigned long *older_than_this; /* If !NULL, only write back inodes older than this */ diff --git a/mm/filemap.c b/mm/filemap.c index 2e2d38e..d521830 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -218,7 +218,9 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, if (!mapping_cap_writeback_dirty(mapping)) return 0; + wbc.plug = blk_get_plug_context(); ret = do_writepages(mapping, &wbc); + blk_submit_plug_context(wbc.plug); return ret; } -- Jens Axboe -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html