This transfers dirty pages encountered during page reclaim to the flusher
threads for writeback. The flusher piggybacks more nearby dirty pages onto
the IO, which is more IO efficient and helps clean more pages; a good number
of them are likely to sit in the same LRU list that is being scanned. To
avoid memory allocations during page reclaim, a mempool is created for the
work items. Background/periodic works quit automatically, so that the pages
under reclaim are cleaned as soon as possible. However, the sync work can
still block us for a long time. (A hedged usage sketch for a possible
reclaim-side call site is appended after the patch.)

Signed-off-by: Wu Fengguang <fengguang.wu@xxxxxxxxx>
---
 fs/fs-writeback.c           |  103 +++++++++++++++++++++++++++++++++-
 include/linux/backing-dev.h |    2 
 2 files changed, 102 insertions(+), 3 deletions(-)

--- linux-next.orig/fs/fs-writeback.c	2010-09-13 20:06:09.000000000 +0800
+++ linux-next/fs/fs-writeback.c	2010-09-13 20:24:03.000000000 +0800
@@ -30,11 +30,20 @@
 #include "internal.h"
 
 /*
+ * When flushing an inode page (for page reclaim), try to piggy back more
+ * nearby pages for IO efficiency. These pages will have good opportunity
+ * to be in the same LRU list.
+ */
+#define WRITE_AROUND_PAGES	(1UL << (20 - PAGE_CACHE_SHIFT))
+
+/*
  * Passed into wb_writeback(), essentially a subset of writeback_control
  */
 struct wb_writeback_work {
 	long nr_pages;
 	struct super_block *sb;
+	struct inode *inode;
+	pgoff_t offset;
 	enum writeback_sync_modes sync_mode;
 	unsigned int for_kupdate:1;
 	unsigned int range_cyclic:1;
@@ -59,6 +68,27 @@ struct wb_writeback_work {
  */
 int nr_pdflush_threads;
 
+static mempool_t *wb_work_mempool;
+
+static void *wb_work_alloc(gfp_t gfp_mask, void *pool_data)
+{
+	/*
+	 * bdi_start_inode_writeback() may be called on page reclaim
+	 */
+	if (current->flags & PF_MEMALLOC)
+		return NULL;
+
+	return kmalloc(sizeof(struct wb_writeback_work), gfp_mask);
+}
+
+static __init int wb_work_init(void)
+{
+	wb_work_mempool = mempool_create(1024,
+					 wb_work_alloc, mempool_kfree, NULL);
+	return wb_work_mempool ? 0 : -ENOMEM;
+}
+fs_initcall(wb_work_init);
+
 /**
  * writeback_in_progress - determine whether there is writeback in progress
  * @bdi: the device's backing_dev_info structure.
@@ -101,7 +131,7 @@ __bdi_start_writeback(struct backing_dev
 	 * This is WB_SYNC_NONE writeback, so if allocation fails just
 	 * wakeup the thread for old dirty data writeback
 	 */
-	work = kzalloc(sizeof(*work), GFP_ATOMIC);
+	work = mempool_alloc(wb_work_mempool, GFP_NOWAIT);
 	if (!work) {
 		if (bdi->wb.task) {
 			trace_writeback_nowork(bdi);
@@ -110,6 +140,7 @@ __bdi_start_writeback(struct backing_dev
 		return;
 	}
 
+	memset(work, 0, sizeof(*work));
 	work->sync_mode = WB_SYNC_NONE;
 	work->nr_pages = nr_pages;
 	work->range_cyclic = range_cyclic;
@@ -148,6 +179,55 @@ void bdi_start_background_writeback(stru
 	__bdi_start_writeback(bdi, LONG_MAX, true, true);
 }
 
+int bdi_start_inode_writeback(struct backing_dev_info *bdi,
+			      struct inode *inode, pgoff_t offset)
+{
+	struct wb_writeback_work *work;
+
+	spin_lock_bh(&bdi->wb_lock);
+	list_for_each_entry_reverse(work, &bdi->work_list, list) {
+		unsigned long end;
+		if (work->inode != inode)
+			continue;
+		end = work->offset + work->nr_pages;
+		if (work->offset - offset < WRITE_AROUND_PAGES) {
+			work->nr_pages += work->offset - offset;
+			work->offset = offset;
+			inode = NULL;
+			break;
+		} else if (offset - end < WRITE_AROUND_PAGES) {
+			work->nr_pages += offset - end;
+			inode = NULL;
+			break;
+		} else if (offset > work->offset &&
+			   offset < end) {
+			inode = NULL;
+			break;
+		}
+	}
+	spin_unlock_bh(&bdi->wb_lock);
+
+	if (!inode)
+		return 0;
+
+	if (!igrab(inode))
+		return -ENOENT;
+
+	work = mempool_alloc(wb_work_mempool, GFP_NOWAIT);
+	if (!work)
+		return -ENOMEM;
+
+	memset(work, 0, sizeof(*work));
+	work->sync_mode = WB_SYNC_NONE;
+	work->inode = inode;
+	work->offset = offset;
+	work->nr_pages = 1;
+
+	bdi_queue_work(inode->i_sb->s_bdi, work);
+
+	return 0;
+}
+
 /*
  * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
  * furthest end of its superblock's dirty-inode list.
@@ -724,6 +804,20 @@ get_next_work_item(struct backing_dev_in
 	return work;
 }
 
+static long wb_flush_inode(struct bdi_writeback *wb,
+			   struct wb_writeback_work *work)
+{
+	pgoff_t start = round_down(work->offset, WRITE_AROUND_PAGES);
+	pgoff_t end = round_up(work->offset + work->nr_pages,
+			       WRITE_AROUND_PAGES);
+	int wrote;
+
+	wrote = __filemap_fdatawrite_range(work->inode->i_mapping,
+					   start, end, WB_SYNC_NONE);
+	iput(work->inode);
+	return wrote;
+}
+
 static long wb_check_background_flush(struct bdi_writeback *wb)
 {
 	if (over_bground_thresh()) {
@@ -796,7 +890,10 @@ long wb_do_writeback(struct bdi_writebac
 
 		trace_writeback_exec(bdi, work);
 
-		wrote += wb_writeback(wb, work);
+		if (work->inode)
+			wrote += wb_flush_inode(wb, work);
+		else
+			wrote += wb_writeback(wb, work);
 
 		/*
 		 * Notify the caller of completion if this is a synchronous
@@ -805,7 +902,7 @@ long wb_do_writeback(struct bdi_writebac
 		if (work->done)
 			complete(work->done);
 		else
-			kfree(work);
+			mempool_free(work, wb_work_mempool);
 	}
 
 	/*
--- linux-next.orig/include/linux/backing-dev.h	2010-09-13 19:48:16.000000000 +0800
+++ linux-next/include/linux/backing-dev.h	2010-09-13 20:20:59.000000000 +0800
@@ -106,6 +106,8 @@ void bdi_unregister(struct backing_dev_i
 int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages);
 void bdi_start_background_writeback(struct backing_dev_info *bdi);
+int bdi_start_inode_writeback(struct backing_dev_info *bdi,
+			      struct inode *inode, pgoff_t offset);
 int bdi_writeback_thread(void *data);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
 void bdi_arm_supers_timer(void);
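
For illustration only, here is a minimal sketch of how a reclaim-side caller
might hand a dirty page to the flusher through the new helper. The wrapper
function below is hypothetical and not part of this patch; it only relies on
bdi_start_inode_writeback() as introduced above and on existing page/mapping
fields.

/* Hypothetical reclaim-side caller -- illustration only, not part of this patch */
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>

static int reclaim_queue_page_writeback(struct page *page)
{
	struct address_space *mapping = page_mapping(page);

	if (!mapping || !mapping->host)
		return -EINVAL;

	/*
	 * Ask the flusher to write this page (plus nearby pages, up to
	 * WRITE_AROUND_PAGES) instead of calling ->writepage() directly.
	 * Returns 0 if merged into or queued as a work item, -ENOENT if
	 * the inode is being freed, -ENOMEM if the mempool refused to
	 * allocate (e.g. under PF_MEMALLOC).
	 */
	return bdi_start_inode_writeback(mapping->backing_dev_info,
					 mapping->host, page->index);
}

On -ENOENT or -ENOMEM such a caller would presumably fall back to its
existing writeout path.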