Add a struct bdi_writeback for tracking and handling dirty IO. This is
in preparation for adding > 1 flusher task per bdi.

Signed-off-by: Jens Axboe <jens.axboe@xxxxxxxxxx>
---
 fs/fs-writeback.c           |  145 ++++++++++++++++++++++++++----------
 include/linux/backing-dev.h |   40 +++++++-----
 mm/backing-dev.c            |  128 ++++++++++++++++++++++++++++++--------
 3 files changed, 215 insertions(+), 98 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3b748e7..a238480 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -46,9 +46,11 @@ int nr_pdflush_threads;
  * unless they implement their own. Which is somewhat inefficient, as this
  * may prevent concurrent writeback against multiple devices.
  */
-static int writeback_acquire(struct backing_dev_info *bdi)
+static int writeback_acquire(struct bdi_writeback *wb)
 {
-	return !test_and_set_bit(BDI_pdflush, &bdi->state);
+	struct backing_dev_info *bdi = wb->bdi;
+
+	return !test_and_set_bit(wb->nr, &bdi->wb_active);
 }
 
 /**
@@ -59,19 +61,40 @@ static int writeback_acquire(struct backing_dev_info *bdi)
  */
 int writeback_in_progress(struct backing_dev_info *bdi)
 {
-	return test_bit(BDI_pdflush, &bdi->state);
+	return bdi->wb_active != 0;
 }
 
 /**
  * writeback_release - relinquish exclusive writeback access against a device.
  * @bdi: the device's backing_dev_info structure
  */
-static void writeback_release(struct backing_dev_info *bdi)
+static void writeback_release(struct bdi_writeback *wb)
 {
-	WARN_ON_ONCE(!writeback_in_progress(bdi));
-	bdi->wb_arg.nr_pages = 0;
-	bdi->wb_arg.sb = NULL;
-	clear_bit(BDI_pdflush, &bdi->state);
+	struct backing_dev_info *bdi = wb->bdi;
+
+	wb->nr_pages = 0;
+	wb->sb = NULL;
+	clear_bit(wb->nr, &bdi->wb_active);
+}
+
+static void wb_start_writeback(struct bdi_writeback *wb, struct super_block *sb,
+			       long nr_pages,
+			       enum writeback_sync_modes sync_mode)
+{
+	if (!wb_has_dirty_io(wb))
+		return;
+
+	if (writeback_acquire(wb)) {
+		wb->nr_pages = nr_pages;
+		wb->sb = sb;
+		wb->sync_mode = sync_mode;
+
+		/*
+		 * make above store seen before the task is woken
+		 */
+		smp_mb();
+		wake_up(&wb->wait);
+	}
 }
 
 int bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
@@ -81,22 +104,12 @@ int bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
 	 * This only happens the first time someone kicks this bdi, so put
 	 * it out-of-line.
 	 */
-	if (unlikely(!bdi->task)) {
+	if (unlikely(!bdi->wb.task)) {
 		bdi_add_default_flusher_task(bdi);
 		return 1;
 	}
 
-	if (writeback_acquire(bdi)) {
-		bdi->wb_arg.nr_pages = nr_pages;
-		bdi->wb_arg.sb = sb;
-		bdi->wb_arg.sync_mode = sync_mode;
-		/*
-		 * make above store seen before the task is woken
-		 */
-		smp_mb();
-		wake_up(&bdi->wait);
-	}
-
+	wb_start_writeback(&bdi->wb, sb, nr_pages, sync_mode);
 	return 0;
 }
 
@@ -124,12 +137,12 @@ int bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
  * older_than_this takes precedence over nr_to_write. So we'll only write back
  * all dirty pages if they are all attached to "old" mappings.
  */
-static void bdi_kupdated(struct backing_dev_info *bdi)
+static void wb_kupdated(struct bdi_writeback *wb)
 {
 	unsigned long oldest_jif;
 	long nr_to_write;
 	struct writeback_control wbc = {
-		.bdi = bdi,
+		.bdi = wb->bdi,
 		.sync_mode = WB_SYNC_NONE,
 		.older_than_this = &oldest_jif,
 		.nr_to_write = 0,
@@ -164,15 +177,19 @@ static inline bool over_bground_thresh(void)
 		global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
 }
 
-static void bdi_pdflush(struct backing_dev_info *bdi)
+static void generic_sync_wb_inodes(struct bdi_writeback *wb,
+				   struct super_block *sb,
+				   struct writeback_control *wbc);
+
+static void wb_writeback(struct bdi_writeback *wb)
 {
 	struct writeback_control wbc = {
-		.bdi = bdi,
-		.sync_mode = bdi->wb_arg.sync_mode,
+		.bdi = wb->bdi,
+		.sync_mode = wb->sync_mode,
 		.older_than_this = NULL,
 		.range_cyclic = 1,
 	};
-	long nr_pages = bdi->wb_arg.nr_pages;
+	long nr_pages = wb->nr_pages;
 
 	for (;;) {
 		if (wbc.sync_mode == WB_SYNC_NONE && nr_pages <= 0 &&
@@ -183,7 +200,7 @@ static void bdi_pdflush(struct backing_dev_info *bdi)
 		wbc.encountered_congestion = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 		wbc.pages_skipped = 0;
-		generic_sync_bdi_inodes(bdi->wb_arg.sb, &wbc);
+		generic_sync_wb_inodes(wb, wb->sb, &wbc);
 		nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 		/*
 		 * If we ran out of stuff to write, bail unless more_io got set
@@ -200,13 +217,13 @@
  * Handle writeback of dirty data for the device backed by this bdi. Also
  * wakes up periodically and does kupdated style flushing.
  */
-int bdi_writeback_task(struct backing_dev_info *bdi)
+int bdi_writeback_task(struct bdi_writeback *wb)
 {
 	while (!kthread_should_stop()) {
 		unsigned long wait_jiffies;
 		DEFINE_WAIT(wait);
 
-		prepare_to_wait(&bdi->wait, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(&wb->wait, &wait, TASK_INTERRUPTIBLE);
 		wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
 		schedule_timeout(wait_jiffies);
 		try_to_freeze();
@@ -225,13 +242,13 @@
 		 * pdflush style writeout.
 		 *
 		 */
-		if (writeback_acquire(bdi))
-			bdi_kupdated(bdi);
+		if (writeback_acquire(wb))
+			wb_kupdated(wb);
 		else
-			bdi_pdflush(bdi);
+			wb_writeback(wb);
 
-		writeback_release(bdi);
-		finish_wait(&bdi->wait, &wait);
+		writeback_release(wb);
+		finish_wait(&wb->wait, &wait);
 	}
 
 	return 0;
 }
@@ -252,6 +269,14 @@ void bdi_writeback_all(struct super_block *sb, struct writeback_control *wbc)
 	mutex_unlock(&bdi_lock);
 }
 
+/*
+ * We have only a single wb per bdi, so just return that.
+ */
+static inline struct bdi_writeback *inode_get_wb(struct inode *inode)
+{
+	return &inode_to_bdi(inode)->wb;
+}
+
 /**
  * __mark_inode_dirty - internal function
  * @inode: inode to mark
@@ -350,9 +375,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 		 * reposition it (that would break b_dirty time-ordering).
 		 */
 		if (!was_dirty) {
+			struct bdi_writeback *wb = inode_get_wb(inode);
+
 			inode->dirtied_when = jiffies;
-			list_move(&inode->i_list,
-					&inode_to_bdi(inode)->b_dirty);
+			list_move(&inode->i_list, &wb->b_dirty);
 		}
 	}
 out:
@@ -379,16 +405,16 @@ static int write_inode(struct inode *inode, int sync)
  */
 static void redirty_tail(struct inode *inode)
 {
-	struct backing_dev_info *bdi = inode_to_bdi(inode);
+	struct bdi_writeback *wb = inode_get_wb(inode);
 
-	if (!list_empty(&bdi->b_dirty)) {
+	if (!list_empty(&wb->b_dirty)) {
 		struct inode *tail;
 
-		tail = list_entry(bdi->b_dirty.next, struct inode, i_list);
+		tail = list_entry(wb->b_dirty.next, struct inode, i_list);
 		if (time_before(inode->dirtied_when, tail->dirtied_when))
 			inode->dirtied_when = jiffies;
 	}
-	list_move(&inode->i_list, &bdi->b_dirty);
+	list_move(&inode->i_list, &wb->b_dirty);
 }
 
 /*
@@ -396,7 +422,9 @@ static void redirty_tail(struct inode *inode)
  */
 static void requeue_io(struct inode *inode)
 {
-	list_move(&inode->i_list, &inode_to_bdi(inode)->b_more_io);
+	struct bdi_writeback *wb = inode_get_wb(inode);
+
+	list_move(&inode->i_list, &wb->b_more_io);
 }
 
 static void inode_sync_complete(struct inode *inode)
@@ -443,11 +471,10 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 /*
  * Queue all expired dirty inodes for io, eldest first.
  */
-static void queue_io(struct backing_dev_info *bdi,
-		     unsigned long *older_than_this)
+static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
 {
-	list_splice_init(&bdi->b_more_io, bdi->b_io.prev);
-	move_expired_inodes(&bdi->b_dirty, &bdi->b_io, older_than_this);
+	list_splice_init(&wb->b_more_io, wb->b_io.prev);
+	move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
 }
 
 /*
@@ -608,20 +635,20 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	return __sync_single_inode(inode, wbc);
 }
 
-void generic_sync_bdi_inodes(struct super_block *sb,
-			     struct writeback_control *wbc)
+static void generic_sync_wb_inodes(struct bdi_writeback *wb,
+				   struct super_block *sb,
+				   struct writeback_control *wbc)
 {
 	const int is_blkdev_sb = sb_is_blkdev_sb(sb);
-	struct backing_dev_info *bdi = wbc->bdi;
 	const unsigned long start = jiffies;	/* livelock avoidance */
 
 	spin_lock(&inode_lock);
 
-	if (!wbc->for_kupdate || list_empty(&bdi->b_io))
-		queue_io(bdi, wbc->older_than_this);
+	if (!wbc->for_kupdate || list_empty(&wb->b_io))
+		queue_io(wb, wbc->older_than_this);
 
-	while (!list_empty(&bdi->b_io)) {
-		struct inode *inode = list_entry(bdi->b_io.prev,
+	while (!list_empty(&wb->b_io)) {
+		struct inode *inode = list_entry(wb->b_io.prev,
 						struct inode, i_list);
 		long pages_skipped;
 
@@ -633,7 +660,7 @@ void generic_sync_bdi_inodes(struct super_block *sb,
 			continue;
 		}
 
-		if (!bdi_cap_writeback_dirty(bdi)) {
+		if (!bdi_cap_writeback_dirty(wb->bdi)) {
 			redirty_tail(inode);
 			if (is_blkdev_sb) {
 				/*
@@ -655,7 +682,7 @@
 			continue;
 		}
 
-		if (wbc->nonblocking && bdi_write_congested(bdi)) {
+		if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
 			wbc->encountered_congestion = 1;
 			if (!is_blkdev_sb)
 				break;		/* Skip a congested fs */
@@ -689,7 +716,7 @@
 			wbc->more_io = 1;
 			break;
 		}
-		if (!list_empty(&bdi->b_more_io))
+		if (!list_empty(&wb->b_more_io))
 			wbc->more_io = 1;
 	}
 
@@ -697,6 +724,14 @@
 	spin_unlock(&inode_lock);
 	/* Leave any unwritten inodes on b_io */
 }
 
+void generic_sync_bdi_inodes(struct super_block *sb,
+			     struct writeback_control *wbc)
+{
+	struct backing_dev_info *bdi = wbc->bdi;
+
+	generic_sync_wb_inodes(&bdi->wb, sb, wbc);
+}
+
 /*
  * Write out a superblock's list of dirty inodes. A wait will be performed
  * upon no inodes, all inodes or the final one, depending upon sync_mode.
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 9f040a9..4acc64e 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -24,8 +24,8 @@ struct dentry;
  * Bits in backing_dev_info.state
  */
 enum bdi_state {
-	BDI_pdflush,		/* A pdflush thread is working this device */
 	BDI_pending,		/* On its way to being activated */
+	BDI_wb_alloc,		/* Default embedded wb allocated */
 	BDI_async_congested,	/* The async (write) queue is getting full */
 	BDI_sync_congested,	/* The sync queue is getting full */
 	BDI_unused,		/* Available bits start here */
@@ -41,15 +41,23 @@ enum bdi_stat_item {
 
 #define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
 
-struct bdi_writeback_arg {
-	unsigned long nr_pages;
-	struct super_block *sb;
+struct bdi_writeback {
+	struct backing_dev_info *bdi;	/* our parent bdi */
+	unsigned int nr;
+
+	struct task_struct *task;	/* writeback task */
+	wait_queue_head_t wait;
+	struct list_head b_dirty;	/* dirty inodes */
+	struct list_head b_io;		/* parked for writeback */
+	struct list_head b_more_io;	/* parked for more writeback */
+
+	unsigned long nr_pages;
+	struct super_block *sb;
 	enum writeback_sync_modes sync_mode;
 };
 
 struct backing_dev_info {
 	struct list_head bdi_list;
-
 	unsigned long ra_pages;	/* max readahead in PAGE_CACHE_SIZE units */
 	unsigned long state;	/* Always use atomic bitops on this */
 	unsigned int capabilities; /* Device capabilities */
@@ -66,14 +74,11 @@ struct backing_dev_info {
 	unsigned int min_ratio;
 	unsigned int max_ratio, max_prop_frac;
 
-	struct device *dev;
+	struct bdi_writeback wb;	/* default writeback info for this bdi */
+	unsigned long wb_active;	/* bitmap of active tasks */
+	unsigned long wb_mask;		/* bitmap of registered tasks */
 
-	struct task_struct *task;	/* writeback task */
-	wait_queue_head_t wait;
-	struct bdi_writeback_arg wb_arg;	/* protected by BDI_pdflush */
-	struct list_head b_dirty;	/* dirty inodes */
-	struct list_head b_io;		/* parked for writeback */
-	struct list_head b_more_io;	/* parked for more writeback */
+	struct device *dev;
 
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *debug_dir;
@@ -90,18 +95,19 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 void bdi_unregister(struct backing_dev_info *bdi);
 int bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
 			long nr_pages, enum writeback_sync_modes sync_mode);
-int bdi_writeback_task(struct backing_dev_info *bdi);
+int bdi_writeback_task(struct bdi_writeback *wb);
 void bdi_writeback_all(struct super_block *sb, struct writeback_control *wbc);
 void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
+int bdi_has_dirty_io(struct backing_dev_info *bdi);
 
 extern struct mutex bdi_lock;
 extern struct list_head bdi_list;
 
-static inline int bdi_has_dirty_io(struct backing_dev_info *bdi)
+static inline int wb_has_dirty_io(struct bdi_writeback *wb)
 {
-	return !list_empty(&bdi->b_dirty) ||
-	       !list_empty(&bdi->b_io) ||
-	       !list_empty(&bdi->b_more_io);
+	return !list_empty(&wb->b_dirty) ||
+	       !list_empty(&wb->b_io) ||
+	       !list_empty(&wb->b_more_io);
 }
 
 static inline void __add_bdi_stat(struct backing_dev_info *bdi,
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0df8079..28c6a7d 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -199,10 +199,46 @@ static int __init default_bdi_init(void)
 }
 subsys_initcall(default_bdi_init);
 
+static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
+{
+	memset(wb, 0, sizeof(*wb));
+
+	wb->bdi = bdi;
+	init_waitqueue_head(&wb->wait);
+	INIT_LIST_HEAD(&wb->b_dirty);
+	INIT_LIST_HEAD(&wb->b_io);
+	INIT_LIST_HEAD(&wb->b_more_io);
+}
+
+static int wb_assign_nr(struct backing_dev_info *bdi, struct bdi_writeback *wb)
+{
+	set_bit(0, &bdi->wb_mask);
+	wb->nr = 0;
+	return 0;
+}
+
+static void bdi_put_wb(struct backing_dev_info *bdi, struct bdi_writeback *wb)
+{
+	clear_bit(wb->nr, &bdi->wb_mask);
+	clear_bit(BDI_wb_alloc, &bdi->state);
+}
+
+static struct bdi_writeback *bdi_new_wb(struct backing_dev_info *bdi)
+{
+	struct bdi_writeback *wb;
+
+	set_bit(BDI_wb_alloc, &bdi->state);
+	wb = &bdi->wb;
+	wb_assign_nr(bdi, wb);
+	return wb;
+}
+
 static int bdi_start_fn(void *ptr)
 {
-	struct backing_dev_info *bdi = ptr;
+	struct bdi_writeback *wb = ptr;
+	struct backing_dev_info *bdi = wb->bdi;
 	struct task_struct *tsk = current;
+	int ret;
 
 	/*
 	 * Add us to the active bdi_list
@@ -226,7 +262,15 @@ static int bdi_start_fn(void *ptr)
 	smp_mb__after_clear_bit();
 	wake_up_bit(&bdi->state, BDI_pending);
 
-	return bdi_writeback_task(bdi);
+	ret = bdi_writeback_task(wb);
+
+	bdi_put_wb(bdi, wb);
+	return ret;
+}
+
+int bdi_has_dirty_io(struct backing_dev_info *bdi)
+{
+	return wb_has_dirty_io(&bdi->wb);
 }
 
 static void bdi_flush_io(struct backing_dev_info *bdi)
@@ -244,11 +288,12 @@ static void bdi_flush_io(struct backing_dev_info *bdi)
 
 static int bdi_forker_task(void *ptr)
 {
-	struct backing_dev_info *me = ptr;
+	struct bdi_writeback *me = ptr;
 	DEFINE_WAIT(wait);
 
 	for (;;) {
 		struct backing_dev_info *bdi, *tmp;
+		struct bdi_writeback *wb;
 
 		/*
 		 * Do this periodically, like kupdated() did before.
@@ -259,8 +304,8 @@ static int bdi_forker_task(void *ptr)
 		 * Temporary measure, we want to make sure we don't see
 		 * dirty data on the default backing_dev_info
 		 */
-		if (bdi_has_dirty_io(me))
-			bdi_flush_io(me);
+		if (wb_has_dirty_io(me))
+			bdi_flush_io(me->bdi);
 
 		prepare_to_wait(&me->wait, &wait, TASK_INTERRUPTIBLE);
 
@@ -271,7 +316,7 @@
 		 * a thread registered. If so, set that up.
 		 */
 		list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
-			if (bdi->task || !bdi_has_dirty_io(bdi))
+			if (bdi->wb.task || !bdi_has_dirty_io(bdi))
 				continue;
 
 			bdi_add_default_flusher_task(bdi);
@@ -296,17 +341,22 @@ static int bdi_forker_task(void *ptr)
 		list_del_init(&bdi->bdi_list);
 		mutex_unlock(&bdi_lock);
 
-		BUG_ON(bdi->task);
+		wb = bdi_new_wb(bdi);
+		if (!wb)
+			goto readd_flush;
 
-		bdi->task = kthread_run(bdi_start_fn, bdi, "bdi-%s",
+		wb->task = kthread_run(bdi_start_fn, wb, "bdi-%s",
 					dev_name(bdi->dev));
+
 		/*
 		 * If task creation fails, then readd the bdi to
 		 * the pending list and force writeout of the bdi
 		 * from this forker thread. That will free some memory
 		 * and we can try again.
 		 */
-		if (!bdi->task) {
+		if (!wb->task) {
+			bdi_put_wb(bdi, wb);
+readd_flush:
 			/*
 			 * Add this 'bdi' to the back, so we get
 			 * a chance to flush other bdi's to free
@@ -324,8 +374,18 @@ static int bdi_forker_task(void *ptr)
 	return 0;
 }
 
+/*
+ * Add a new flusher task that gets created for any bdi
+ * that has dirty data pending writeout
+ */
 void bdi_add_default_flusher_task(struct backing_dev_info *bdi)
 {
+	if (!bdi_cap_writeback_dirty(bdi))
+		return;
+
+	/*
+	 * Someone already marked this pending for task creation
+	 */
 	if (test_and_set_bit(BDI_pending, &bdi->state))
 		return;
 
@@ -333,7 +393,7 @@ void bdi_add_default_flusher_task(struct backing_dev_info *bdi)
 	list_move_tail(&bdi->bdi_list, &bdi_pending_list);
 	mutex_unlock(&bdi_lock);
 
-	wake_up(&default_backing_dev_info.wait);
+	wake_up(&default_backing_dev_info.wb.wait);
 }
 
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
@@ -366,13 +426,23 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 	 * on-demand when they need it.
 	 */
 	if (bdi_cap_flush_forker(bdi)) {
-		bdi->task = kthread_run(bdi_forker_task, bdi, "bdi-%s",
+		struct bdi_writeback *wb;
+
+		wb = bdi_new_wb(bdi);
+		if (!wb) {
+			ret = -ENOMEM;
+			goto remove_err;
+		}
+
+		wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s",
 					dev_name(dev));
-		if (!bdi->task) {
+		if (!wb->task) {
+			bdi_put_wb(bdi, wb);
+			ret = -ENOMEM;
+remove_err:
 			mutex_lock(&bdi_lock);
 			list_del(&bdi->bdi_list);
 			mutex_unlock(&bdi_lock);
-			ret = -ENOMEM;
 			goto exit;
 		}
 	}
@@ -395,28 +465,37 @@ static int sched_wait(void *word)
 	return 0;
 }
 
+/*
+ * Remove bdi from global list and shutdown any threads we have running
+ */
 static void bdi_wb_shutdown(struct backing_dev_info *bdi)
 {
+	if (!bdi_cap_writeback_dirty(bdi))
+		return;
+
 	/*
 	 * If setup is pending, wait for that to complete first
 	 */
 	wait_on_bit(&bdi->state, BDI_pending, sched_wait, TASK_UNINTERRUPTIBLE);
 
+	/*
+	 * Make sure nobody finds us on the bdi_list anymore
+	 */
 	mutex_lock(&bdi_lock);
 	list_del(&bdi->bdi_list);
 	mutex_unlock(&bdi_lock);
+
+	/*
+	 * Finally, kill the kernel thread
+	 */
+	kthread_stop(bdi->wb.task);
 }
 
 void bdi_unregister(struct backing_dev_info *bdi)
 {
 	if (bdi->dev) {
-		if (!bdi_cap_flush_forker(bdi)) {
+		if (!bdi_cap_flush_forker(bdi))
 			bdi_wb_shutdown(bdi);
-			if (bdi->task) {
-				kthread_stop(bdi->task);
-				bdi->task = NULL;
-			}
-		}
 		bdi_debug_unregister(bdi);
 		device_unregister(bdi->dev);
 		bdi->dev = NULL;
@@ -433,11 +512,10 @@ int bdi_init(struct backing_dev_info *bdi)
 	bdi->min_ratio = 0;
 	bdi->max_ratio = 100;
 	bdi->max_prop_frac = PROP_FRAC_BASE;
-	init_waitqueue_head(&bdi->wait);
 	INIT_LIST_HEAD(&bdi->bdi_list);
-	INIT_LIST_HEAD(&bdi->b_io);
-	INIT_LIST_HEAD(&bdi->b_dirty);
-	INIT_LIST_HEAD(&bdi->b_more_io);
+	bdi->wb_mask = bdi->wb_active = 0;
+
+	bdi_wb_init(&bdi->wb, bdi);
 
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
 		err = percpu_counter_init(&bdi->bdi_stat[i], 0);
@@ -462,9 +540,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
 {
 	int i;
 
-	WARN_ON(!list_empty(&bdi->b_dirty));
-	WARN_ON(!list_empty(&bdi->b_io));
-	WARN_ON(!list_empty(&bdi->b_more_io));
+	WARN_ON(bdi_has_dirty_io(bdi));
 
 	bdi_unregister(bdi);
 
-- 
1.6.3.rc0.1.gf800
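
As a side note for readers following the series: the central mechanism here is the move from the single BDI_pdflush state bit to a per-writeback bit in bdi->wb_active, claimed with test_and_set_bit() and dropped with clear_bit(), which is what later permits more than one flusher task per bdi. The program below is an editor's illustration, not part of the patch: a self-contained userspace C sketch in which C11 atomic_fetch_or()/atomic_fetch_and() stand in for the kernel bitops, main() is purely demonstrative, and only the structure and field names mirror the patch.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct backing_dev_info;

struct bdi_writeback {
	struct backing_dev_info *bdi;	/* our parent bdi */
	unsigned int nr;		/* our bit in wb_mask/wb_active */
};

struct backing_dev_info {
	atomic_ulong wb_active;		/* bitmap of active tasks */
	unsigned long wb_mask;		/* bitmap of registered tasks */
	struct bdi_writeback wb;	/* default embedded writeback */
};

/* Mirrors writeback_acquire(): true iff we now own this wb exclusively. */
static bool writeback_acquire(struct bdi_writeback *wb)
{
	unsigned long bit = 1UL << wb->nr;

	return !(atomic_fetch_or(&wb->bdi->wb_active, bit) & bit);
}

/* Mirrors writeback_release(): drop the claim so the wb can be reused. */
static void writeback_release(struct bdi_writeback *wb)
{
	atomic_fetch_and(&wb->bdi->wb_active, ~(1UL << wb->nr));
}

int main(void)
{
	struct backing_dev_info bdi = { .wb_mask = 1UL << 0 };

	bdi.wb.bdi = &bdi;
	bdi.wb.nr = 0;

	printf("first acquire:  %d\n", writeback_acquire(&bdi.wb));	/* 1 */
	printf("second acquire: %d\n", writeback_acquire(&bdi.wb));	/* 0 */
	writeback_release(&bdi.wb);
	printf("after release:  %d\n", writeback_acquire(&bdi.wb));	/* 1 */
	return 0;
}

The demo is single threaded, so it omits the ordering the patch needs in wb_start_writeback(), where smp_mb() publishes wb->nr_pages/sb/sync_mode before wake_up() so the woken flusher task is guaranteed to see its new work.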