> --- linux.orig/fs/fs-writeback.c 2011-07-29 22:14:18.000000000 +0800
> +++ linux/fs/fs-writeback.c 2011-07-31 17:04:25.000000000 +0800
> @@ -618,7 +618,12 @@ static long __writeback_inodes_wb(struct
>                  struct super_block *sb = inode->i_sb;
>
>                  if (!grab_super_passive(sb)) {
> -                        requeue_io(inode, wb);
> +                        /*
> +                         * grab_super_passive() may fail consistently due to
> +                         * s_umount being grabbed by someone else. So redirty
> +                         * the inode to avoid busy loop.
> +                         */
> +                        redirty_tail(inode, wb);
>                          continue;
>                  }
>                  wrote += writeback_sb_inodes(sb, wb, work);

Or we could fix it by moving the inode into b_more_io_wait. This avoids
introducing possible delays to the inode, and also makes it possible to
eliminate extra sync works by setting each sync work's
work->older_than_this to the sync() _syscall_ time instead of the
current sync work _execution_ time.

Thanks,
Fengguang
---
Subject: writeback: introduce queue b_more_io_wait
Date: Sun Jul 31 18:44:44 CST 2011

Introduce the b_more_io_wait queue to park inodes that for some reason
cannot be synced immediately. They will be enqueued at the next b_io
refill time and won't be busy-retried the way b_more_io inodes are.

The new data flow after this patchset:

        b_dirty --> b_io --> b_more_io/b_more_io_wait --+
                     ^                                  |
                     |                                  |
                     +----------------------------------+

The rationale is to address two issues:

- the 30s max delay of redirty_tail() may be too long
- redirty_tail() may update i_dirtied_when. With b_more_io_wait, we'll
  be able to avoid extra sync() works by excluding any inode from being
  synced if its dirty time is after the sync() _syscall_ time.

Cc: Jan Kara <jack@xxxxxxx> Cc: Dave Chinner <david@xxxxxxxxxxxxx> Cc: Michael Rubin <mrubin@xxxxxxxxxx> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx> Signed-off-by: Wu Fengguang <fengguang.wu@xxxxxxxxx> --- fs/fs-writeback.c | 10 ++++++++++ include/linux/backing-dev.h | 8 +++++--- mm/backing-dev.c | 10 ++++++++-- 3 files changed, 23 insertions(+), 5 deletions(-) --- linux.orig/fs/fs-writeback.c 2011-07-31 18:39:19.000000000 +0800 +++ linux/fs/fs-writeback.c 2011-07-31 19:03:28.000000000 +0800 @@ -220,6 +220,15 @@ static void requeue_io(struct inode *ino list_move(&inode->i_wb_list, &wb->b_more_io); } +/* + * The inode should be retried in an opportunistic way. + */ +static void requeue_io_wait(struct inode *inode, struct bdi_writeback *wb) +{ + assert_spin_locked(&wb->list_lock); + list_move(&inode->i_wb_list, &wb->b_more_io_wait); +} + static void inode_sync_complete(struct inode *inode) { /* @@ -307,6 +316,7 @@ static void queue_io(struct bdi_writebac int moved; assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); + list_splice_init(&wb->b_more_io_wait, &wb->b_io); moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); trace_writeback_queue_io(wb, older_than_this, moved); } --- linux.orig/include/linux/backing-dev.h 2011-07-31 18:39:20.000000000 +0800 +++ linux/include/linux/backing-dev.h 2011-07-31 18:42:18.000000000 +0800 @@ -58,6 +58,7 @@ struct bdi_writeback { struct list_head b_dirty; /* dirty inodes */ struct list_head b_io; /* parked for writeback */ struct list_head b_more_io; /* parked for more writeback */ + struct list_head b_more_io_wait;/* opportunistic retry io */ spinlock_t list_lock; /* protects the b_* lists */ }; @@ -121,9 +122,10 @@ extern struct list_head bdi_pending_list static inline int wb_has_dirty_io(struct bdi_writeback *wb) { - return !list_empty(&wb->b_dirty) || - !list_empty(&wb->b_io) || - !list_empty(&wb->b_more_io); + return !list_empty(&wb->b_dirty) || + !list_empty(&wb->b_io) || + 
!list_empty(&wb->b_more_io) || + !list_empty(&wb->b_more_io_wait); } static inline void __add_bdi_stat(struct backing_dev_info *bdi, --- linux.orig/mm/backing-dev.c 2011-07-31 18:39:19.000000000 +0800 +++ linux/mm/backing-dev.c 2011-07-31 18:44:26.000000000 +0800 @@ -74,10 +74,10 @@ static int bdi_debug_stats_show(struct s unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; - unsigned long nr_dirty, nr_io, nr_more_io; + unsigned long nr_dirty, nr_io, nr_more_io, nr_more_io_wait; struct inode *inode; - nr_dirty = nr_io = nr_more_io = 0; + nr_dirty = nr_io = nr_more_io = nr_more_io_wait = 0; spin_lock(&wb->list_lock); list_for_each_entry(inode, &wb->b_dirty, i_wb_list) nr_dirty++; @@ -85,6 +85,8 @@ static int bdi_debug_stats_show(struct s nr_io++; list_for_each_entry(inode, &wb->b_more_io, i_wb_list) nr_more_io++; + list_for_each_entry(inode, &wb->b_more_io_wait, i_wb_list) + nr_more_io_wait++; spin_unlock(&wb->list_lock); global_dirty_limits(&background_thresh, &dirty_thresh); @@ -102,6 +104,7 @@ static int bdi_debug_stats_show(struct s "b_dirty: %10lu\n" "b_io: %10lu\n" "b_more_io: %10lu\n" + "b_more_io_wait: %10lu\n" "bdi_list: %10u\n" "state: %10lx\n", (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), @@ -114,6 +117,7 @@ static int bdi_debug_stats_show(struct s nr_dirty, nr_io, nr_more_io, + nr_more_io_wait, !list_empty(&bdi->bdi_list), bdi->state); #undef K @@ -637,6 +641,7 @@ static void bdi_wb_init(struct bdi_write INIT_LIST_HEAD(&wb->b_dirty); INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_more_io); + INIT_LIST_HEAD(&wb->b_more_io_wait); spin_lock_init(&wb->list_lock); setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); } @@ -702,6 +707,7 @@ void bdi_destroy(struct backing_dev_info list_splice(&bdi->wb.b_dirty, &dst->b_dirty); list_splice(&bdi->wb.b_io, &dst->b_io); list_splice(&bdi->wb.b_more_io, &dst->b_more_io); + list_splice(&bdi->wb.b_more_io_wait, &dst->b_more_io_wait); 
spin_unlock(&bdi->wb.list_lock); spin_unlock(&dst->list_lock); } -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html