Jan, I tried the combined patch below on top of the ioless one, and found some minor regressions. I studied the thresh=1G/ext3-1dd case in particular and found that nr_writeback and the iostat avgrq-sz drop from time to time. I'll try to bisect the changeset. 3.1.0-rc9-ioless-full-next-20111014+ 3.1.0-rc9-ioless-full-more_io_wait-next-20111014+ ------------------------ ------------------------ 56.47 -0.6% 56.13 thresh=100M/btrfs-10dd-4k-8p-4096M-100M:10-X 56.28 -0.4% 56.07 thresh=100M/btrfs-1dd-4k-8p-4096M-100M:10-X 56.11 -0.1% 56.05 thresh=100M/btrfs-2dd-4k-8p-4096M-100M:10-X 37.86 +1.8% 38.54 thresh=100M/ext3-10dd-4k-8p-4096M-100M:10-X 45.91 +0.7% 46.22 thresh=100M/ext3-1dd-4k-8p-4096M-100M:10-X 41.87 +0.8% 42.19 thresh=100M/ext3-2dd-4k-8p-4096M-100M:10-X 45.68 -0.4% 45.50 thresh=100M/ext4-10dd-4k-8p-4096M-100M:10-X 55.74 -2.2% 54.51 thresh=100M/ext4-1dd-4k-8p-4096M-100M:10-X 46.20 -4.8% 43.98 thresh=100M/xfs-10dd-4k-8p-4096M-100M:10-X 55.72 +0.1% 55.76 thresh=100M/xfs-1dd-4k-8p-4096M-100M:10-X 54.01 -2.0% 52.94 thresh=100M/xfs-2dd-4k-8p-4096M-100M:10-X 55.08 -1.0% 54.52 thresh=1G/btrfs-100dd-4k-8p-4096M-1024M:10-X 55.49 -1.0% 54.94 thresh=1G/btrfs-10dd-4k-8p-4096M-1024M:10-X 55.38 -2.7% 53.91 thresh=1G/btrfs-1dd-4k-8p-4096M-1024M:10-X 36.70 -1.5% 36.15 thresh=1G/ext3-100dd-4k-8p-4096M-1024M:10-X 40.64 -5.9% 38.25 thresh=1G/ext3-10dd-4k-8p-4096M-1024M:10-X 48.65 -6.9% 45.30 thresh=1G/ext3-1dd-4k-8p-4096M-1024M:10-X 49.84 -3.2% 48.23 thresh=1G/ext4-100dd-4k-8p-4096M-1024M:10-X 56.03 -3.3% 54.21 thresh=1G/ext4-10dd-4k-8p-4096M-1024M:10-X 57.42 -2.3% 56.07 thresh=1G/ext4-1dd-4k-8p-4096M-1024M:10-X 45.74 -1.4% 45.12 thresh=1G/xfs-100dd-4k-8p-4096M-1024M:10-X 54.19 -0.5% 53.94 thresh=1G/xfs-10dd-4k-8p-4096M-1024M:10-X 55.93 -0.5% 55.66 thresh=1G/xfs-1dd-4k-8p-4096M-1024M:10-X 2.77 +27.8% 3.54 thresh=1M/btrfs-10dd-4k-8p-4096M-1M:10-X 2.20 -15.5% 1.86 thresh=1M/btrfs-1dd-4k-8p-4096M-1M:10-X 2.42 -1.3% 2.39 thresh=1M/btrfs-2dd-4k-8p-4096M-1M:10-X 28.91 +1.9% 29.47 
thresh=1M/ext3-10dd-4k-8p-4096M-1M:10-X 45.02 +1.1% 45.50 thresh=1M/ext3-1dd-4k-8p-4096M-1M:10-X 40.91 +0.4% 41.09 thresh=1M/ext3-2dd-4k-8p-4096M-1M:10-X 31.82 +2.3% 32.56 thresh=1M/ext4-10dd-4k-8p-4096M-1M:10-X 52.33 -0.9% 51.85 thresh=1M/ext4-1dd-4k-8p-4096M-1M:10-X 28.43 +1.2% 28.77 thresh=1M/xfs-10dd-4k-8p-4096M-1M:10-X 52.93 -3.8% 50.90 thresh=1M/xfs-1dd-4k-8p-4096M-1M:10-X 46.87 -0.0% 46.85 thresh=1M/xfs-2dd-4k-8p-4096M-1M:10-X 54.54 -1.3% 53.82 thresh=8M/btrfs-10dd-4k-8p-4096M-8M:10-X 56.60 -1.4% 55.80 thresh=8M/btrfs-1dd-4k-8p-4096M-8M:10-X 56.21 -0.4% 55.96 thresh=8M/btrfs-2dd-4k-8p-4096M-8M:10-X 32.54 +0.2% 32.62 thresh=8M/ext3-10dd-4k-8p-4096M-8M:10-X 46.01 -1.0% 45.55 thresh=8M/ext3-1dd-4k-8p-4096M-8M:10-X 44.13 -0.6% 43.87 thresh=8M/ext3-2dd-4k-8p-4096M-8M:10-X 35.78 -0.4% 35.63 thresh=8M/ext4-10dd-4k-8p-4096M-8M:10-X 55.29 +0.2% 55.38 thresh=8M/ext4-1dd-4k-8p-4096M-8M:10-X 31.21 -0.8% 30.96 thresh=8M/xfs-10dd-4k-8p-4096M-8M:10-X 54.10 -0.3% 53.95 thresh=8M/xfs-1dd-4k-8p-4096M-8M:10-X 46.97 +0.5% 47.20 thresh=8M/xfs-2dd-4k-8p-4096M-8M:10-X 2010.92 -1.1% 1989.70 TOTAL write_bw --- linux-next.orig/fs/fs-writeback.c 2011-10-20 19:26:37.000000000 +0800 +++ linux-next/fs/fs-writeback.c 2011-10-20 20:00:22.000000000 +0800 @@ -234,6 +234,15 @@ static void requeue_io(struct inode *ino list_move(&inode->i_wb_list, &wb->b_more_io); } +/* + * The inode should be retried in an opportunistic way. 
+ */ +static void requeue_io_wait(struct inode *inode, struct bdi_writeback *wb) +{ + assert_spin_locked(&wb->list_lock); + list_move(&inode->i_wb_list, &wb->b_more_io_wait); +} + static void inode_sync_complete(struct inode *inode) { /* @@ -321,6 +330,7 @@ static void queue_io(struct bdi_writebac int moved; assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); + list_splice_init(&wb->b_more_io_wait, &wb->b_io); moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work); trace_writeback_queue_io(wb, work, moved); } @@ -470,7 +480,7 @@ writeback_single_inode(struct inode *ino * retrying writeback of the dirty page/inode * that cannot be performed immediately. */ - redirty_tail(inode, wb); + requeue_io_wait(inode, wb); } } else if (inode->i_state & I_DIRTY) { /* @@ -478,8 +488,18 @@ writeback_single_inode(struct inode *ino * operations, such as delayed allocation during * submission or metadata updates after data IO * completion. + * + * For the latter case it is very important to give + * the inode another turn on b_more_io instead of + * redirtying it. Constantly moving dirtied_when + * forward will prevent us from ever writing out + * the metadata dirtied in the I/O completion handler. + * + * For files on XFS that constantly get appended to + * calling redirty_tail means they will never get + * their updated i_size written out. */ - redirty_tail(inode, wb); + requeue_io_wait(inode, wb); } else { /* * The inode is clean. At this point we either have @@ -600,7 +620,7 @@ static long writeback_sb_inodes(struct s * writeback is not making progress due to locked * buffers. Skip this inode for now. */ - redirty_tail(inode, wb); + requeue_io_wait(inode, wb); } spin_unlock(&inode->i_lock); spin_unlock(&wb->list_lock); @@ -637,7 +657,7 @@ static long __writeback_inodes_wb(struct * s_umount being grabbed by someone else. Don't use * requeue_io() to avoid busy retrying the inode/sb. 
*/ - redirty_tail(inode, wb); + requeue_io_wait(inode, wb); continue; } wrote += writeback_sb_inodes(sb, wb, work); @@ -720,10 +740,10 @@ static long wb_writeback(struct bdi_writ struct wb_writeback_work *work) { unsigned long wb_start = jiffies; - long nr_pages = work->nr_pages; unsigned long oldest_jif; struct inode *inode; long progress; + long total_progress = 0; oldest_jif = jiffies; work->older_than_this = &oldest_jif; @@ -753,11 +773,17 @@ static long wb_writeback(struct bdi_writ if (work->for_background && !over_bground_thresh(wb->bdi)) break; + /* + * Kupdate and background works are special and we want to + * include all inodes that need writing. Livelock avoidance is + * handled by these works yielding to any other work so we are + * safe. + */ if (work->for_kupdate) { oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); - work->older_than_this = &oldest_jif; - } + } else if (work->for_background) + oldest_jif = jiffies; trace_writeback_start(wb->bdi, work); if (list_empty(&wb->b_io)) @@ -767,6 +793,7 @@ static long wb_writeback(struct bdi_writ else progress = __writeback_inodes_wb(wb, work); trace_writeback_written(wb->bdi, work); + total_progress += progress; wb_update_bandwidth(wb, wb_start); @@ -800,7 +827,7 @@ static long wb_writeback(struct bdi_writ } spin_unlock(&wb->list_lock); - return nr_pages - work->nr_pages; + return total_progress; } /* @@ -863,7 +890,7 @@ static long wb_check_old_data_flush(stru expired = wb->last_old_flush + msecs_to_jiffies(dirty_writeback_interval * 10); - if (time_before(jiffies, expired)) + if (time_before(jiffies, expired) && list_empty(&wb->b_more_io_wait)) return 0; wb->last_old_flush = jiffies; @@ -934,7 +961,11 @@ int bdi_writeback_thread(void *data) { struct bdi_writeback *wb = data; struct backing_dev_info *bdi = wb->bdi; - long pages_written; + long progress; + unsigned int pause = 1; + unsigned int max_pause = dirty_writeback_interval ? 
+ msecs_to_jiffies(dirty_writeback_interval * 10) : + HZ; current->flags |= PF_SWAPWRITE; set_freezable(); @@ -954,12 +985,12 @@ int bdi_writeback_thread(void *data) */ del_timer(&wb->wakeup_timer); - pages_written = wb_do_writeback(wb, 0); + progress = wb_do_writeback(wb, 0); - trace_writeback_pages_written(pages_written); - - if (pages_written) + if (progress) { wb->last_active = jiffies; + pause = 1; + } set_current_state(TASK_INTERRUPTIBLE); if (!list_empty(&bdi->work_list) || kthread_should_stop()) { @@ -967,8 +998,11 @@ int bdi_writeback_thread(void *data) continue; } - if (wb_has_dirty_io(wb) && dirty_writeback_interval) - schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); + if (!list_empty(&wb->b_more_io_wait) && pause < max_pause) { + schedule_timeout(pause); + pause <<= 1; + } else if (wb_has_dirty_io(wb) && dirty_writeback_interval) + schedule_timeout(max_pause); else { /* * We have nothing to do, so can go sleep without any --- linux-next.orig/include/linux/backing-dev.h 2011-10-20 19:26:37.000000000 +0800 +++ linux-next/include/linux/backing-dev.h 2011-10-20 19:29:39.000000000 +0800 @@ -59,6 +59,7 @@ struct bdi_writeback { struct list_head b_dirty; /* dirty inodes */ struct list_head b_io; /* parked for writeback */ struct list_head b_more_io; /* parked for more writeback */ + struct list_head b_more_io_wait;/* opportunistic retry io */ spinlock_t list_lock; /* protects the b_* lists */ }; @@ -133,9 +134,10 @@ extern struct list_head bdi_pending_list static inline int wb_has_dirty_io(struct bdi_writeback *wb) { - return !list_empty(&wb->b_dirty) || - !list_empty(&wb->b_io) || - !list_empty(&wb->b_more_io); + return !list_empty(&wb->b_dirty) || + !list_empty(&wb->b_io) || + !list_empty(&wb->b_more_io) || + !list_empty(&wb->b_more_io_wait); } static inline void __add_bdi_stat(struct backing_dev_info *bdi, --- linux-next.orig/mm/backing-dev.c 2011-10-20 19:26:37.000000000 +0800 +++ linux-next/mm/backing-dev.c 2011-10-20 19:29:39.000000000 
+0800 @@ -74,10 +74,10 @@ static int bdi_debug_stats_show(struct s unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; - unsigned long nr_dirty, nr_io, nr_more_io; + unsigned long nr_dirty, nr_io, nr_more_io, nr_more_io_wait; struct inode *inode; - nr_dirty = nr_io = nr_more_io = 0; + nr_dirty = nr_io = nr_more_io = nr_more_io_wait = 0; spin_lock(&wb->list_lock); list_for_each_entry(inode, &wb->b_dirty, i_wb_list) nr_dirty++; @@ -85,6 +85,8 @@ static int bdi_debug_stats_show(struct s nr_io++; list_for_each_entry(inode, &wb->b_more_io, i_wb_list) nr_more_io++; + list_for_each_entry(inode, &wb->b_more_io_wait, i_wb_list) + nr_more_io_wait++; spin_unlock(&wb->list_lock); global_dirty_limits(&background_thresh, &dirty_thresh); @@ -103,6 +105,7 @@ static int bdi_debug_stats_show(struct s "b_dirty: %10lu\n" "b_io: %10lu\n" "b_more_io: %10lu\n" + "b_more_io_wait: %10lu\n" "bdi_list: %10u\n" "state: %10lx\n", (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), @@ -116,6 +119,7 @@ static int bdi_debug_stats_show(struct s nr_dirty, nr_io, nr_more_io, + nr_more_io_wait, !list_empty(&bdi->bdi_list), bdi->state); #undef K @@ -651,6 +655,7 @@ static void bdi_wb_init(struct bdi_write INIT_LIST_HEAD(&wb->b_dirty); INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_more_io); + INIT_LIST_HEAD(&wb->b_more_io_wait); spin_lock_init(&wb->list_lock); setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); } @@ -718,6 +723,7 @@ void bdi_destroy(struct backing_dev_info list_splice(&bdi->wb.b_dirty, &dst->b_dirty); list_splice(&bdi->wb.b_io, &dst->b_io); list_splice(&bdi->wb.b_more_io, &dst->b_more_io); + list_splice(&bdi->wb.b_more_io_wait, &dst->b_more_io_wait); spin_unlock(&bdi->wb.list_lock); spin_unlock(&dst->list_lock); } -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html