On Sun, Jun 19, 2011 at 11:35:36PM +0800, Christoph Hellwig wrote: > I'd rather see this earlier in the series, before the write bandwith > estimation ones, as it's groundwork independ of those changes. OK, done. > > + if (wrote) { > > + if (jiffies - start_time > HZ / 10UL) > > + break; > > + if (work->nr_pages <= 0) > > + break; > > + } > > This code probably wants some comments. It's duplicated tests as in writeback_sb_inodes(), which have the comments /* * bail out to wb_writeback() often enough to check * background threshold and other termination conditions. */ I'll add a simple comment here: /* refer to the same tests at the end of writeback_sb_inodes */ > > static void bdi_flush_io(struct backing_dev_info *bdi) > > { > > - struct writeback_control wbc = { > > - .sync_mode = WB_SYNC_NONE, > > - .older_than_this = NULL, > > - .range_cyclic = 1, > > - .nr_to_write = 1024, > > - }; > > - > > - writeback_inodes_wb(&bdi->wb, &wbc); > > + writeback_inodes_wb(&bdi->wb, 1024); > > } > > At this point you could probably also kill the bdi_flush_io wrapper, and > just call writeback_inodes_wb directly. Fair enough. > A comment on the 1024 pages to > write would be nice, if you remember it from poking the code. I can't > find any good explanation for it offhand. I comment it with Hopefully 1024 is large enough for efficient IO. Thanks, Fengguang --- Subject: writeback: make writeback_control.nr_to_write straight Date: Wed May 04 19:54:37 CST 2011 Pass struct wb_writeback_work all the way down to writeback_sb_inodes(), and initialize the struct writeback_control there. struct writeback_control is basically designed to control writeback of a single file, but we keep abuse it for writing multiple files in writeback_sb_inodes() and its callers. It immediately clean things up, e.g. suddenly wbc.nr_to_write vs work->nr_pages starts to make sense, and instead of saving and restoring pages_skipped in writeback_sb_inodes it can always start with a clean zero value. It also makes a neat IO pattern change: large dirty files are now written in the full 4MB writeback chunk size, rather than whatever remained quota in wbc->nr_to_write. Acked-by: Jan Kara <jack@xxxxxxx> Proposed-by: Christoph Hellwig <hch@xxxxxxxxxxxxx> Signed-off-by: Wu Fengguang <fengguang.wu@xxxxxxxxx> --- fs/btrfs/extent_io.c | 2 fs/fs-writeback.c | 196 ++++++++++++++++------------- include/linux/writeback.h | 6 include/trace/events/writeback.h | 39 ++++- mm/backing-dev.c | 17 -- mm/page-writeback.c | 17 -- 6 files changed, 148 insertions(+), 129 deletions(-) change set: - move writeback_control from wb_writeback() down to writeback_sb_inodes() - change return value of writeback_sb_inodes()/__writeback_inodes_wb() to the number of pages and/or inodes written - move writeback_control.older_than_this to struct wb_writeback_work - remove writeback_control.inodes_written - remove wbc_writeback_* trace events for now --- linux-next.orig/fs/fs-writeback.c 2011-06-19 23:57:41.000000000 +0800 +++ linux-next/fs/fs-writeback.c 2011-06-20 00:08:00.000000000 +0800 @@ -30,11 +30,21 @@ #include "internal.h" /* + * The maximum number of pages to writeout in a single bdi flush/kupdate + * operation. We do this so we don't hold I_SYNC against an inode for + * enormous amounts of time, which would block a userspace task which has + * been forced to throttle against that inode. Also, the code reevaluates + * the dirty each time it has written this many pages. + */ +#define MAX_WRITEBACK_PAGES 1024L + +/* * Passed into wb_writeback(), essentially a subset of writeback_control */ struct wb_writeback_work { long nr_pages; struct super_block *sb; + unsigned long *older_than_this; enum writeback_sync_modes sync_mode; unsigned int tagged_writepages:1; unsigned int for_kupdate:1; @@ -472,7 +482,6 @@ writeback_single_inode(struct inode *ino * No need to add it back to the LRU. */ list_del_init(&inode->i_wb_list); - wbc->inodes_written++; } } inode_sync_complete(inode); @@ -506,6 +515,31 @@ static bool pin_sb_for_writeback(struct return false; } +static long writeback_chunk_size(struct wb_writeback_work *work) +{ + long pages; + + /* + * WB_SYNC_ALL mode does livelock avoidance by syncing dirty + * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX + * here avoids calling into writeback_inodes_wb() more than once. + * + * The intended call sequence for WB_SYNC_ALL writeback is: + * + * wb_writeback() + * writeback_sb_inodes() <== called only once + * write_cache_pages() <== called once for each inode + * (quickly) tag currently dirty pages + * (maybe slowly) sync all tagged pages + */ + if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) + pages = LONG_MAX; + else + pages = min(MAX_WRITEBACK_PAGES, work->nr_pages); + + return pages; +} + /* * Write a portion of b_io inodes which belong to @sb. * @@ -513,18 +547,30 @@ static bool pin_sb_for_writeback(struct * inodes. Otherwise write only ones which go sequentially * in reverse order. * - * Return 1, if the caller writeback routine should be - * interrupted. Otherwise return 0. + * Return the number of pages and/or inodes written. */ -static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, - struct writeback_control *wbc, bool only_this_sb) +static long writeback_sb_inodes(struct super_block *sb, + struct bdi_writeback *wb, + struct wb_writeback_work *work) { + struct writeback_control wbc = { + .sync_mode = work->sync_mode, + .tagged_writepages = work->tagged_writepages, + .for_kupdate = work->for_kupdate, + .for_background = work->for_background, + .range_cyclic = work->range_cyclic, + .range_start = 0, + .range_end = LLONG_MAX, + }; + unsigned long start_time = jiffies; + long write_chunk; + long wrote = 0; /* count both pages and inodes */ + while (!list_empty(&wb->b_io)) { - long pages_skipped; struct inode *inode = wb_inode(wb->b_io.prev); if (inode->i_sb != sb) { - if (only_this_sb) { + if (work->sb) { /* * We only want to write back data for this * superblock, move all inodes not belonging @@ -539,7 +585,7 @@ static int writeback_sb_inodes(struct su * Bounce back to the caller to unpin this and * pin the next superblock. */ - return 0; + break; } /* @@ -553,12 +599,18 @@ static int writeback_sb_inodes(struct su requeue_io(inode, wb); continue; } - __iget(inode); + write_chunk = writeback_chunk_size(work); + wbc.nr_to_write = write_chunk; + wbc.pages_skipped = 0; + + writeback_single_inode(inode, wb, &wbc); - pages_skipped = wbc->pages_skipped; - writeback_single_inode(inode, wb, wbc); - if (wbc->pages_skipped != pages_skipped) { + work->nr_pages -= write_chunk - wbc.nr_to_write; + wrote += write_chunk - wbc.nr_to_write; + if (!(inode->i_state & I_DIRTY)) + wrote++; + if (wbc.pages_skipped) { /* * writeback is not making progress due to locked * buffers. Skip this inode for now. @@ -570,17 +622,25 @@ static int writeback_sb_inodes(struct su iput(inode); cond_resched(); spin_lock(&wb->list_lock); - if (wbc->nr_to_write <= 0) - return 1; + /* + * bail out to wb_writeback() often enough to check + * background threshold and other termination conditions. + */ + if (wrote) { + if (jiffies - start_time > HZ / 10UL) + break; + if (work->nr_pages <= 0) + break; + } } - /* b_io is empty */ - return 1; + return wrote; } -static void __writeback_inodes_wb(struct bdi_writeback *wb, - struct writeback_control *wbc) +static long __writeback_inodes_wb(struct bdi_writeback *wb, + struct wb_writeback_work *work) { - int ret = 0; + unsigned long start_time = jiffies; + long wrote = 0; while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); @@ -590,33 +650,37 @@ static void __writeback_inodes_wb(struct requeue_io(inode, wb); continue; } - ret = writeback_sb_inodes(sb, wb, wbc, false); + wrote += writeback_sb_inodes(sb, wb, work); drop_super(sb); - if (ret) - break; + /* refer to the same tests at the end of writeback_sb_inodes */ + if (wrote) { + if (jiffies - start_time > HZ / 10UL) + break; + if (work->nr_pages <= 0) + break; + } } /* Leave any unwritten inodes on b_io */ + return wrote; } -void writeback_inodes_wb(struct bdi_writeback *wb, - struct writeback_control *wbc) +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages) { + struct wb_writeback_work work = { + .nr_pages = nr_pages, + .sync_mode = WB_SYNC_NONE, + .range_cyclic = 1, + }; + spin_lock(&wb->list_lock); if (list_empty(&wb->b_io)) - queue_io(wb, wbc->older_than_this); - __writeback_inodes_wb(wb, wbc); + queue_io(wb, NULL); + __writeback_inodes_wb(wb, &work); spin_unlock(&wb->list_lock); -} -/* - * The maximum number of pages to writeout in a single bdi flush/kupdate - * operation. We do this so we don't hold I_SYNC against an inode for - * enormous amounts of time, which would block a userspace task which has - * been forced to throttle against that inode. Also, the code reevaluates - * the dirty each time it has written this many pages. - */ -#define MAX_WRITEBACK_PAGES 1024 + return nr_pages - work.nr_pages; +} static inline bool over_bground_thresh(void) { @@ -646,42 +710,13 @@ static inline bool over_bground_thresh(v static long wb_writeback(struct bdi_writeback *wb, struct wb_writeback_work *work) { - struct writeback_control wbc = { - .sync_mode = work->sync_mode, - .tagged_writepages = work->tagged_writepages, - .older_than_this = NULL, - .for_kupdate = work->for_kupdate, - .for_background = work->for_background, - .range_cyclic = work->range_cyclic, - }; + long nr_pages = work->nr_pages; unsigned long oldest_jif; - long wrote = 0; - long write_chunk = MAX_WRITEBACK_PAGES; struct inode *inode; - - if (!wbc.range_cyclic) { - wbc.range_start = 0; - wbc.range_end = LLONG_MAX; - } - - /* - * WB_SYNC_ALL mode does livelock avoidance by syncing dirty - * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX - * here avoids calling into writeback_inodes_wb() more than once. - * - * The intended call sequence for WB_SYNC_ALL writeback is: - * - * wb_writeback() - * writeback_sb_inodes() <== called only once - * write_cache_pages() <== called once for each inode - * (quickly) tag currently dirty pages - * (maybe slowly) sync all tagged pages - */ - if (wbc.sync_mode == WB_SYNC_ALL || wbc.tagged_writepages) - write_chunk = LONG_MAX; + long progress; oldest_jif = jiffies; - wbc.older_than_this = &oldest_jif; + work->older_than_this = &oldest_jif; spin_lock(&wb->list_lock); for (;;) { @@ -711,24 +746,17 @@ static long wb_writeback(struct bdi_writ if (work->for_kupdate) { oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); - wbc.older_than_this = &oldest_jif; + work->older_than_this = &oldest_jif; } - wbc.nr_to_write = write_chunk; - wbc.pages_skipped = 0; - wbc.inodes_written = 0; - - trace_wbc_writeback_start(&wbc, wb->bdi); + trace_writeback_start(wb->bdi, work); if (list_empty(&wb->b_io)) - queue_io(wb, wbc.older_than_this); + queue_io(wb, work->older_than_this); if (work->sb) - writeback_sb_inodes(work->sb, wb, &wbc, true); + progress = writeback_sb_inodes(work->sb, wb, work); else - __writeback_inodes_wb(wb, &wbc); - trace_wbc_writeback_written(&wbc, wb->bdi); - - work->nr_pages -= write_chunk - wbc.nr_to_write; - wrote += write_chunk - wbc.nr_to_write; + progress = __writeback_inodes_wb(wb, work); + trace_writeback_written(wb->bdi, work); /* * Did we write something? Try for more @@ -738,9 +766,7 @@ static long wb_writeback(struct bdi_writ * mean the overall work is done. So we keep looping as long * as made some progress on cleaning pages or inodes. */ - if (wbc.nr_to_write < write_chunk) - continue; - if (wbc.inodes_written) + if (progress) continue; /* * No more inodes for IO, bail @@ -753,8 +779,8 @@ static long wb_writeback(struct bdi_writ * we'll just busyloop. */ if (!list_empty(&wb->b_more_io)) { + trace_writeback_wait(wb->bdi, work); inode = wb_inode(wb->b_more_io.prev); - trace_wbc_writeback_wait(&wbc, wb->bdi); spin_lock(&inode->i_lock); inode_wait_for_writeback(inode, wb); spin_unlock(&inode->i_lock); @@ -762,7 +788,7 @@ static long wb_writeback(struct bdi_writ } spin_unlock(&wb->list_lock); - return wrote; + return nr_pages - work->nr_pages; } /* --- linux-next.orig/include/linux/writeback.h 2011-06-19 23:57:41.000000000 +0800 +++ linux-next/include/linux/writeback.h 2011-06-20 00:03:25.000000000 +0800 @@ -24,12 +24,9 @@ enum writeback_sync_modes { */ struct writeback_control { enum writeback_sync_modes sync_mode; - unsigned long *older_than_this; /* If !NULL, only write back inodes - older than this */ long nr_to_write; /* Write this many pages, and decrement this for each page written */ long pages_skipped; /* Pages which were not written */ - long inodes_written; /* # of inodes written (at least) */ /* * For a_ops->writepages(): is start or end are non-zero then this is @@ -56,8 +53,7 @@ void writeback_inodes_sb_nr(struct super int writeback_inodes_sb_if_idle(struct super_block *); int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr); void sync_inodes_sb(struct super_block *); -void writeback_inodes_wb(struct bdi_writeback *wb, - struct writeback_control *wbc); +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages); long wb_do_writeback(struct bdi_writeback *wb, int force_wait); void wakeup_flusher_threads(long nr_pages); --- linux-next.orig/mm/backing-dev.c 2011-06-19 23:57:41.000000000 +0800 +++ linux-next/mm/backing-dev.c 2011-06-20 00:13:48.000000000 +0800 @@ -260,18 +260,6 @@ int bdi_has_dirty_io(struct backing_dev_ return wb_has_dirty_io(&bdi->wb); } -static void bdi_flush_io(struct backing_dev_info *bdi) -{ - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .older_than_this = NULL, - .range_cyclic = 1, - .nr_to_write = 1024, - }; - - writeback_inodes_wb(&bdi->wb, &wbc); -} - /* * kupdated() used to do this. We cannot do it from the bdi_forker_thread() * or we risk deadlocking on ->s_umount. The longer term solution would be @@ -457,9 +445,10 @@ static int bdi_forker_thread(void *ptr) if (IS_ERR(task)) { /* * If thread creation fails, force writeout of - * the bdi from the thread. + * the bdi from the thread. Hopefully 1024 is + * large enough for efficient IO. */ - bdi_flush_io(bdi); + writeback_inodes_wb(&bdi->wb, 1024); } else { /* * The spinlock makes sure we do not lose --- linux-next.orig/mm/page-writeback.c 2011-06-19 23:58:14.000000000 +0800 +++ linux-next/mm/page-writeback.c 2011-06-20 00:03:25.000000000 +0800 @@ -492,13 +492,6 @@ static void balance_dirty_pages(struct a struct backing_dev_info *bdi = mapping->backing_dev_info; for (;;) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .older_than_this = NULL, - .nr_to_write = write_chunk, - .range_cyclic = 1, - }; - nr_reclaimable = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS); nr_writeback = global_page_state(NR_WRITEBACK); @@ -560,17 +553,17 @@ static void balance_dirty_pages(struct a * threshold otherwise wait until the disk writes catch * up. */ - trace_wbc_balance_dirty_start(&wbc, bdi); + trace_balance_dirty_start(bdi); if (bdi_nr_reclaimable > bdi_thresh) { - writeback_inodes_wb(&bdi->wb, &wbc); - pages_written += write_chunk - wbc.nr_to_write; - trace_wbc_balance_dirty_written(&wbc, bdi); + pages_written += writeback_inodes_wb(&bdi->wb, + write_chunk); + trace_balance_dirty_written(bdi, pages_written); if (pages_written >= write_chunk) break; /* We've done our duty */ } - trace_wbc_balance_dirty_wait(&wbc, bdi); __set_current_state(TASK_UNINTERRUPTIBLE); io_schedule_timeout(pause); + trace_balance_dirty_wait(bdi); /* * Increase the delay for each loop, up to our previous --- linux-next.orig/include/trace/events/writeback.h 2011-06-19 23:58:12.000000000 +0800 +++ linux-next/include/trace/events/writeback.h 2011-06-19 23:58:23.000000000 +0800 @@ -62,6 +62,9 @@ DEFINE_EVENT(writeback_work_class, name, DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread); DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); +DEFINE_WRITEBACK_WORK_EVENT(writeback_start); +DEFINE_WRITEBACK_WORK_EVENT(writeback_written); +DEFINE_WRITEBACK_WORK_EVENT(writeback_wait); TRACE_EVENT(writeback_pages_written, TP_PROTO(long pages_written), @@ -101,6 +104,30 @@ DEFINE_WRITEBACK_EVENT(writeback_bdi_reg DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); DEFINE_WRITEBACK_EVENT(writeback_thread_start); DEFINE_WRITEBACK_EVENT(writeback_thread_stop); +DEFINE_WRITEBACK_EVENT(balance_dirty_start); +DEFINE_WRITEBACK_EVENT(balance_dirty_wait); + +TRACE_EVENT(balance_dirty_written, + + TP_PROTO(struct backing_dev_info *bdi, int written), + + TP_ARGS(bdi, written), + + TP_STRUCT__entry( + __array(char, name, 32) + __field(int, written) + ), + + TP_fast_assign( + strncpy(__entry->name, dev_name(bdi->dev), 32); + __entry->written = written; + ), + + TP_printk("bdi %s written %d", + __entry->name, + __entry->written + ) +); DECLARE_EVENT_CLASS(wbc_class, TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), @@ -114,7 +141,6 @@ DECLARE_EVENT_CLASS(wbc_class, __field(int, for_background) __field(int, for_reclaim) __field(int, range_cyclic) - __field(unsigned long, older_than_this) __field(long, range_start) __field(long, range_end) ), @@ -128,14 +154,12 @@ DECLARE_EVENT_CLASS(wbc_class, __entry->for_background = wbc->for_background; __entry->for_reclaim = wbc->for_reclaim; __entry->range_cyclic = wbc->range_cyclic; - __entry->older_than_this = wbc->older_than_this ? - *wbc->older_than_this : 0; __entry->range_start = (long)wbc->range_start; __entry->range_end = (long)wbc->range_end; ), TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " - "bgrd=%d reclm=%d cyclic=%d older=0x%lx " + "bgrd=%d reclm=%d cyclic=%d " "start=0x%lx end=0x%lx", __entry->name, __entry->nr_to_write, @@ -145,7 +169,6 @@ DECLARE_EVENT_CLASS(wbc_class, __entry->for_background, __entry->for_reclaim, __entry->range_cyclic, - __entry->older_than_this, __entry->range_start, __entry->range_end) ) @@ -154,12 +177,6 @@ DECLARE_EVENT_CLASS(wbc_class, DEFINE_EVENT(wbc_class, name, \ TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \ TP_ARGS(wbc, bdi)) -DEFINE_WBC_EVENT(wbc_writeback_start); -DEFINE_WBC_EVENT(wbc_writeback_written); -DEFINE_WBC_EVENT(wbc_writeback_wait); -DEFINE_WBC_EVENT(wbc_balance_dirty_start); -DEFINE_WBC_EVENT(wbc_balance_dirty_written); -DEFINE_WBC_EVENT(wbc_balance_dirty_wait); DEFINE_WBC_EVENT(wbc_writepage); TRACE_EVENT(writeback_queue_io, --- linux-next.orig/fs/btrfs/extent_io.c 2011-06-19 23:57:40.000000000 +0800 +++ linux-next/fs/btrfs/extent_io.c 2011-06-19 23:58:23.000000000 +0800 @@ -2551,7 +2551,6 @@ int extent_write_full_page(struct extent }; struct writeback_control wbc_writepages = { .sync_mode = wbc->sync_mode, - .older_than_this = NULL, .nr_to_write = 64, .range_start = page_offset(page) + PAGE_CACHE_SIZE, .range_end = (loff_t)-1, @@ -2584,7 +2583,6 @@ int extent_write_locked_range(struct ext }; struct writeback_control wbc_writepages = { .sync_mode = mode, - .older_than_this = NULL, .nr_to_write = nr_pages * 2, .range_start = start, .range_end = end + 1, -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html