On Wed, Sep 30, 2009 at 07:55:39PM +0800, Jan Kara wrote:
> > writeback: let balance_dirty_pages() wait on background writeback
> >
> > CC: Chris Mason <chris.mason@xxxxxxxxxx>
> > CC: Dave Chinner <david@xxxxxxxxxxxxx>
> > CC: Jan Kara <jack@xxxxxxx>
> > CC: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
> > CC: Jens Axboe <jens.axboe@xxxxxxxxxx>
> > Signed-off-by: Wu Fengguang <fengguang.wu@xxxxxxxxx>
> > ---
> >  fs/fs-writeback.c           |   89 ++++++++++++++++++++++++++++++++--
> >  include/linux/backing-dev.h |   15 +++++
> >  mm/backing-dev.c            |    4 +
> >  mm/page-writeback.c         |   43 ++--------------
> >  4 files changed, 109 insertions(+), 42 deletions(-)
> >
> > --- linux.orig/mm/page-writeback.c	2009-09-28 19:01:40.000000000 +0800
> > +++ linux/mm/page-writeback.c	2009-09-28 19:02:48.000000000 +0800
> > @@ -218,6 +218,10 @@ static inline void __bdi_writeout_inc(st
> >  {
> >  	__prop_inc_percpu_max(&vm_completions, &bdi->completions,
> >  			      bdi->max_prop_frac);
> > +
> > +	if (atomic_read(&bdi->throttle_pages) < DIRTY_THROTTLE_PAGES_STOP &&
> > +	    atomic_dec_and_test(&bdi->throttle_pages))
> > +		bdi_writeback_wakeup(bdi);
> >  }
> >
> >  void bdi_writeout_inc(struct backing_dev_info *bdi)
> > @@ -458,20 +462,10 @@ static void balance_dirty_pages(struct a
> >  		unsigned long background_thresh;
> >  		unsigned long dirty_thresh;
> >  		unsigned long bdi_thresh;
> > -		unsigned long pages_written = 0;
> > -		unsigned long pause = 1;
> >  		int dirty_exceeded;
> >  		struct backing_dev_info *bdi = mapping->backing_dev_info;
> >
> >  		for (;;) {
> > -			struct writeback_control wbc = {
> > -				.bdi		= bdi,
> > -				.sync_mode	= WB_SYNC_NONE,
> > -				.older_than_this = NULL,
> > -				.nr_to_write	= write_chunk,
> > -				.range_cyclic	= 1,
> > -			};
> > -
> >  			nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
> >  					 global_page_state(NR_UNSTABLE_NFS);
> >  			nr_writeback = global_page_state(NR_WRITEBACK) +
> > @@ -518,31 +512,7 @@ static void balance_dirty_pages(struct a
> >  			if (!bdi->dirty_exceeded)
> >  				bdi->dirty_exceeded = 1;
> >
> > -			/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
> > -			 * Unstable writes are a feature of certain networked
> > -			 * filesystems (i.e. NFS) in which data may have been
> > -			 * written to the server's write cache, but has not yet
> > -			 * been flushed to permanent storage.
> > -			 * Only move pages to writeback if this bdi is over its
> > -			 * threshold otherwise wait until the disk writes catch
> > -			 * up.
> > -			 */
> > -			if (bdi_nr_reclaimable > bdi_thresh) {
> > -				writeback_inodes_wbc(&wbc);
> > -				pages_written += write_chunk - wbc.nr_to_write;
> > -				/* don't wait if we've done enough */
> > -				if (pages_written >= write_chunk)
> > -					break;
> > -			}
> > -			schedule_timeout_interruptible(pause);
> > -
> > -			/*
> > -			 * Increase the delay for each loop, up to our previous
> > -			 * default of taking a 100ms nap.
> > -			 */
> > -			pause <<= 1;
> > -			if (pause > HZ / 10)
> > -				pause = HZ / 10;
> > +			bdi_writeback_wait(bdi, write_chunk);

Added a "break;" line here: we can remove the loop now :)
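To illustrate (a rough sketch of the resulting shape, not the literal
patched function), the loop body now runs at most once:

	for (;;) {
		/* sample the global/bdi dirty counts and thresholds ... */
		/* ... and break out early if we are under the limits */

		if (!bdi->dirty_exceeded)
			bdi->dirty_exceeded = 1;

		/* block until enough writeback has completed on this bdi */
		bdi_writeback_wait(bdi, write_chunk);
		break;
	}

so the for (;;) can indeed go away.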
> >  		}
> >
> >  		if (!dirty_exceeded && bdi->dirty_exceeded)
> > @@ -559,8 +529,7 @@ static void balance_dirty_pages(struct a
> >  	 * In normal mode, we start background writeout at the lower
> >  	 * background_thresh, to keep the amount of dirty memory low.
> >  	 */
> > -	if ((laptop_mode && pages_written) ||
> > -	    (!laptop_mode && (nr_reclaimable > background_thresh)))
> > +	if (!laptop_mode && (nr_reclaimable > background_thresh))
> >  		bdi_start_writeback(bdi, NULL, 0);
> >  }
> >
> > --- linux.orig/include/linux/backing-dev.h	2009-09-28 18:52:51.000000000 +0800
> > +++ linux/include/linux/backing-dev.h	2009-09-28 19:02:45.000000000 +0800
> > @@ -86,6 +86,13 @@ struct backing_dev_info {
> >
> >  	struct list_head work_list;
> >
> > +	/*
> > +	 * dirtier process throttling
> > +	 */
> > +	spinlock_t		throttle_lock;
> > +	struct list_head	throttle_list;	/* nr to sync for each task */
> > +	atomic_t		throttle_pages;	/* nr to sync for head task */
> > +
> >  	struct device *dev;
> >
> >  #ifdef CONFIG_DEBUG_FS
> > @@ -94,6 +101,12 @@ struct backing_dev_info {
> >  #endif
> >  };
> >
> > +/*
> > + * when no task is throttled, set throttle_pages to larger than this,
> > + * to avoid unnecessary atomic decreases.
> > + */
> > +#define DIRTY_THROTTLE_PAGES_STOP	(1 << 22)
> > +
> >  int bdi_init(struct backing_dev_info *bdi);
> >  void bdi_destroy(struct backing_dev_info *bdi);
> >
> > @@ -105,6 +118,8 @@ void bdi_start_writeback(struct backing_
> >  			 long nr_pages);
> >  int bdi_writeback_task(struct bdi_writeback *wb);
> >  int bdi_has_dirty_io(struct backing_dev_info *bdi);
> > +int bdi_writeback_wakeup(struct backing_dev_info *bdi);
> > +void bdi_writeback_wait(struct backing_dev_info *bdi, long nr_pages);
> >
> >  extern spinlock_t bdi_lock;
> >  extern struct list_head bdi_list;
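An aside on the magnitude of that constant: DIRTY_THROTTLE_PAGES_STOP is
1 << 22 pages, i.e. about 16 GiB worth of 4 KiB pages, and even the doubled
"parked" value used when no task is throttled, 1 << 23, still fits within
the 24 bits that atomic_t has traditionally been guaranteed to provide
(which is also why ~0 is avoided, as discussed further below).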
> > --- linux.orig/fs/fs-writeback.c	2009-09-28 18:57:51.000000000 +0800
> > +++ linux/fs/fs-writeback.c	2009-09-28 19:02:45.000000000 +0800
> > @@ -25,6 +25,7 @@
> >  #include <linux/blkdev.h>
> >  #include <linux/backing-dev.h>
> >  #include <linux/buffer_head.h>
> > +#include <linux/completion.h>
> >  #include "internal.h"
> >
> >  #define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)
> > @@ -136,14 +137,14 @@ static void wb_work_complete(struct bdi_
> >  	call_rcu(&work->rcu_head, bdi_work_free);
> >  }
> >
> > -static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
> > +static void wb_clear_pending(struct backing_dev_info *bdi,
> > +			     struct bdi_work *work)
> >  {
> >  	/*
> >  	 * The caller has retrieved the work arguments from this work,
> >  	 * drop our reference. If this is the last ref, delete and free it
> >  	 */
> >  	if (atomic_dec_and_test(&work->pending)) {
> > -		struct backing_dev_info *bdi = wb->bdi;
> >
> >  		spin_lock(&bdi->wb_lock);
> >  		list_del_rcu(&work->list);
> > @@ -275,6 +276,81 @@ void bdi_start_writeback(struct backing_
> >  	bdi_alloc_queue_work(bdi, &args);
> >  }
> >
> > +struct dirty_throttle_task {
> > +	long			nr_pages;
> > +	struct list_head	list;
> > +	struct completion	complete;
> > +};
> > +
> > +void bdi_writeback_wait(struct backing_dev_info *bdi, long nr_pages)
> > +{
> > +	struct dirty_throttle_task tt = {
> > +		.nr_pages = nr_pages,
> > +		.complete = COMPLETION_INITIALIZER_ONSTACK(tt.complete),
> > +	};
> > +	struct wb_writeback_args args = {
> > +		.sync_mode	= WB_SYNC_NONE,
> > +		.nr_pages	= LONG_MAX,
> > +		.range_cyclic	= 1,
> > +		.for_background	= 1,
> > +	};
> > +	struct bdi_work work;
> > +
> > +	bdi_work_init(&work, &args);
> > +	work.state |= WS_ONSTACK;
> > +
> > +	/*
> > +	 * make sure we will be waken up by someone
> > +	 */
> > +	bdi_queue_work(bdi, &work);
>   This is wrong, you shouldn't submit the work like this because you'll
> have to wait for completion (wb_clear_pending below is just bogus). You
> should rather do bdi_start_writeback(bdi, NULL, 0).

No, I don't intend to wait for completion of this work (that would take
too long). This bdi work is to ensure writeback IO submissions are now in
progress. Thus __bdi_writeout_inc() will be called to decrease
bdi->throttle_pages, and when it counts down to 0, wake up this process.

The alternative way is to do

	if (no background work queued)
		bdi_start_writeback(bdi, NULL, 0)

It looks like a saner solution, thanks for the suggestion :)
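A rough sketch of that alternative, purely for illustration (this is not
what the patch above does, and it assumes a helper along the lines of
writeback_in_progress() for testing whether work is already queued on the
bdi):

	void bdi_writeback_wait(struct backing_dev_info *bdi, long nr_pages)
	{
		struct dirty_throttle_task tt = {
			.nr_pages = nr_pages,
			.complete = COMPLETION_INITIALIZER_ONSTACK(tt.complete),
		};

		/*
		 * Kick background writeback only if nothing is queued yet,
		 * instead of submitting a dedicated on-stack bdi_work.
		 */
		if (!writeback_in_progress(bdi))	/* assumed helper */
			bdi_start_writeback(bdi, NULL, 0);

		/* register the throttle pages, exactly as in the patch below */
		spin_lock(&bdi->throttle_lock);
		if (list_empty(&bdi->throttle_list))
			atomic_set(&bdi->throttle_pages, nr_pages);
		list_add(&tt.list, &bdi->throttle_list);
		spin_unlock(&bdi->throttle_lock);

		wait_for_completion(&tt.complete);
	}

With no on-stack work item to clean up, the wb_clear_pending() call below
would simply go away.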
> > +
> > +	/*
> > +	 * register throttle pages
> > +	 */
> > +	spin_lock(&bdi->throttle_lock);
> > +	if (list_empty(&bdi->throttle_list))
> > +		atomic_set(&bdi->throttle_pages, nr_pages);
> > +	list_add(&tt.list, &bdi->throttle_list);
> > +	spin_unlock(&bdi->throttle_lock);
> > +
> > +	wait_for_completion(&tt.complete);
> > +	wb_clear_pending(bdi, &work);	/* XXX */

For the above reason, I remove the work here and don't care whether it has
been executed, is still running, or was never seen at all. We have been
woken up.

Sorry, I definitely "misused" wb_clear_pending() for a slightly different
purpose: it doesn't really cancel the work if the work is already running.

So bdi_writeback_wait() achieves another goal of starting background
writeback if bdi-flush was previously idle.

> > +}
> > +
> > +/*
> > + * return 1 if there are more waiting tasks.
> > + */
> > +int bdi_writeback_wakeup(struct backing_dev_info *bdi)
> > +{
> > +	struct dirty_throttle_task *tt;
> > +
> > +	spin_lock(&bdi->throttle_lock);
> > +	/*
> > +	 * remove and wakeup head task
> > +	 */
> > +	if (!list_empty(&bdi->throttle_list)) {
> > +		tt = list_entry(bdi->throttle_list.prev,
> > +				struct dirty_throttle_task, list);
> > +		list_del(&tt->list);
> > +		complete(&tt->complete);
> > +	}
> > +	/*
> > +	 * update throttle pages
> > +	 */
> > +	if (!list_empty(&bdi->throttle_list)) {
> > +		tt = list_entry(bdi->throttle_list.prev,
> > +				struct dirty_throttle_task, list);
> > +		atomic_set(&bdi->throttle_pages, tt->nr_pages);
> > +	} else {
> > +		tt = NULL;
> > +		atomic_set(&bdi->throttle_pages, DIRTY_THROTTLE_PAGES_STOP * 2);
>   Why is here * 2?

Because we do a racy test in another place:

+	if (atomic_read(&bdi->throttle_pages) < DIRTY_THROTTLE_PAGES_STOP &&
+	    atomic_dec_and_test(&bdi->throttle_pages))
+		bdi_writeback_wakeup(bdi);

The *2 is for reducing the race possibility. It might still be racy, but
that's OK, because it's mainly an optimization.

It's perfectly correct if we simply do

+	if (atomic_dec_and_test(&bdi->throttle_pages))
+		bdi_writeback_wakeup(bdi);

> > +	}
> > +	spin_unlock(&bdi->throttle_lock);
> > +
> > +	return tt != NULL;
> > +}
> > +
> >  /*
> >   * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
> >   * furthest end of its superblock's dirty-inode list.
> > @@ -788,8 +864,11 @@ static long wb_writeback(struct bdi_writ
> >  		 * For background writeout, stop when we are below the
> >  		 * background dirty threshold
> >  		 */
> > -		if (args->for_background && !over_bground_thresh())
> > +		if (args->for_background && !over_bground_thresh()) {
> > +			while (bdi_writeback_wakeup(wb->bdi))
> > +				;	/* unthrottle all tasks */
> >  			break;
> > +		}
>   You probably didn't understand my comment in the previous email. This is
> too late to wake up all the tasks. There are two limits - background_limit
> (set to 5%) and dirty_limit (set to 10%). When the amount of dirty data is
> above background_limit, we start the writeback but we don't throttle tasks
> yet. We start throttling tasks only when the amount of dirty data on the
> bdi exceeds the part of the dirty limit belonging to the bdi. In case of a
> single bdi, this means we start throttling threads only when 10% of memory
> is dirty. To keep this behavior, we have to wake up waiting threads as soon
> as their BDI gets below the dirty limit or when the global number of dirty
> pages gets below (background_limit + dirty_limit) / 2.

Sure, but the design goal is to wake up the throttled tasks in the
__bdi_writeout_inc() path instead of here.

As long as some (background) writeback is running, __bdi_writeout_inc()
will be called to wake up the tasks.

This "unthrottle all on exit of background writeback" is merely a
safeguard: once background writeback (which could have been queued by the
throttled task itself, in bdi_writeback_wait()) exits, the calls to
__bdi_writeout_inc() are likely to stop.

> >
> >  		wbc.more_io = 0;
> >  		wbc.encountered_congestion = 0;
> > @@ -911,7 +990,7 @@ long wb_do_writeback(struct bdi_writebac
> >  		 * that we have seen this work and we are now starting it.
> >  		 */
> >  		if (args.sync_mode == WB_SYNC_NONE)
> > -			wb_clear_pending(wb, work);
> > +			wb_clear_pending(bdi, work);
> >
> >  		wrote += wb_writeback(wb, &args);
> >
> > @@ -920,7 +999,7 @@ long wb_do_writeback(struct bdi_writebac
> >  		 * notification when we have completed the work.
> >  		 */
> >  		if (args.sync_mode == WB_SYNC_ALL)
> > -			wb_clear_pending(wb, work);
> > +			wb_clear_pending(bdi, work);
> >  	}
> >
> >  	/*
> > --- linux.orig/mm/backing-dev.c	2009-09-28 18:52:18.000000000 +0800
> > +++ linux/mm/backing-dev.c	2009-09-28 19:02:45.000000000 +0800
> > @@ -645,6 +645,10 @@ int bdi_init(struct backing_dev_info *bd
> >  	bdi->wb_mask = 1;
> >  	bdi->wb_cnt = 1;
> >
> > +	spin_lock_init(&bdi->throttle_lock);
> > +	INIT_LIST_HEAD(&bdi->throttle_list);
> > +	atomic_set(&bdi->throttle_pages, DIRTY_THROTTLE_PAGES_STOP * 2);
> > +
>   Again, why is * 2 here? I'd just set DIRTY_THROTTLE_PAGES_STOP to some
> magic value (like ~0) and use it directly...

See above. ~0 is not used because atomic_t only promises 24 bits of data
space, and to reduce a small race :)

Thanks,
Fengguang

> >  	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
> >  		err = percpu_counter_init(&bdi->bdi_stat[i], 0);
> >  		if (err)
> >
> 									Honza
> --
> Jan Kara <jack@xxxxxxx>
> SUSE Labs, CR
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html