On Fri, May 22 2009, Jens Axboe wrote:
> Please try with this combined patch against what you are running now, it
> should resolve the issue. It needs a bit more work, but I'm running out
> of time today. I'll get it finalized, cleaned up, and integrated. Then
> I'll post a new revision of the patch set.

This one has been tested good and has a few more tweaks, so please try
that! It should be pretty close to final now; I will repost the series on
Monday.

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f80afaa..33357c3 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -50,6 +50,7 @@ struct bdi_work {
 
         unsigned long sb_data;
         unsigned long nr_pages;
+        enum writeback_sync_modes sync_mode;
 
         unsigned long state;
 };
@@ -65,19 +66,22 @@ static inline bool bdi_work_on_stack(struct bdi_work *work)
 }
 
 static inline void bdi_work_init(struct bdi_work *work, struct super_block *sb,
-                                 unsigned long nr_pages)
+                                 unsigned long nr_pages,
+                                 enum writeback_sync_modes sync_mode)
 {
         INIT_RCU_HEAD(&work->rcu_head);
         work->sb_data = (unsigned long) sb;
         work->nr_pages = nr_pages;
+        work->sync_mode = sync_mode;
         work->state = 0;
 }
 
 static inline void bdi_work_init_on_stack(struct bdi_work *work,
                                           struct super_block *sb,
-                                          unsigned long nr_pages)
+                                          unsigned long nr_pages,
+                                          enum writeback_sync_modes sync_mode)
 {
-        bdi_work_init(work, sb, nr_pages);
+        bdi_work_init(work, sb, nr_pages, sync_mode);
         set_bit(0, &work->state);
         work->sb_data |= 1UL;
 }
@@ -136,6 +140,9 @@ static void wb_start_writeback(struct bdi_writeback *wb, struct bdi_work *work)
                 wake_up(&wb->wait);
 }
 
+/*
+ * Add work to bdi work list.
+ */
 static int bdi_queue_writeback(struct backing_dev_info *bdi,
                                struct bdi_work *work)
 {
@@ -189,17 +196,17 @@ static void bdi_wait_on_work_start(struct bdi_work *work)
 }
 
 int bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
-                        long nr_pages)
+                        long nr_pages, enum writeback_sync_modes sync_mode)
 {
         struct bdi_work work_stack, *work;
         int ret;
 
         work = kmalloc(sizeof(*work), GFP_ATOMIC);
         if (work)
-                bdi_work_init(work, sb, nr_pages);
+                bdi_work_init(work, sb, nr_pages, sync_mode);
         else {
                 work = &work_stack;
-                bdi_work_init_on_stack(work, sb, nr_pages);
+                bdi_work_init_on_stack(work, sb, nr_pages, sync_mode);
         }
 
         ret = bdi_queue_writeback(bdi, work);
@@ -273,24 +280,31 @@ static long wb_kupdated(struct bdi_writeback *wb)
         return wrote;
 }
 
+static inline bool over_bground_thresh(void)
+{
+        unsigned long background_thresh, dirty_thresh;
+
+        get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+
+        return (global_page_state(NR_FILE_DIRTY) +
+                global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
+}
+
 static long __wb_writeback(struct bdi_writeback *wb, long nr_pages,
-                           struct super_block *sb)
+                           struct super_block *sb,
+                           enum writeback_sync_modes sync_mode)
 {
         struct writeback_control wbc = {
                 .bdi                    = wb->bdi,
-                .sync_mode              = WB_SYNC_NONE,
+                .sync_mode              = sync_mode,
                 .older_than_this        = NULL,
                 .range_cyclic           = 1,
         };
         long wrote = 0;
 
         for (;;) {
-                unsigned long background_thresh, dirty_thresh;
-
-                get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
-                if ((global_page_state(NR_FILE_DIRTY) +
-                    global_page_state(NR_UNSTABLE_NFS) < background_thresh) &&
-                    nr_pages <= 0)
+                if (sync_mode == WB_SYNC_NONE && nr_pages <= 0 &&
+                    !over_bground_thresh())
                         break;
 
                 wbc.more_io = 0;
@@ -345,9 +359,10 @@ static long wb_writeback(struct bdi_writeback *wb)
         while ((work = get_next_work_item(bdi, wb)) != NULL) {
                 struct super_block *sb = bdi_work_sb(work);
                 long nr_pages = work->nr_pages;
+                enum writeback_sync_modes sync_mode = work->sync_mode;
 
                 wb_clear_pending(wb, work);
-                wrote += __wb_writeback(wb, nr_pages, sb);
+                wrote += __wb_writeback(wb, nr_pages, sb, sync_mode);
         }
 
         return wrote;
@@ -420,39 +435,36 @@ int bdi_writeback_task(struct bdi_writeback *wb)
         return 0;
 }
 
-void bdi_writeback_all(struct super_block *sb, long nr_pages)
+/*
+ * Do in-line writeback for all backing devices. Expensive!
+ */
+void bdi_writeback_all(struct super_block *sb, long nr_pages,
+                       enum writeback_sync_modes sync_mode)
 {
-        struct list_head *entry = &bdi_list;
+        struct backing_dev_info *bdi, *tmp;
 
-        rcu_read_lock();
-
-        list_for_each_continue_rcu(entry, &bdi_list) {
-                struct backing_dev_info *bdi;
-                struct list_head *next;
-                struct bdi_work *work;
+        mutex_lock(&bdi_lock);
 
-                bdi = list_entry(entry, struct backing_dev_info, bdi_list);
+        list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
                 if (!bdi_has_dirty_io(bdi))
                         continue;
 
-                /*
-                 * If this allocation fails, we just wakeup the thread and
-                 * let it do kupdate writeback
-                 */
-                work = kmalloc(sizeof(*work), GFP_ATOMIC);
-                if (work)
-                        bdi_work_init(work, sb, nr_pages);
+                if (!bdi_wblist_needs_lock(bdi))
+                        __wb_writeback(&bdi->wb, 0, sb, sync_mode);
+                else {
+                        struct bdi_writeback *wb;
+                        int idx;
 
-                /*
-                 * Prepare to start from previous entry if this one gets moved
-                 * to the bdi_pending list.
-                 */
-                next = entry->prev;
-                if (bdi_queue_writeback(bdi, work))
-                        entry = next;
+                        idx = srcu_read_lock(&bdi->srcu);
+
+                        list_for_each_entry_rcu(wb, &bdi->wb_list, list)
+                                __wb_writeback(&bdi->wb, 0, sb, sync_mode);
+
+                        srcu_read_unlock(&bdi->srcu, idx);
+                }
         }
 
-        rcu_read_unlock();
+        mutex_unlock(&bdi_lock);
 }
 
 /*
@@ -972,9 +984,9 @@ void generic_sync_sb_inodes(struct super_block *sb,
                                 struct writeback_control *wbc)
 {
         if (wbc->bdi)
-                bdi_start_writeback(wbc->bdi, sb, 0);
+                generic_sync_bdi_inodes(sb, wbc);
         else
-                bdi_writeback_all(sb, 0);
+                bdi_writeback_all(sb, 0, wbc->sync_mode);
 
         if (wbc->sync_mode == WB_SYNC_ALL) {
                 struct inode *inode, *old_inode = NULL;
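As an aside, the new exit rule in __wb_writeback() is easy to model in
plain userspace C: WB_SYNC_NONE work stops once the explicit page budget
is spent and we are back under the background dirty threshold, while
WB_SYNC_ALL never takes the early exit. The numbers and helpers below are
made up for illustration, this is not kernel code:

/* Userspace model of the __wb_writeback() exit rule. */
#include <stdbool.h>
#include <stdio.h>

enum writeback_sync_modes { WB_SYNC_NONE, WB_SYNC_ALL };

static long nr_dirty = 4096;                    /* pretend dirty page count */
static const long background_thresh = 1024;     /* pretend threshold */

static bool over_bground_thresh(void)
{
        return nr_dirty >= background_thresh;
}

/* Pretend each pass cleans up to 1024 pages (think MAX_WRITEBACK_PAGES). */
static long write_chunk(void)
{
        long wrote = nr_dirty < 1024 ? nr_dirty : 1024;

        nr_dirty -= wrote;
        return wrote;
}

static long wb_model(long nr_pages, enum writeback_sync_modes sync_mode)
{
        long wrote = 0;

        for (;;) {
                /* same shape as the patched exit condition */
                if (sync_mode == WB_SYNC_NONE && nr_pages <= 0 &&
                    !over_bground_thresh())
                        break;

                long chunk = write_chunk();
                if (!chunk)
                        break;          /* nothing dirty left at all */

                wrote += chunk;
                nr_pages -= chunk;
        }

        return wrote;
}

int main(void)
{
        /* nr_pages == 0: pure background writeback, stops under thresh */
        printf("background pass wrote %ld pages\n", wb_model(0, WB_SYNC_NONE));
        return 0;
}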
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 7c2874f..0b20d4b 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -15,6 +15,7 @@
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/srcu.h>
+#include <linux/writeback.h>
 #include <asm/atomic.h>
 
 struct page;
@@ -60,7 +61,6 @@ struct bdi_writeback {
 #define BDI_MAX_FLUSHERS        32
 
 struct backing_dev_info {
-        struct rcu_head rcu_head;
         struct srcu_struct srcu; /* for wb_list read side protection */
         struct list_head bdi_list;
         unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
@@ -105,14 +105,15 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 void bdi_unregister(struct backing_dev_info *bdi);
 int bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
-                        long nr_pages);
+                        long nr_pages, enum writeback_sync_modes sync_mode);
 int bdi_writeback_task(struct bdi_writeback *wb);
-void bdi_writeback_all(struct super_block *sb, long nr_pages);
+void bdi_writeback_all(struct super_block *sb, long nr_pages,
+                       enum writeback_sync_modes sync_mode);
 void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
 void bdi_add_flusher_task(struct backing_dev_info *bdi);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
 
-extern spinlock_t bdi_lock;
+extern struct mutex bdi_lock;
 extern struct list_head bdi_list;
 
 static inline int wb_is_default_task(struct bdi_writeback *wb)
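The spinlock-to-mutex switch above is what makes the in-line writeback in
bdi_writeback_all() legal: the bdi list is now walked while actually
issuing I/O, and that sleeps. A spinlock holder must not sleep, a mutex
holder may. A tiny pthread model of the rule, stand-in names only, not
kernel code (build with cc file.c -lpthread):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t bdi_lock = PTHREAD_MUTEX_INITIALIZER;

static void __wb_writeback(int bdi)
{
        usleep(1000);           /* stands in for blocking on real I/O */
        printf("flushed bdi %d\n", bdi);
}

int main(void)
{
        pthread_mutex_lock(&bdi_lock);

        /* like list_for_each_entry_safe() over bdi_list: we may block
         * on every iteration, so the list lock must allow sleeping */
        for (int i = 0; i < 3; i++)
                __wb_writeback(i);

        pthread_mutex_unlock(&bdi_lock);
        return 0;
}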
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 60578bc..3ce3b57 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -26,7 +26,7 @@ struct backing_dev_info default_backing_dev_info = {
 EXPORT_SYMBOL_GPL(default_backing_dev_info);
 
 static struct class *bdi_class;
-DEFINE_SPINLOCK(bdi_lock);
+DEFINE_MUTEX(bdi_lock);
 LIST_HEAD(bdi_list);
 LIST_HEAD(bdi_pending_list);
 
@@ -360,14 +360,15 @@ static int bdi_start_fn(void *ptr)
          * Clear pending bit and wakeup anybody waiting to tear us down
          */
         clear_bit(BDI_pending, &bdi->state);
+        smp_mb__after_clear_bit();
         wake_up_bit(&bdi->state, BDI_pending);
 
         /*
          * Make us discoverable on the bdi_list again
          */
-        spin_lock_bh(&bdi_lock);
-        list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
-        spin_unlock_bh(&bdi_lock);
+        mutex_lock(&bdi_lock);
+        list_add_tail(&bdi->bdi_list, &bdi_list);
+        mutex_unlock(&bdi_lock);
 
         ret = bdi_writeback_task(wb);
 
@@ -419,15 +420,9 @@ static int bdi_forker_task(void *ptr)
         bdi_task_init(me->bdi, me);
 
         for (;;) {
-                struct backing_dev_info *bdi;
+                struct backing_dev_info *bdi, *tmp;
                 struct bdi_writeback *wb;
 
-                prepare_to_wait(&me->wait, &wait, TASK_INTERRUPTIBLE);
-
-                smp_mb();
-                if (list_empty(&bdi_pending_list))
-                        schedule();
-
                 /*
                  * Ideally we'd like not to see any dirty inodes on the
                  * default_backing_dev_info. Until these are tracked down,
@@ -438,19 +433,39 @@ static int bdi_forker_task(void *ptr)
                 if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
                         wb_do_writeback(me);
 
+                prepare_to_wait(&me->wait, &wait, TASK_INTERRUPTIBLE);
+
+                mutex_lock(&bdi_lock);
+
+                /*
+                 * Check if any existing bdi's have dirty data without
+                 * a thread registered. If so, set that up.
+                 */
+                list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
+                        if (!list_empty(&bdi->wb_list) ||
+                            !bdi_has_dirty_io(bdi))
+                                continue;
+
+                        bdi_add_default_flusher_task(bdi);
+                }
+
+                if (list_empty(&bdi_pending_list)) {
+                        unsigned long wait;
+
+                        mutex_unlock(&bdi_lock);
+                        wait = msecs_to_jiffies(dirty_writeback_interval * 10);
+                        schedule_timeout(wait);
+                        continue;
+                }
+
                 /*
                  * This is our real job - check for pending entries in
                  * bdi_pending_list, and create the tasks that got added
                  */
-repeat:
-                bdi = NULL;
-                spin_lock_bh(&bdi_lock);
-                if (!list_empty(&bdi_pending_list)) {
-                        bdi = list_entry(bdi_pending_list.next,
+                bdi = list_entry(bdi_pending_list.next,
                                  struct backing_dev_info, bdi_list);
-                        list_del_init(&bdi->bdi_list);
-                }
-                spin_unlock_bh(&bdi_lock);
+                list_del_init(&bdi->bdi_list);
+                mutex_unlock(&bdi_lock);
 
                 if (!bdi)
                         continue;
@@ -475,12 +490,11 @@ readd_flush:
                          * a chance to flush other bdi's to free
                          * memory.
                          */
-                        spin_lock_bh(&bdi_lock);
+                        mutex_lock(&bdi_lock);
                         list_add_tail(&bdi->bdi_list, &bdi_pending_list);
-                        spin_unlock_bh(&bdi_lock);
+                        mutex_unlock(&bdi_lock);
 
                         bdi_flush_io(bdi);
-                        goto repeat;
                 }
         }
 
@@ -489,25 +503,8 @@ readd_flush:
 }
 
 /*
- * Grace period has now ended, init bdi->bdi_list and add us to the
- * list of bdi's that are pending for task creation. Wake up
- * bdi_forker_task() to finish the job and add us back to the
- * active bdi_list.
+ * bdi_lock held on entry
  */
-static void bdi_add_to_pending(struct rcu_head *head)
-{
-        struct backing_dev_info *bdi;
-
-        bdi = container_of(head, struct backing_dev_info, rcu_head);
-        INIT_LIST_HEAD(&bdi->bdi_list);
-
-        spin_lock(&bdi_lock);
-        list_add_tail(&bdi->bdi_list, &bdi_pending_list);
-        spin_unlock(&bdi_lock);
-
-        wake_up(&default_backing_dev_info.wb.wait);
-}
-
 static void bdi_add_one_flusher_task(struct backing_dev_info *bdi,
                                      int (*func)(struct backing_dev_info *))
 {
@@ -526,24 +523,22 @@ static void bdi_add_one_flusher_task(struct backing_dev_info *bdi,
          * waiting for previous additions to finish.
          */
         if (!func(bdi)) {
-                spin_lock_bh(&bdi_lock);
-                list_del_rcu(&bdi->bdi_list);
-                spin_unlock_bh(&bdi_lock);
+                list_move_tail(&bdi->bdi_list, &bdi_pending_list);
 
                 /*
-                 * We need to wait for the current grace period to end,
-                 * in case others were browsing the bdi_list as well.
-                 * So defer the adding and wakeup to after the RCU
-                 * grace period has ended.
+                 * We are now on the pending list, wake up bdi_forker_task()
+                 * to finish the job and add us back to the active bdi_list
                  */
-                call_rcu(&bdi->rcu_head, bdi_add_to_pending);
+                wake_up(&default_backing_dev_info.wb.wait);
         }
 }
 
 static int flusher_add_helper_block(struct backing_dev_info *bdi)
 {
+        mutex_unlock(&bdi_lock);
         wait_on_bit_lock(&bdi->state, BDI_pending, bdi_sched_wait,
                                 TASK_UNINTERRUPTIBLE);
+        mutex_lock(&bdi_lock);
         return 0;
 }
 
@@ -571,7 +566,9 @@ void bdi_add_default_flusher_task(struct backing_dev_info *bdi)
  */
 void bdi_add_flusher_task(struct backing_dev_info *bdi)
 {
+        mutex_lock(&bdi_lock);
         bdi_add_one_flusher_task(bdi, flusher_add_helper_block);
+        mutex_unlock(&bdi_lock);
 }
 EXPORT_SYMBOL(bdi_add_flusher_task);
 
@@ -593,6 +590,14 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
                 goto exit;
         }
 
+        mutex_lock(&bdi_lock);
+        list_add_tail(&bdi->bdi_list, &bdi_list);
+        mutex_unlock(&bdi_lock);
+
+        bdi->dev = dev;
+        bdi_debug_register(bdi, dev_name(dev));
+        set_bit(BDI_registered, &bdi->state);
+
         /*
          * Just start the forker thread for our default backing_dev_info,
          * and add other bdi's to the list. They will get a thread created
@@ -616,14 +621,6 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
                 }
         }
 
-        spin_lock_bh(&bdi_lock);
-        list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
-        spin_unlock_bh(&bdi_lock);
-
-        bdi->dev = dev;
-        bdi_debug_register(bdi, dev_name(dev));
-        set_bit(BDI_registered, &bdi->state);
-
 exit:
         return ret;
 }
@@ -655,15 +652,9 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
         /*
          * Make sure nobody finds us on the bdi_list anymore
          */
-        spin_lock_bh(&bdi_lock);
-        list_del_rcu(&bdi->bdi_list);
-        spin_unlock_bh(&bdi_lock);
-
-        /*
-         * Now make sure that anybody who is currently looking at us from
-         * the bdi_list iteration have exited.
-         */
-        synchronize_rcu();
+        mutex_lock(&bdi_lock);
+        list_del(&bdi->bdi_list);
+        mutex_unlock(&bdi_lock);
 
         /*
          * Finally, kill the kernel threads. We don't need to be RCU
@@ -689,7 +680,6 @@ int bdi_init(struct backing_dev_info *bdi)
 {
         int i, err;
 
-        INIT_RCU_HEAD(&bdi->rcu_head);
         bdi->dev = NULL;
 
         bdi->min_ratio = 0;
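One pattern above worth spelling out is flusher_add_helper_block():
bdi_add_flusher_task() now enters it with bdi_lock held, but sleeping on
BDI_pending with the mutex held would stall everybody else on the lock,
so the helper drops it across the wait and retakes it before returning.
A pthread sketch of the same shape, hypothetical names, not kernel code;
the condvar/mutex pair also stands in for the ordering that clear_bit()
+ smp_mb__after_clear_bit() + wake_up_bit() provides in bdi_start_fn().
Build with cc file.c -lpthread:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t bdi_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t state_cv = PTHREAD_COND_INITIALIZER;
static bool bdi_pending = true;         /* stands in for the BDI_pending bit */

/* Called with bdi_lock held, returns with bdi_lock held, but must not
 * keep it across a potentially long sleep. */
static int flusher_add_helper_block(void)
{
        pthread_mutex_unlock(&bdi_lock);

        pthread_mutex_lock(&state_lock);
        while (bdi_pending)             /* like wait_on_bit_lock(), minus
                                           re-setting the bit afterwards */
                pthread_cond_wait(&state_cv, &state_lock);
        pthread_mutex_unlock(&state_lock);

        pthread_mutex_lock(&bdi_lock);  /* caller expects it held again */
        return 0;
}

/* The flusher thread side: clear the pending state and wake waiters. */
static void *clear_pending(void *arg)
{
        (void) arg;
        pthread_mutex_lock(&state_lock);
        bdi_pending = false;
        pthread_cond_broadcast(&state_cv);
        pthread_mutex_unlock(&state_lock);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_mutex_lock(&bdi_lock);
        pthread_create(&t, NULL, clear_pending, NULL);
        flusher_add_helper_block();
        pthread_mutex_unlock(&bdi_lock);
        pthread_join(t, NULL);
        printf("pending cleared, flusher registered\n");
        return 0;
}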
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index de3178a..7dd7de7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -313,9 +313,8 @@ static unsigned int bdi_min_ratio;
 int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
 {
         int ret = 0;
-        unsigned long flags;
 
-        spin_lock_irqsave(&bdi_lock, flags);
+        mutex_lock(&bdi_lock);
         if (min_ratio > bdi->max_ratio) {
                 ret = -EINVAL;
         } else {
@@ -327,27 +326,26 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
                         ret = -EINVAL;
                 }
         }
-        spin_unlock_irqrestore(&bdi_lock, flags);
+        mutex_unlock(&bdi_lock);
 
         return ret;
 }
 
 int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
 {
-        unsigned long flags;
         int ret = 0;
 
         if (max_ratio > 100)
                 return -EINVAL;
 
-        spin_lock_irqsave(&bdi_lock, flags);
+        mutex_lock(&bdi_lock);
         if (bdi->min_ratio > max_ratio) {
                 ret = -EINVAL;
         } else {
                 bdi->max_ratio = max_ratio;
                 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
         }
-        spin_unlock_irqrestore(&bdi_lock, flags);
+        mutex_unlock(&bdi_lock);
 
         return ret;
 }
@@ -581,7 +579,7 @@ static void balance_dirty_pages(struct address_space *mapping)
             (!laptop_mode && (global_page_state(NR_FILE_DIRTY)
                                + global_page_state(NR_UNSTABLE_NFS)
                                           > background_thresh)))
-                bdi_start_writeback(bdi, NULL, 0);
+                bdi_start_writeback(bdi, NULL, 0, WB_SYNC_NONE);
 }
 
 void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -674,7 +672,7 @@ void wakeup_flusher_threads(long nr_pages)
         if (nr_pages == 0)
                 nr_pages = global_page_state(NR_FILE_DIRTY) +
                                 global_page_state(NR_UNSTABLE_NFS);
-        bdi_writeback_all(NULL, nr_pages);
+        bdi_writeback_all(NULL, nr_pages, WB_SYNC_NONE);
 }
 
 static void laptop_timer_fn(unsigned long unused);
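Both ratio setters above now take the same sleeping lock, and dropping
the irqsave variant is fine because bdi_lock is no longer taken from
interrupt context. A simplified userspace model of just the invariant
they enforce, 0 <= min_ratio <= max_ratio <= 100, with stand-in types;
the real functions also maintain a global minimum sum and the proportion
fraction. Build with cc file.c -lpthread:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct bdi {
        unsigned int min_ratio;
        unsigned int max_ratio;
};

static pthread_mutex_t bdi_lock = PTHREAD_MUTEX_INITIALIZER;

static int bdi_set_min_ratio(struct bdi *bdi, unsigned int min_ratio)
{
        int ret = 0;

        pthread_mutex_lock(&bdi_lock);
        if (min_ratio > bdi->max_ratio)
                ret = -EINVAL;          /* would cross the current max */
        else
                bdi->min_ratio = min_ratio;
        pthread_mutex_unlock(&bdi_lock);
        return ret;
}

static int bdi_set_max_ratio(struct bdi *bdi, unsigned int max_ratio)
{
        int ret = 0;

        if (max_ratio > 100)
                return -EINVAL;

        pthread_mutex_lock(&bdi_lock);
        if (bdi->min_ratio > max_ratio)
                ret = -EINVAL;          /* would cross the current min */
        else
                bdi->max_ratio = max_ratio;
        pthread_mutex_unlock(&bdi_lock);
        return ret;
}

int main(void)
{
        struct bdi bdi = { .min_ratio = 0, .max_ratio = 100 };

        printf("set max 50: %d\n", bdi_set_max_ratio(&bdi, 50));
        printf("set min 60: %d\n", bdi_set_min_ratio(&bdi, 60)); /* -EINVAL */
        return 0;
}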
-- 
Jens Axboe