On Sat, 2009-05-23 at 21:15 +0200, Jens Axboe wrote:
> On Fri, May 22 2009, Jens Axboe wrote:
> > Please try with this combined patch against what you are running now, it
> > should resolve the issue. It needs a bit more work, but I'm running out
> > of time today. I'll get it finalized, cleaned up, and integrated. Then
> > I'll post a new revision of the patch set.
>
> This one has been tested good and has a few more tweaks. So please try
> that! It should be pretty close to final now, will repost the series on
> monday.

I ran the workload 10 times and didn't trigger it, so the bug is fixed.

yanmin

>
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index f80afaa..33357c3 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -50,6 +50,7 @@ struct bdi_work {
>
>         unsigned long sb_data;
>         unsigned long nr_pages;
> +       enum writeback_sync_modes sync_mode;
>
>         unsigned long state;
>  };
> @@ -65,19 +66,22 @@ static inline bool bdi_work_on_stack(struct bdi_work *work)
>  }
>
>  static inline void bdi_work_init(struct bdi_work *work, struct super_block *sb,
> -                                unsigned long nr_pages)
> +                                unsigned long nr_pages,
> +                                enum writeback_sync_modes sync_mode)
>  {
>         INIT_RCU_HEAD(&work->rcu_head);
>         work->sb_data = (unsigned long) sb;
>         work->nr_pages = nr_pages;
> +       work->sync_mode = sync_mode;
>         work->state = 0;
>  }
>
>  static inline void bdi_work_init_on_stack(struct bdi_work *work,
>                                           struct super_block *sb,
> -                                         unsigned long nr_pages)
> +                                         unsigned long nr_pages,
> +                                         enum writeback_sync_modes sync_mode)
>  {
> -       bdi_work_init(work, sb, nr_pages);
> +       bdi_work_init(work, sb, nr_pages, sync_mode);
>         set_bit(0, &work->state);
>         work->sb_data |= 1UL;
>  }
> @@ -136,6 +140,9 @@ static void wb_start_writeback(struct bdi_writeback *wb, struct bdi_work *work)
>                 wake_up(&wb->wait);
>  }
>
> +/*
> + * Add work to bdi work list.
> + */
>  static int bdi_queue_writeback(struct backing_dev_info *bdi,
>                                struct bdi_work *work)
>  {
> @@ -189,17 +196,17 @@ static void bdi_wait_on_work_start(struct bdi_work *work)
>  }
>
>  int bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
> -                       long nr_pages)
> +                       long nr_pages, enum writeback_sync_modes sync_mode)
>  {
>         struct bdi_work work_stack, *work;
>         int ret;
>
>         work = kmalloc(sizeof(*work), GFP_ATOMIC);
>         if (work)
> -               bdi_work_init(work, sb, nr_pages);
> +               bdi_work_init(work, sb, nr_pages, sync_mode);
>         else {
>                 work = &work_stack;
> -               bdi_work_init_on_stack(work, sb, nr_pages);
> +               bdi_work_init_on_stack(work, sb, nr_pages, sync_mode);
>         }
>
>         ret = bdi_queue_writeback(bdi, work);
> @@ -273,24 +280,31 @@ static long wb_kupdated(struct bdi_writeback *wb)
>         return wrote;
>  }
>
> +static inline bool over_bground_thresh(void)
> +{
> +       unsigned long background_thresh, dirty_thresh;
> +
> +       get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
> +
> +       return (global_page_state(NR_FILE_DIRTY) +
> +               global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
> +}
> +
>  static long __wb_writeback(struct bdi_writeback *wb, long nr_pages,
> -                          struct super_block *sb)
> +                          struct super_block *sb,
> +                          enum writeback_sync_modes sync_mode)
>  {
>         struct writeback_control wbc = {
>                 .bdi                    = wb->bdi,
> -               .sync_mode              = WB_SYNC_NONE,
> +               .sync_mode              = sync_mode,
>                 .older_than_this        = NULL,
>                 .range_cyclic           = 1,
>         };
>         long wrote = 0;
>
>         for (;;) {
> -               unsigned long background_thresh, dirty_thresh;
> -
> -               get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
> -               if ((global_page_state(NR_FILE_DIRTY) +
> -                   global_page_state(NR_UNSTABLE_NFS) < background_thresh) &&
> -                   nr_pages <= 0)
> +               if (sync_mode == WB_SYNC_NONE && nr_pages <= 0 &&
> +                   !over_bground_thresh())
>                         break;
>
>                 wbc.more_io = 0;
> @@ -345,9 +359,10 @@ static long wb_writeback(struct bdi_writeback *wb)
>         while ((work = get_next_work_item(bdi, wb)) != NULL) {
>                 struct super_block *sb = bdi_work_sb(work);
>                 long nr_pages = work->nr_pages;
> +               enum writeback_sync_modes sync_mode = work->sync_mode;
>
>                 wb_clear_pending(wb, work);
> -               wrote += __wb_writeback(wb, nr_pages, sb);
> +               wrote += __wb_writeback(wb, nr_pages, sb, sync_mode);
>         }
>
>         return wrote;
> @@ -420,39 +435,36 @@ int bdi_writeback_task(struct bdi_writeback *wb)
>         return 0;
>  }
>
> -void bdi_writeback_all(struct super_block *sb, long nr_pages)
> +/*
> + * Do in-line writeback for all backing devices. Expensive!
> + */
> +void bdi_writeback_all(struct super_block *sb, long nr_pages,
> +                      enum writeback_sync_modes sync_mode)
>  {
> -       struct list_head *entry = &bdi_list;
> +       struct backing_dev_info *bdi, *tmp;
>
> -       rcu_read_lock();
> -
> -       list_for_each_continue_rcu(entry, &bdi_list) {
> -               struct backing_dev_info *bdi;
> -               struct list_head *next;
> -               struct bdi_work *work;
> +       mutex_lock(&bdi_lock);
>
> -               bdi = list_entry(entry, struct backing_dev_info, bdi_list);
> +       list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
>                 if (!bdi_has_dirty_io(bdi))
>                         continue;
>
> -               /*
> -                * If this allocation fails, we just wakeup the thread and
> -                * let it do kupdate writeback
> -                */
> -               work = kmalloc(sizeof(*work), GFP_ATOMIC);
> -               if (work)
> -                       bdi_work_init(work, sb, nr_pages);
> +               if (!bdi_wblist_needs_lock(bdi))
> +                       __wb_writeback(&bdi->wb, 0, sb, sync_mode);
> +               else {
> +                       struct bdi_writeback *wb;
> +                       int idx;
>
> -               /*
> -                * Prepare to start from previous entry if this one gets moved
> -                * to the bdi_pending list.
> -                */
> -               next = entry->prev;
> -               if (bdi_queue_writeback(bdi, work))
> -                       entry = next;
> +                       idx = srcu_read_lock(&bdi->srcu);
> +
> +                       list_for_each_entry_rcu(wb, &bdi->wb_list, list)
> +                               __wb_writeback(&bdi->wb, 0, sb, sync_mode);
> +
> +                       srcu_read_unlock(&bdi->srcu, idx);
> +               }
>         }
>
> -       rcu_read_unlock();
> +       mutex_unlock(&bdi_lock);
>  }
>
>  /*
> @@ -972,9 +984,9 @@ void generic_sync_sb_inodes(struct super_block *sb,
>                                 struct writeback_control *wbc)
>  {
>         if (wbc->bdi)
> -               bdi_start_writeback(wbc->bdi, sb, 0);
> +               generic_sync_bdi_inodes(sb, wbc);
>         else
> -               bdi_writeback_all(sb, 0);
> +               bdi_writeback_all(sb, 0, wbc->sync_mode);
>
>         if (wbc->sync_mode == WB_SYNC_ALL) {
>                 struct inode *inode, *old_inode = NULL;
> diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
> index 7c2874f..0b20d4b 100644
> --- a/include/linux/backing-dev.h
> +++ b/include/linux/backing-dev.h
> @@ -15,6 +15,7 @@
>  #include <linux/fs.h>
>  #include <linux/sched.h>
>  #include <linux/srcu.h>
> +#include <linux/writeback.h>
>  #include <asm/atomic.h>
>
>  struct page;
> @@ -60,7 +61,6 @@ struct bdi_writeback {
>  #define BDI_MAX_FLUSHERS        32
>
>  struct backing_dev_info {
> -       struct rcu_head rcu_head;
>         struct srcu_struct srcu; /* for wb_list read side protection */
>         struct list_head bdi_list;
>         unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
> @@ -105,14 +105,15 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
>  int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
>  void bdi_unregister(struct backing_dev_info *bdi);
>  int bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
> -                        long nr_pages);
> +                        long nr_pages, enum writeback_sync_modes sync_mode);
>  int bdi_writeback_task(struct bdi_writeback *wb);
> -void bdi_writeback_all(struct super_block *sb, long nr_pages);
> +void bdi_writeback_all(struct super_block *sb, long nr_pages,
> +                       enum writeback_sync_modes sync_mode);
>  void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
>  void bdi_add_flusher_task(struct backing_dev_info *bdi);
>  int bdi_has_dirty_io(struct backing_dev_info *bdi);
>
> -extern spinlock_t bdi_lock;
> +extern struct mutex bdi_lock;
>  extern struct list_head bdi_list;
>
>  static inline int wb_is_default_task(struct bdi_writeback *wb)
> diff --git a/mm/backing-dev.c b/mm/backing-dev.c
> index 60578bc..3ce3b57 100644
> --- a/mm/backing-dev.c
> +++ b/mm/backing-dev.c
> @@ -26,7 +26,7 @@ struct backing_dev_info default_backing_dev_info = {
>  EXPORT_SYMBOL_GPL(default_backing_dev_info);
>
>  static struct class *bdi_class;
> -DEFINE_SPINLOCK(bdi_lock);
> +DEFINE_MUTEX(bdi_lock);
>  LIST_HEAD(bdi_list);
>  LIST_HEAD(bdi_pending_list);
>
> @@ -360,14 +360,15 @@ static int bdi_start_fn(void *ptr)
>          * Clear pending bit and wakeup anybody waiting to tear us down
>          */
>         clear_bit(BDI_pending, &bdi->state);
> +       smp_mb__after_clear_bit();
>         wake_up_bit(&bdi->state, BDI_pending);
>
>         /*
>          * Make us discoverable on the bdi_list again
>          */
> -       spin_lock_bh(&bdi_lock);
> -       list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
> -       spin_unlock_bh(&bdi_lock);
> +       mutex_lock(&bdi_lock);
> +       list_add_tail(&bdi->bdi_list, &bdi_list);
> +       mutex_unlock(&bdi_lock);
>
>         ret = bdi_writeback_task(wb);
>
> @@ -419,15 +420,9 @@ static int bdi_forker_task(void *ptr)
>         bdi_task_init(me->bdi, me);
>
>         for (;;) {
> -               struct backing_dev_info *bdi;
> +               struct backing_dev_info *bdi, *tmp;
>                 struct bdi_writeback *wb;
>
> -               prepare_to_wait(&me->wait, &wait, TASK_INTERRUPTIBLE);
> -
> -               smp_mb();
> -
> -               if (list_empty(&bdi_pending_list))
> -                       schedule();
> -
>                 /*
>                  * Ideally we'd like not to see any dirty inodes on the
>                  * default_backing_dev_info. Until these are tracked down,
> @@ -438,19 +433,39 @@ static int bdi_forker_task(void *ptr)
>                 if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
>                         wb_do_writeback(me);
>
> +               prepare_to_wait(&me->wait, &wait, TASK_INTERRUPTIBLE);
> +
> +               mutex_lock(&bdi_lock);
> +
> +               /*
> +                * Check if any existing bdi's have dirty data without
> +                * a thread registered. If so, set that up.
> +                */
> +               list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
> +                       if (!list_empty(&bdi->wb_list) ||
> +                           !bdi_has_dirty_io(bdi))
> +                               continue;
> +
> +                       bdi_add_default_flusher_task(bdi);
> +               }
> +
> +               if (list_empty(&bdi_pending_list)) {
> +                       unsigned long wait;
> +
> +                       mutex_unlock(&bdi_lock);
> +                       wait = msecs_to_jiffies(dirty_writeback_interval * 10);
> +                       schedule_timeout(wait);
> +                       continue;
> +               }
> +
>                 /*
>                  * This is our real job - check for pending entries in
>                  * bdi_pending_list, and create the tasks that got added
>                  */
> -repeat:
> -               bdi = NULL;
> -               spin_lock_bh(&bdi_lock);
> -               if (!list_empty(&bdi_pending_list)) {
> -                       bdi = list_entry(bdi_pending_list.next,
> +               bdi = list_entry(bdi_pending_list.next,
>                                  struct backing_dev_info, bdi_list);
> -                       list_del_init(&bdi->bdi_list);
> -               }
> -               spin_unlock_bh(&bdi_lock);
> +               list_del_init(&bdi->bdi_list);
> +               mutex_unlock(&bdi_lock);
>
>                 if (!bdi)
>                         continue;
> @@ -475,12 +490,11 @@ readd_flush:
>                          * a chance to flush other bdi's to free
>                          * memory.
>                          */
> -                       spin_lock_bh(&bdi_lock);
> +                       mutex_lock(&bdi_lock);
>                         list_add_tail(&bdi->bdi_list, &bdi_pending_list);
> -                       spin_unlock_bh(&bdi_lock);
> +                       mutex_unlock(&bdi_lock);
>
>                         bdi_flush_io(bdi);
> -                       goto repeat;
>                 }
>         }
>
> @@ -489,25 +503,8 @@ readd_flush:
>  }
>
>  /*
> - * Grace period has now ended, init bdi->bdi_list and add us to the
> - * list of bdi's that are pending for task creation. Wake up
> - * bdi_forker_task() to finish the job and add us back to the
> - * active bdi_list.
> + * bdi_lock held on entry
>   */
> -static void bdi_add_to_pending(struct rcu_head *head)
> -{
> -       struct backing_dev_info *bdi;
> -
> -       bdi = container_of(head, struct backing_dev_info, rcu_head);
> -       INIT_LIST_HEAD(&bdi->bdi_list);
> -
> -       spin_lock(&bdi_lock);
> -       list_add_tail(&bdi->bdi_list, &bdi_pending_list);
> -       spin_unlock(&bdi_lock);
> -
> -       wake_up(&default_backing_dev_info.wb.wait);
> -}
> -
>  static void bdi_add_one_flusher_task(struct backing_dev_info *bdi,
>                                      int(*func)(struct backing_dev_info *))
>  {
> @@ -526,24 +523,22 @@ static void bdi_add_one_flusher_task(struct backing_dev_info *bdi,
>          * waiting for previous additions to finish.
>          */
>         if (!func(bdi)) {
> -               spin_lock_bh(&bdi_lock);
> -               list_del_rcu(&bdi->bdi_list);
> -               spin_unlock_bh(&bdi_lock);
> +               list_move_tail(&bdi->bdi_list, &bdi_pending_list);
>
>                 /*
> -                * We need to wait for the current grace period to end,
> -                * in case others were browsing the bdi_list as well.
> -                * So defer the adding and wakeup to after the RCU
> -                * grace period has ended.
> +                * We are now on the pending list, wake up bdi_forker_task()
> +                * to finish the job and add us back to the active bdi_list
>                  */
> -               call_rcu(&bdi->rcu_head, bdi_add_to_pending);
> +               wake_up(&default_backing_dev_info.wb.wait);
>         }
>  }
>
>  static int flusher_add_helper_block(struct backing_dev_info *bdi)
>  {
> +       mutex_unlock(&bdi_lock);
>         wait_on_bit_lock(&bdi->state, BDI_pending, bdi_sched_wait,
>                                 TASK_UNINTERRUPTIBLE);
> +       mutex_lock(&bdi_lock);
>         return 0;
>  }
>
> @@ -571,7 +566,9 @@ void bdi_add_default_flusher_task(struct backing_dev_info *bdi)
>   */
>  void bdi_add_flusher_task(struct backing_dev_info *bdi)
>  {
> +       mutex_lock(&bdi_lock);
>         bdi_add_one_flusher_task(bdi, flusher_add_helper_block);
> +       mutex_unlock(&bdi_lock);
>  }
>  EXPORT_SYMBOL(bdi_add_flusher_task);
>
> @@ -593,6 +590,14 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
>                 goto exit;
>         }
>
> +       mutex_lock(&bdi_lock);
> +       list_add_tail(&bdi->bdi_list, &bdi_list);
> +       mutex_unlock(&bdi_lock);
> +
> +       bdi->dev = dev;
> +       bdi_debug_register(bdi, dev_name(dev));
> +       set_bit(BDI_registered, &bdi->state);
> +
>         /*
>          * Just start the forker thread for our default backing_dev_info,
>          * and add other bdi's to the list. They will get a thread created
> @@ -616,14 +621,6 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
>                 }
>         }
>
> -       spin_lock_bh(&bdi_lock);
> -       list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
> -       spin_unlock_bh(&bdi_lock);
> -
> -       bdi->dev = dev;
> -       bdi_debug_register(bdi, dev_name(dev));
> -       set_bit(BDI_registered, &bdi->state);
> -
>  exit:
>         return ret;
>  }
> @@ -655,15 +652,9 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
>         /*
>          * Make sure nobody finds us on the bdi_list anymore
>          */
> -       spin_lock_bh(&bdi_lock);
> -       list_del_rcu(&bdi->bdi_list);
> -       spin_unlock_bh(&bdi_lock);
> -
> -       /*
> -        * Now make sure that anybody who is currently looking at us from
> -        * the bdi_list iteration have exited.
> -        */
> -       synchronize_rcu();
> +       mutex_lock(&bdi_lock);
> +       list_del(&bdi->bdi_list);
> +       mutex_unlock(&bdi_lock);
>
>         /*
>          * Finally, kill the kernel threads. We don't need to be RCU
> @@ -689,7 +680,6 @@ int bdi_init(struct backing_dev_info *bdi)
>  {
>         int i, err;
>
> -       INIT_RCU_HEAD(&bdi->rcu_head);
>         bdi->dev = NULL;
>
>         bdi->min_ratio = 0;
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index de3178a..7dd7de7 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -313,9 +313,8 @@ static unsigned int bdi_min_ratio;
>  int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
>  {
>         int ret = 0;
> -       unsigned long flags;
>
> -       spin_lock_irqsave(&bdi_lock, flags);
> +       mutex_lock(&bdi_lock);
>         if (min_ratio > bdi->max_ratio) {
>                 ret = -EINVAL;
>         } else {
> @@ -327,27 +326,26 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
>                         ret = -EINVAL;
>                 }
>         }
> -       spin_unlock_irqrestore(&bdi_lock, flags);
> +       mutex_unlock(&bdi_lock);
>
>         return ret;
>  }
>
>  int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
>  {
> -       unsigned long flags;
>         int ret = 0;
>
>         if (max_ratio > 100)
>                 return -EINVAL;
>
> -       spin_lock_irqsave(&bdi_lock, flags);
> +       mutex_lock(&bdi_lock);
>         if (bdi->min_ratio > max_ratio) {
>                 ret = -EINVAL;
>         } else {
>                 bdi->max_ratio = max_ratio;
>                 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
>         }
> -       spin_unlock_irqrestore(&bdi_lock, flags);
> +       mutex_unlock(&bdi_lock);
>
>         return ret;
>  }
> @@ -581,7 +579,7 @@ static void balance_dirty_pages(struct address_space *mapping)
>                     (!laptop_mode && (global_page_state(NR_FILE_DIRTY)
>                                        + global_page_state(NR_UNSTABLE_NFS)
>                                           > background_thresh)))
> -                       bdi_start_writeback(bdi, NULL, 0);
> +                       bdi_start_writeback(bdi, NULL, 0, WB_SYNC_NONE);
>  }
>
>  void set_page_dirty_balance(struct page *page, int page_mkwrite)
> @@ -674,7 +672,7 @@ void wakeup_flusher_threads(long nr_pages)
>         if (nr_pages == 0)
>                 nr_pages = global_page_state(NR_FILE_DIRTY) +
>                                 global_page_state(NR_UNSTABLE_NFS);
> -       bdi_writeback_all(NULL, nr_pages);
> +       bdi_writeback_all(NULL, nr_pages, WB_SYNC_NONE);
>  }
>
>  static void laptop_timer_fn(unsigned long unused);
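A note for anyone following the behavioural change rather than the locking: with the __wb_writeback() hunk above, the break at the top of the loop now fires only when the work is WB_SYNC_NONE, its nr_pages budget is spent, and the system is no longer over the background dirty threshold, so WB_SYNC_ALL requests are not cut short by that check. Below is a minimal user-space sketch of just that predicate; writeback_should_stop() and its parameters are illustrative stand-ins, not the kernel helpers.

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for the kernel's enum writeback_sync_modes. */
enum writeback_sync_modes { WB_SYNC_NONE, WB_SYNC_ALL };

/*
 * Mirrors the new break condition in __wb_writeback(): stop only when the
 * work is WB_SYNC_NONE, its page budget is exhausted, and the system is no
 * longer over the background dirty threshold.
 */
static bool writeback_should_stop(enum writeback_sync_modes sync_mode,
                                  long nr_pages, bool over_bground)
{
        return sync_mode == WB_SYNC_NONE && nr_pages <= 0 && !over_bground;
}

int main(void)
{
        /* Data-integrity (WB_SYNC_ALL) work is never cut short by this check. */
        printf("%d\n", writeback_should_stop(WB_SYNC_ALL, 0, false));  /* 0 */
        /* Background work stops once its budget is gone and dirty memory
         * has dropped back under the background threshold. */
        printf("%d\n", writeback_should_stop(WB_SYNC_NONE, 0, false)); /* 1 */
        printf("%d\n", writeback_should_stop(WB_SYNC_NONE, 0, true));  /* 0 */
        return 0;
}

This also lines up with how the callers now pass the mode explicitly: balance_dirty_pages() and wakeup_flusher_threads() request WB_SYNC_NONE, while the sync path hands wbc->sync_mode down through bdi_writeback_all() to __wb_writeback().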