On Thu, Apr 21, 2011 at 03:17:57PM +0800, Christoph Hellwig wrote: > Here's the inode_wb_list_lock splitup against current mainline: So quick! I'll carry it and collect some numbers in my tests btw. Thanks, Fengguang > --- > From: Christoph Hellwig <hch@xxxxxx> > Subject: [PATCH] writeback: split inode_wb_list_lock > > Split the global inode_wb_list_lock into a per-bdi_writeback list_lock, > as it's currently the most contended lock in the system for metadata > heavy workloads. I won't help for single-filesystem workloads for > which we'll need the I/O-less balance_dirty_pages, but at least we > can dedicate a cpu to spinning on each bdi now for larger systems. > > Based on earlier patches from Nick Piggin and Dave Chinner. > > Signed-off-by: Christoph Hellwig <hch@xxxxxx> > > Index: linux-2.6/fs/fs-writeback.c > =================================================================== > --- linux-2.6.orig/fs/fs-writeback.c 2011-04-21 08:31:44.512334499 +0200 > +++ linux-2.6/fs/fs-writeback.c 2011-04-21 09:07:05.327511722 +0200 > @@ -180,12 +180,13 @@ void bdi_start_background_writeback(stru > */ > void inode_wb_list_del(struct inode *inode) > { > - spin_lock(&inode_wb_list_lock); > + struct backing_dev_info *bdi = inode_to_bdi(inode); > + > + spin_lock(&bdi->wb.list_lock); > list_del_init(&inode->i_wb_list); > - spin_unlock(&inode_wb_list_lock); > + spin_unlock(&bdi->wb.list_lock); > } > > - > /* > * Redirty an inode: set its when-it-was dirtied timestamp and move it to the > * furthest end of its superblock's dirty-inode list. > @@ -195,11 +196,9 @@ void inode_wb_list_del(struct inode *ino > * the case then the inode must have been redirtied while it was being written > * out and we don't reset its dirtied_when. > */ > -static void redirty_tail(struct inode *inode) > +static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) > { > - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; > - > - assert_spin_locked(&inode_wb_list_lock); > + assert_spin_locked(&wb->list_lock); > if (!list_empty(&wb->b_dirty)) { > struct inode *tail; > > @@ -213,11 +212,9 @@ static void redirty_tail(struct inode *i > /* > * requeue inode for re-scanning after bdi->b_io list is exhausted. > */ > -static void requeue_io(struct inode *inode) > +static void requeue_io(struct inode *inode, struct bdi_writeback *wb) > { > - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; > - > - assert_spin_locked(&inode_wb_list_lock); > + assert_spin_locked(&wb->list_lock); > list_move(&inode->i_wb_list, &wb->b_more_io); > } > > @@ -225,7 +222,7 @@ static void inode_sync_complete(struct i > { > /* > * Prevent speculative execution through > - * spin_unlock(&inode_wb_list_lock); > + * spin_unlock(&wb->list_lock); > */ > > smp_mb(); > @@ -301,7 +298,7 @@ static void move_expired_inodes(struct l > */ > static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) > { > - assert_spin_locked(&inode_wb_list_lock); > + assert_spin_locked(&wb->list_lock); > list_splice_init(&wb->b_more_io, &wb->b_io); > move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); > } > @@ -316,7 +313,8 @@ static int write_inode(struct inode *ino > /* > * Wait for writeback on an inode to complete. > */ > -static void inode_wait_for_writeback(struct inode *inode) > +static void inode_wait_for_writeback(struct inode *inode, > + struct bdi_writeback *wb) > { > DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); > wait_queue_head_t *wqh; > @@ -324,15 +322,15 @@ static void inode_wait_for_writeback(str > wqh = bit_waitqueue(&inode->i_state, __I_SYNC); > while (inode->i_state & I_SYNC) { > spin_unlock(&inode->i_lock); > - spin_unlock(&inode_wb_list_lock); > + spin_unlock(&wb->list_lock); > __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); > - spin_lock(&inode_wb_list_lock); > + spin_lock(&wb->list_lock); > spin_lock(&inode->i_lock); > } > } > > /* > - * Write out an inode's dirty pages. Called under inode_wb_list_lock and > + * Write out an inode's dirty pages. Called under wb->list_lock and > * inode->i_lock. Either the caller has an active reference on the inode or > * the inode has I_WILL_FREE set. > * > @@ -343,13 +341,14 @@ static void inode_wait_for_writeback(str > * livelocks, etc. > */ > static int > -writeback_single_inode(struct inode *inode, struct writeback_control *wbc) > +writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, > + struct writeback_control *wbc) > { > struct address_space *mapping = inode->i_mapping; > unsigned dirty; > int ret; > > - assert_spin_locked(&inode_wb_list_lock); > + assert_spin_locked(&wb->list_lock); > assert_spin_locked(&inode->i_lock); > > if (!atomic_read(&inode->i_count)) > @@ -367,14 +366,14 @@ writeback_single_inode(struct inode *ino > * completed a full scan of b_io. > */ > if (wbc->sync_mode != WB_SYNC_ALL) { > - requeue_io(inode); > + requeue_io(inode, wb); > return 0; > } > > /* > * It's a data-integrity sync. We must wait. > */ > - inode_wait_for_writeback(inode); > + inode_wait_for_writeback(inode, wb); > } > > BUG_ON(inode->i_state & I_SYNC); > @@ -383,7 +382,7 @@ writeback_single_inode(struct inode *ino > inode->i_state |= I_SYNC; > inode->i_state &= ~I_DIRTY_PAGES; > spin_unlock(&inode->i_lock); > - spin_unlock(&inode_wb_list_lock); > + spin_unlock(&wb->list_lock); > > ret = do_writepages(mapping, wbc); > > @@ -414,7 +413,7 @@ writeback_single_inode(struct inode *ino > ret = err; > } > > - spin_lock(&inode_wb_list_lock); > + spin_lock(&wb->list_lock); > spin_lock(&inode->i_lock); > inode->i_state &= ~I_SYNC; > if (!(inode->i_state & I_FREEING)) { > @@ -428,7 +427,7 @@ writeback_single_inode(struct inode *ino > /* > * slice used up: queue for next turn > */ > - requeue_io(inode); > + requeue_io(inode, wb); > } else { > /* > * Writeback blocked by something other than > @@ -437,7 +436,7 @@ writeback_single_inode(struct inode *ino > * retrying writeback of the dirty page/inode > * that cannot be performed immediately. > */ > - redirty_tail(inode); > + redirty_tail(inode, wb); > } > } else if (inode->i_state & I_DIRTY) { > /* > @@ -446,7 +445,7 @@ writeback_single_inode(struct inode *ino > * submission or metadata updates after data IO > * completion. > */ > - redirty_tail(inode); > + redirty_tail(inode, wb); > } else { > /* > * The inode is clean. At this point we either have > @@ -510,7 +509,7 @@ static int writeback_sb_inodes(struct su > * superblock, move all inodes not belonging > * to it back onto the dirty list. > */ > - redirty_tail(inode); > + redirty_tail(inode, wb); > continue; > } > > @@ -530,7 +529,7 @@ static int writeback_sb_inodes(struct su > spin_lock(&inode->i_lock); > if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { > spin_unlock(&inode->i_lock); > - requeue_io(inode); > + requeue_io(inode, wb); > continue; > } > > @@ -546,19 +545,19 @@ static int writeback_sb_inodes(struct su > __iget(inode); > > pages_skipped = wbc->pages_skipped; > - writeback_single_inode(inode, wbc); > + writeback_single_inode(inode, wb, wbc); > if (wbc->pages_skipped != pages_skipped) { > /* > * writeback is not making progress due to locked > * buffers. Skip this inode for now. > */ > - redirty_tail(inode); > + redirty_tail(inode, wb); > } > spin_unlock(&inode->i_lock); > - spin_unlock(&inode_wb_list_lock); > + spin_unlock(&wb->list_lock); > iput(inode); > cond_resched(); > - spin_lock(&inode_wb_list_lock); > + spin_lock(&wb->list_lock); > if (wbc->nr_to_write <= 0) { > wbc->more_io = 1; > return 1; > @@ -577,7 +576,7 @@ void writeback_inodes_wb(struct bdi_writ > > if (!wbc->wb_start) > wbc->wb_start = jiffies; /* livelock avoidance */ > - spin_lock(&inode_wb_list_lock); > + spin_lock(&wb->list_lock); > if (!wbc->for_kupdate || list_empty(&wb->b_io)) > queue_io(wb, wbc->older_than_this); > > @@ -586,7 +585,7 @@ void writeback_inodes_wb(struct bdi_writ > struct super_block *sb = inode->i_sb; > > if (!pin_sb_for_writeback(sb)) { > - requeue_io(inode); > + requeue_io(inode, wb); > continue; > } > ret = writeback_sb_inodes(sb, wb, wbc, false); > @@ -595,7 +594,7 @@ void writeback_inodes_wb(struct bdi_writ > if (ret) > break; > } > - spin_unlock(&inode_wb_list_lock); > + spin_unlock(&wb->list_lock); > /* Leave any unwritten inodes on b_io */ > } > > @@ -604,11 +603,11 @@ static void __writeback_inodes_sb(struct > { > WARN_ON(!rwsem_is_locked(&sb->s_umount)); > > - spin_lock(&inode_wb_list_lock); > + spin_lock(&wb->list_lock); > if (!wbc->for_kupdate || list_empty(&wb->b_io)) > queue_io(wb, wbc->older_than_this); > writeback_sb_inodes(sb, wb, wbc, true); > - spin_unlock(&inode_wb_list_lock); > + spin_unlock(&wb->list_lock); > } > > /* > @@ -747,15 +746,15 @@ static long wb_writeback(struct bdi_writ > * become available for writeback. Otherwise > * we'll just busyloop. > */ > - spin_lock(&inode_wb_list_lock); > + spin_lock(&wb->list_lock); > if (!list_empty(&wb->b_more_io)) { > inode = wb_inode(wb->b_more_io.prev); > trace_wbc_writeback_wait(&wbc, wb->bdi); > spin_lock(&inode->i_lock); > - inode_wait_for_writeback(inode); > + inode_wait_for_writeback(inode, wb); > spin_unlock(&inode->i_lock); > } > - spin_unlock(&inode_wb_list_lock); > + spin_unlock(&wb->list_lock); > } > > return wrote; > @@ -1092,10 +1091,10 @@ void __mark_inode_dirty(struct inode *in > } > > spin_unlock(&inode->i_lock); > - spin_lock(&inode_wb_list_lock); > + spin_lock(&bdi->wb.list_lock); > inode->dirtied_when = jiffies; > list_move(&inode->i_wb_list, &bdi->wb.b_dirty); > - spin_unlock(&inode_wb_list_lock); > + spin_unlock(&bdi->wb.list_lock); > > if (wakeup_bdi) > bdi_wakeup_thread_delayed(bdi); > @@ -1296,6 +1295,7 @@ EXPORT_SYMBOL(sync_inodes_sb); > */ > int write_inode_now(struct inode *inode, int sync) > { > + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; > int ret; > struct writeback_control wbc = { > .nr_to_write = LONG_MAX, > @@ -1308,11 +1308,11 @@ int write_inode_now(struct inode *inode, > wbc.nr_to_write = 0; > > might_sleep(); > - spin_lock(&inode_wb_list_lock); > + spin_lock(&wb->list_lock); > spin_lock(&inode->i_lock); > - ret = writeback_single_inode(inode, &wbc); > + ret = writeback_single_inode(inode, wb, &wbc); > spin_unlock(&inode->i_lock); > - spin_unlock(&inode_wb_list_lock); > + spin_unlock(&wb->list_lock); > if (sync) > inode_sync_wait(inode); > return ret; > @@ -1332,13 +1332,14 @@ EXPORT_SYMBOL(write_inode_now); > */ > int sync_inode(struct inode *inode, struct writeback_control *wbc) > { > + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; > int ret; > > - spin_lock(&inode_wb_list_lock); > + spin_lock(&wb->list_lock); > spin_lock(&inode->i_lock); > - ret = writeback_single_inode(inode, wbc); > + ret = writeback_single_inode(inode, wb, wbc); > spin_unlock(&inode->i_lock); > - spin_unlock(&inode_wb_list_lock); > + spin_unlock(&wb->list_lock); > return ret; > } > EXPORT_SYMBOL(sync_inode); > Index: linux-2.6/fs/inode.c > =================================================================== > --- linux-2.6.orig/fs/inode.c 2011-04-21 08:31:40.172358011 +0200 > +++ linux-2.6/fs/inode.c 2011-04-21 09:07:05.327511722 +0200 > @@ -37,7 +37,7 @@ > * inode_lru, inode->i_lru > * inode_sb_list_lock protects: > * sb->s_inodes, inode->i_sb_list > - * inode_wb_list_lock protects: > + * bdi->wb.list_lock protects: > * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list > * inode_hash_lock protects: > * inode_hashtable, inode->i_hash > @@ -48,7 +48,7 @@ > * inode->i_lock > * inode_lru_lock > * > - * inode_wb_list_lock > + * bdi->wb.list_lock > * inode->i_lock > * > * inode_hash_lock > @@ -111,7 +111,6 @@ static LIST_HEAD(inode_lru); > static DEFINE_SPINLOCK(inode_lru_lock); > > __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); > -__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock); > > /* > * iprune_sem provides exclusion between the icache shrinking and the > Index: linux-2.6/include/linux/writeback.h > =================================================================== > --- linux-2.6.orig/include/linux/writeback.h 2011-04-21 08:31:42.185680435 +0200 > +++ linux-2.6/include/linux/writeback.h 2011-04-21 09:07:05.327511722 +0200 > @@ -9,8 +9,6 @@ > > struct backing_dev_info; > > -extern spinlock_t inode_wb_list_lock; > - > /* > * fs/fs-writeback.c > */ > Index: linux-2.6/mm/backing-dev.c > =================================================================== > --- linux-2.6.orig/mm/backing-dev.c 2011-04-21 08:31:44.532334389 +0200 > +++ linux-2.6/mm/backing-dev.c 2011-04-21 09:07:05.327511722 +0200 > @@ -45,6 +45,17 @@ static struct timer_list sync_supers_tim > static int bdi_sync_supers(void *); > static void sync_supers_timer_fn(unsigned long); > > +void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) > +{ > + if (wb1 < wb2) { > + spin_lock(&wb1->list_lock); > + spin_lock_nested(&wb2->list_lock, 1); > + } else { > + spin_lock(&wb2->list_lock); > + spin_lock_nested(&wb1->list_lock, 1); > + } > +} > + > #ifdef CONFIG_DEBUG_FS > #include <linux/debugfs.h> > #include <linux/seq_file.h> > @@ -67,14 +78,14 @@ static int bdi_debug_stats_show(struct s > struct inode *inode; > > nr_wb = nr_dirty = nr_io = nr_more_io = 0; > - spin_lock(&inode_wb_list_lock); > + spin_lock(&wb->list_lock); > list_for_each_entry(inode, &wb->b_dirty, i_wb_list) > nr_dirty++; > list_for_each_entry(inode, &wb->b_io, i_wb_list) > nr_io++; > list_for_each_entry(inode, &wb->b_more_io, i_wb_list) > nr_more_io++; > - spin_unlock(&inode_wb_list_lock); > + spin_unlock(&wb->list_lock); > > global_dirty_limits(&background_thresh, &dirty_thresh); > bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); > @@ -628,6 +639,7 @@ static void bdi_wb_init(struct bdi_write > INIT_LIST_HEAD(&wb->b_dirty); > INIT_LIST_HEAD(&wb->b_io); > INIT_LIST_HEAD(&wb->b_more_io); > + spin_lock_init(&wb->list_lock); > setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); > } > > @@ -676,11 +688,12 @@ void bdi_destroy(struct backing_dev_info > if (bdi_has_dirty_io(bdi)) { > struct bdi_writeback *dst = &default_backing_dev_info.wb; > > - spin_lock(&inode_wb_list_lock); > + bdi_lock_two(&bdi->wb, dst); > list_splice(&bdi->wb.b_dirty, &dst->b_dirty); > list_splice(&bdi->wb.b_io, &dst->b_io); > list_splice(&bdi->wb.b_more_io, &dst->b_more_io); > - spin_unlock(&inode_wb_list_lock); > + spin_unlock(&bdi->wb.list_lock); > + spin_unlock(&dst->list_lock); > } > > bdi_unregister(bdi); > Index: linux-2.6/mm/filemap.c > =================================================================== > --- linux-2.6.orig/mm/filemap.c 2011-04-21 08:31:42.159013915 +0200 > +++ linux-2.6/mm/filemap.c 2011-04-21 09:07:05.330845037 +0200 > @@ -80,7 +80,7 @@ > * ->i_mutex > * ->i_alloc_sem (various) > * > - * inode_wb_list_lock > + * bdi->wb.list_lock > * sb_lock (fs/fs-writeback.c) > * ->mapping->tree_lock (__sync_single_inode) > * > @@ -98,9 +98,9 @@ > * ->zone.lru_lock (check_pte_range->isolate_lru_page) > * ->private_lock (page_remove_rmap->set_page_dirty) > * ->tree_lock (page_remove_rmap->set_page_dirty) > - * inode_wb_list_lock (page_remove_rmap->set_page_dirty) > + * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) > * ->inode->i_lock (page_remove_rmap->set_page_dirty) > - * inode_wb_list_lock (zap_pte_range->set_page_dirty) > + * bdi.wb->list_lock (zap_pte_range->set_page_dirty) > * ->inode->i_lock (zap_pte_range->set_page_dirty) > * ->private_lock (zap_pte_range->__set_page_dirty_buffers) > * > Index: linux-2.6/mm/rmap.c > =================================================================== > --- linux-2.6.orig/mm/rmap.c 2011-04-21 08:31:41.519017382 +0200 > +++ linux-2.6/mm/rmap.c 2011-04-21 09:07:05.330845037 +0200 > @@ -32,11 +32,11 @@ > * mmlist_lock (in mmput, drain_mmlist and others) > * mapping->private_lock (in __set_page_dirty_buffers) > * inode->i_lock (in set_page_dirty's __mark_inode_dirty) > - * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) > + * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) > * sb_lock (within inode_lock in fs/fs-writeback.c) > * mapping->tree_lock (widely used, in set_page_dirty, > * in arch-dependent flush_dcache_mmap_lock, > - * within inode_wb_list_lock in __sync_single_inode) > + * within bdi.wb->list_lock in __sync_single_inode) > * > * (code doesn't rely on that order so it could be switched around) > * ->tasklist_lock > Index: linux-2.6/fs/block_dev.c > =================================================================== > --- linux-2.6.orig/fs/block_dev.c 2011-04-21 08:31:44.522334444 +0200 > +++ linux-2.6/fs/block_dev.c 2011-04-21 09:07:05.330845037 +0200 > @@ -55,13 +55,16 @@ EXPORT_SYMBOL(I_BDEV); > static void bdev_inode_switch_bdi(struct inode *inode, > struct backing_dev_info *dst) > { > - spin_lock(&inode_wb_list_lock); > + struct backing_dev_info *old = inode->i_data.backing_dev_info; > + > + bdi_lock_two(&old->wb, &dst->wb); > spin_lock(&inode->i_lock); > inode->i_data.backing_dev_info = dst; > if (inode->i_state & I_DIRTY) > list_move(&inode->i_wb_list, &dst->wb.b_dirty); > spin_unlock(&inode->i_lock); > - spin_unlock(&inode_wb_list_lock); > + spin_unlock(&old->wb.list_lock); > + spin_unlock(&dst->wb.list_lock); > } > > static sector_t max_block(struct block_device *bdev) > Index: linux-2.6/include/linux/backing-dev.h > =================================================================== > --- linux-2.6.orig/include/linux/backing-dev.h 2011-04-21 08:31:42.202347013 +0200 > +++ linux-2.6/include/linux/backing-dev.h 2011-04-21 09:07:05.330845037 +0200 > @@ -57,6 +57,7 @@ struct bdi_writeback { > struct list_head b_dirty; /* dirty inodes */ > struct list_head b_io; /* parked for writeback */ > struct list_head b_more_io; /* parked for more writeback */ > + spinlock_t list_lock; /* protects the b_* lists. */ > }; > > struct backing_dev_info { > @@ -106,6 +107,7 @@ int bdi_writeback_thread(void *data); > int bdi_has_dirty_io(struct backing_dev_info *bdi); > void bdi_arm_supers_timer(void); > void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); > +void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); > > extern spinlock_t bdi_lock; > extern struct list_head bdi_list; -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html