When many threads are trying to add or delete inode to or from a superblock's s_inodes list, spinlock contention on the list can become a performance bottleneck. This patch changes the s_inodes field to become a dlock list which is a distributed set of lists with per-list spinlocks. As a result, the following superblock inode list (sb->s_inodes) iteration functions in vfs are also being modified: 1. iterate_bdevs() 2. drop_pagecache_sb() 3. evict_inodes() 4. invalidate_inodes() 5. fsnotify_unmount_inodes() 6. add_dquot_ref() 7. remove_dquot_ref() With an exit microbenchmark that creates a large number of threads, attachs many inodes to them and then exits. The runtimes of that microbenchmark with 1200 threads before and after the patch on a 2-socket Intel E5-2699 v3 system (36 cores, 72 threads) were as follows: Kernel Elapsed Time System Time ------ ------------ ----------- Vanilla 4.14-rc3 58.40s 65m20s Patched 4.14-rc3 42.17s 45m32s Before the patch, spinlock contention at the inode_sb_list_add() function at the startup phase and the evict() function at the teardown phase were about 68% and 97% of total CPU time respectively (as measured by perf). With the patched kernel, the inode_sb_list_add() function is less than 1% and the evict() function is at 2.5% respectively. There were still quite a lot of lock contention elsewhere that limited the gain. For example, the nlru lock was contributing most of the lock contention in the teardown phase. Signed-off-by: Waiman Long <longman@xxxxxxxxxx> Reviewed-by: Jan Kara <jack@xxxxxxx> --- fs/block_dev.c | 9 ++++----- fs/drop_caches.c | 9 ++++----- fs/inode.c | 34 +++++++++++++--------------------- fs/notify/fsnotify.c | 9 ++++----- fs/quota/dquot.c | 14 ++++++-------- fs/super.c | 7 ++++--- include/linux/fs.h | 8 ++++---- 7 files changed, 39 insertions(+), 51 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 789f55e..9bbeb14 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -2127,9 +2127,9 @@ int __invalidate_device(struct block_device *bdev, bool kill_dirty) void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) { struct inode *inode, *old_inode = NULL; + DEFINE_DLOCK_LIST_ITER(iter, &blockdev_superblock->s_inodes); - spin_lock(&blockdev_superblock->s_inode_list_lock); - list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { + dlist_for_each_entry(inode, &iter, i_sb_list) { struct address_space *mapping = inode->i_mapping; struct block_device *bdev; @@ -2141,7 +2141,7 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&blockdev_superblock->s_inode_list_lock); + dlock_list_unlock(&iter); /* * We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the @@ -2159,8 +2159,7 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) func(bdev, arg); mutex_unlock(&bdev->bd_mutex); - spin_lock(&blockdev_superblock->s_inode_list_lock); + dlock_list_relock(&iter); } - spin_unlock(&blockdev_superblock->s_inode_list_lock); iput(old_inode); } diff --git a/fs/drop_caches.c b/fs/drop_caches.c index d72d52b..8db1374 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -16,9 +16,9 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) { struct inode *inode, *toput_inode = NULL; + DEFINE_DLOCK_LIST_ITER(iter, &sb->s_inodes); - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + dlist_for_each_entry(inode, &iter, i_sb_list) { spin_lock(&inode->i_lock); if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || (inode->i_mapping->nrpages == 0)) { @@ -27,15 +27,14 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_list_lock); + dlock_list_unlock(&iter); invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode; - spin_lock(&sb->s_inode_list_lock); + dlock_list_relock(&iter); } - spin_unlock(&sb->s_inode_list_lock); iput(toput_inode); } diff --git a/fs/inode.c b/fs/inode.c index a9a38c0..04bcc9a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -28,7 +28,7 @@ * inode->i_state, inode->i_hash, __iget() * Inode LRU list locks protect: * inode->i_sb->s_inode_lru, inode->i_lru - * inode->i_sb->s_inode_list_lock protects: + * inode->i_sb->s_inodes->head->lock protects: * inode->i_sb->s_inodes, inode->i_sb_list * bdi->wb.list_lock protects: * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list @@ -37,7 +37,7 @@ * * Lock ordering: * - * inode->i_sb->s_inode_list_lock + * inode->i_sb->s_inodes->head->lock * inode->i_lock * Inode LRU list locks * @@ -45,7 +45,7 @@ * inode->i_lock * * inode_hash_lock - * inode->i_sb->s_inode_list_lock + * inode->i_sb->s_inodes->head->lock * inode->i_lock * * iunique_lock @@ -434,19 +434,14 @@ static void inode_lru_list_del(struct inode *inode) */ void inode_sb_list_add(struct inode *inode) { - spin_lock(&inode->i_sb->s_inode_list_lock); - list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); - spin_unlock(&inode->i_sb->s_inode_list_lock); + dlock_lists_add(&inode->i_sb_list, &inode->i_sb->s_inodes); } EXPORT_SYMBOL_GPL(inode_sb_list_add); static inline void inode_sb_list_del(struct inode *inode) { - if (!list_empty(&inode->i_sb_list)) { - spin_lock(&inode->i_sb->s_inode_list_lock); - list_del_init(&inode->i_sb_list); - spin_unlock(&inode->i_sb->s_inode_list_lock); - } + if (!list_empty(&inode->i_sb_list.list)) + dlock_lists_del(&inode->i_sb_list); } static unsigned long hash(struct super_block *sb, unsigned long hashval) @@ -602,11 +597,12 @@ static void dispose_list(struct list_head *head) void evict_inodes(struct super_block *sb) { struct inode *inode; + struct dlock_list_iter iter; LIST_HEAD(dispose); again: - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + init_dlock_list_iter(&iter, &sb->s_inodes); + dlist_for_each_entry(inode, &iter, i_sb_list) { if (atomic_read(&inode->i_count)) continue; @@ -627,13 +623,12 @@ void evict_inodes(struct super_block *sb) * bit so we don't livelock. */ if (need_resched()) { - spin_unlock(&sb->s_inode_list_lock); + dlock_list_unlock(&iter); cond_resched(); dispose_list(&dispose); goto again; } } - spin_unlock(&sb->s_inode_list_lock); dispose_list(&dispose); } @@ -654,9 +649,9 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) int busy = 0; struct inode *inode; LIST_HEAD(dispose); + DEFINE_DLOCK_LIST_ITER(iter, &sb->s_inodes); - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + dlist_for_each_entry(inode, &iter, i_sb_list) { spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { spin_unlock(&inode->i_lock); @@ -678,7 +673,6 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); } - spin_unlock(&sb->s_inode_list_lock); dispose_list(&dispose); @@ -893,7 +887,7 @@ struct inode *new_inode_pseudo(struct super_block *sb) spin_lock(&inode->i_lock); inode->i_state = 0; spin_unlock(&inode->i_lock); - INIT_LIST_HEAD(&inode->i_sb_list); + init_dlock_list_node(&inode->i_sb_list); } return inode; } @@ -914,8 +908,6 @@ struct inode *new_inode(struct super_block *sb) { struct inode *inode; - spin_lock_prefetch(&sb->s_inode_list_lock); - inode = new_inode_pseudo(sb); if (inode) inode_sb_list_add(inode); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 0c4583b..136a68a 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -51,9 +51,9 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt) void fsnotify_unmount_inodes(struct super_block *sb) { struct inode *inode, *iput_inode = NULL; + DEFINE_DLOCK_LIST_ITER(iter, &sb->s_inodes); - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + dlist_for_each_entry(inode, &iter, i_sb_list) { /* * We cannot __iget() an inode in state I_FREEING, * I_WILL_FREE, or I_NEW which is fine because by that point @@ -78,7 +78,7 @@ void fsnotify_unmount_inodes(struct super_block *sb) __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_list_lock); + dlock_list_unlock(&iter); if (iput_inode) iput(iput_inode); @@ -90,9 +90,8 @@ void fsnotify_unmount_inodes(struct super_block *sb) iput_inode = inode; - spin_lock(&sb->s_inode_list_lock); + dlock_list_relock(&iter); } - spin_unlock(&sb->s_inode_list_lock); if (iput_inode) iput(iput_inode); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 52ad151..30e0211 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -936,12 +936,12 @@ static int dqinit_needed(struct inode *inode, int type) static void add_dquot_ref(struct super_block *sb, int type) { struct inode *inode, *old_inode = NULL; + DEFINE_DLOCK_LIST_ITER(iter, &sb->s_inodes); #ifdef CONFIG_QUOTA_DEBUG int reserved = 0; #endif - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + dlist_for_each_entry(inode, &iter, i_sb_list) { spin_lock(&inode->i_lock); if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || !atomic_read(&inode->i_writecount) || @@ -951,7 +951,7 @@ static void add_dquot_ref(struct super_block *sb, int type) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_list_lock); + dlock_list_unlock(&iter); #ifdef CONFIG_QUOTA_DEBUG if (unlikely(inode_get_rsv_space(inode) > 0)) @@ -969,9 +969,8 @@ static void add_dquot_ref(struct super_block *sb, int type) * later. */ old_inode = inode; - spin_lock(&sb->s_inode_list_lock); + dlock_list_relock(&iter); } - spin_unlock(&sb->s_inode_list_lock); iput(old_inode); #ifdef CONFIG_QUOTA_DEBUG @@ -1039,9 +1038,9 @@ static void remove_dquot_ref(struct super_block *sb, int type, { struct inode *inode; int reserved = 0; + DEFINE_DLOCK_LIST_ITER(iter, &sb->s_inodes); - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + dlist_for_each_entry(inode, &iter, i_sb_list) { /* * We have to scan also I_NEW inodes because they can already * have quota pointer initialized. Luckily, we need to touch @@ -1056,7 +1055,6 @@ static void remove_dquot_ref(struct super_block *sb, int type, } spin_unlock(&dq_data_lock); } - spin_unlock(&sb->s_inode_list_lock); #ifdef CONFIG_QUOTA_DEBUG if (reserved) { printk(KERN_WARNING "VFS (%s): Writes happened after quota" diff --git a/fs/super.c b/fs/super.c index 166c4ee..a90a070 100644 --- a/fs/super.c +++ b/fs/super.c @@ -164,6 +164,7 @@ static void destroy_super(struct super_block *s) { list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); + free_dlock_list_heads(&s->s_inodes); security_sb_free(s); WARN_ON(!list_empty(&s->s_mounts)); put_user_ns(s->s_user_ns); @@ -210,11 +211,11 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_anon); mutex_init(&s->s_sync_lock); - INIT_LIST_HEAD(&s->s_inodes); - spin_lock_init(&s->s_inode_list_lock); INIT_LIST_HEAD(&s->s_inodes_wb); spin_lock_init(&s->s_inode_wblist_lock); + if (alloc_dlock_list_heads(&s->s_inodes)) + goto fail; if (list_lru_init_memcg(&s->s_dentry_lru)) goto fail; if (list_lru_init_memcg(&s->s_inode_lru)) @@ -434,7 +435,7 @@ void generic_shutdown_super(struct super_block *sb) if (sop->put_super) sop->put_super(sb); - if (!list_empty(&sb->s_inodes)) { + if (!dlock_lists_empty(&sb->s_inodes)) { printk("VFS: Busy inodes after unmount of %s. " "Self-destruct in 5 seconds. Have a nice day...\n", sb->s_id); diff --git a/include/linux/fs.h b/include/linux/fs.h index 13dab19..d687ad8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -34,6 +34,7 @@ #include <linux/delayed_call.h> #include <linux/uuid.h> #include <linux/errseq.h> +#include <linux/dlock-list.h> #include <asm/byteorder.h> #include <uapi/linux/fs.h> @@ -632,7 +633,7 @@ struct inode { u16 i_wb_frn_history; #endif struct list_head i_lru; /* inode LRU list */ - struct list_head i_sb_list; + struct dlock_list_node i_sb_list; struct list_head i_wb_list; /* backing dev writeback list */ union { struct hlist_head i_dentry; @@ -1434,9 +1435,8 @@ struct super_block { */ int s_stack_depth; - /* s_inode_list_lock protects s_inodes */ - spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp; - struct list_head s_inodes; /* all inodes */ + /* The internal per-list locks protect s_inodes */ + struct dlock_list_heads s_inodes; /* all inodes */ spinlock_t s_inode_wblist_lock; struct list_head s_inodes_wb; /* writeback inodes */ -- 1.8.3.1