fs: scale files_lock

Improve scalability of files_lock by adding per-cpu, per-sb files lists,
protected with per-cpu locking, effectively turning it into a big-writer
lock.

One difficulty with this approach is that a file can be removed from the
list by another CPU, so we must track which per-cpu list the file is on.
Scalability could suffer if files are frequently removed from a different
CPU's list. However, loads with frequent removal of files imply a short
interval between adding and removing the files, and the scheduler attempts
to avoid migrating processes too far away. Also, even in the cross-CPU
removal case, the hardware has much more opportunity to parallelise
cacheline transfers with N cachelines than with 1.

A worst-case test of one CPU allocating files that are subsequently freed
by N CPUs degenerates to contending on a single lock, which is no worse
than before. When more than one CPU is allocating files, even if they are
always freed by different CPUs, there is more parallelism than in the
single-lock case.

Testing: on a 2-socket, 8-core Opteron, I measure the number of times the
lock is taken to remove a file, the number of times it is removed by the
same CPU that added it, and the number of times it is removed by the same
node that added it.

Booting:	locks=25049   cpu-hits=23174 (92.5%)   node-hits=23945 (95.6%)
kbuild -j16	locks=2281913 cpu-hits=2208126 (96.8%) node-hits=2252674 (98.7%)
dbench 64	locks=4306582 cpu-hits=4287247 (99.6%) node-hits=4299527 (99.8%)

Signed-off-by: Nick Piggin <npiggin@xxxxxxx>
---
 fs/file_table.c    |  155 ++++++++++++++++++++++++++++++++++++++---------------
 fs/super.c         |   18 ++++++
 include/linux/fs.h |    7 ++
 3 files changed, 139 insertions(+), 41 deletions(-)

Index: linux-2.6/fs/file_table.c
===================================================================
--- linux-2.6.orig/fs/file_table.c
+++ linux-2.6/fs/file_table.c
@@ -21,6 +21,7 @@
 #include <linux/fsnotify.h>
 #include <linux/sysctl.h>
 #include <linux/percpu_counter.h>
+#include <linux/percpu.h>
 #include <linux/ima.h>
 
 #include <asm/atomic.h>
@@ -32,7 +33,7 @@ struct files_stat_struct files_stat = {
 	.max_files = NR_FILE
 };
 
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
+static DEFINE_PER_CPU(spinlock_t, files_cpulock);
 
 /* SLAB cache for file structures */
 static struct kmem_cache *filp_cachep __read_mostly;
@@ -330,42 +331,101 @@ void put_filp(struct file *file)
 
 void file_sb_list_add(struct file *file, struct super_block *sb)
 {
-	spin_lock(&files_lock);
+	spinlock_t *lock;
+	struct list_head *list;
+#ifdef CONFIG_SMP
+	int cpu;
+#endif
+
+	lock = &get_cpu_var(files_cpulock);
+#ifdef CONFIG_SMP
+	cpu = smp_processor_id();
+	list = per_cpu_ptr(sb->s_files, cpu);
+	file->f_sb_list_cpu = cpu;
+#else
+	list = &sb->s_files;
+#endif
+	spin_lock(lock);
 	BUG_ON(!list_empty(&file->f_u.fu_list));
-	list_add(&file->f_u.fu_list, &sb->s_files);
-	spin_unlock(&files_lock);
+	list_add(&file->f_u.fu_list, list);
+	spin_unlock(lock);
+	put_cpu_var(files_cpulock);
 }
 
 void file_sb_list_del(struct file *file)
 {
 	if (!list_empty(&file->f_u.fu_list)) {
-		spin_lock(&files_lock);
+		spinlock_t *lock;
+
+#ifdef CONFIG_SMP
+		lock = &per_cpu(files_cpulock, file->f_sb_list_cpu);
+#else
+		lock = &__get_cpu_var(files_cpulock);
+#endif
+		spin_lock(lock);
 		list_del_init(&file->f_u.fu_list);
-		spin_unlock(&files_lock);
+		spin_unlock(lock);
+	}
+}
+
+static void file_list_lock_all(void)
+{
+	int i;
+	int nr = 0;
+
+	for_each_possible_cpu(i) {
+		spinlock_t *lock;
+
+		lock = &per_cpu(files_cpulock, i);
+		spin_lock_nested(lock, nr);
+		nr++;
+	}
+}
+
+static void file_list_unlock_all(void)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		spinlock_t *lock;
+
+		lock = &per_cpu(files_cpulock, i);
+		spin_unlock(lock);
 	}
 }
 
 int fs_may_remount_ro(struct super_block *sb)
 {
-	struct file *file;
+	int i;
 
 	/* Check that no files are currently opened for writing. */
-	spin_lock(&files_lock);
-	list_for_each_entry(file, &sb->s_files, f_u.fu_list) {
-		struct inode *inode = file->f_path.dentry->d_inode;
-
-		/* File with pending delete? */
-		if (inode->i_nlink == 0)
-			goto too_bad;
-
-		/* Writeable file? */
-		if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
-			goto too_bad;
+	file_list_lock_all();
+	for_each_possible_cpu(i) {
+		struct file *file;
+		struct list_head *list;
+
+#ifdef CONFIG_SMP
+		list = per_cpu_ptr(sb->s_files, i);
+#else
+		list = &sb->s_files;
+#endif
+		list_for_each_entry(file, list, f_u.fu_list) {
+			struct inode *inode = file->f_path.dentry->d_inode;
+
+			/* File with pending delete? */
+			if (inode->i_nlink == 0)
+				goto too_bad;
+
+			/* Writeable file? */
+			if (S_ISREG(inode->i_mode) &&
+					(file->f_mode & FMODE_WRITE))
+				goto too_bad;
+		}
 	}
-	spin_unlock(&files_lock);
+	file_list_unlock_all();
 	return 1; /* Tis' cool bro. */
 too_bad:
-	spin_unlock(&files_lock);
+	file_list_unlock_all();
 	return 0;
 }
 
@@ -378,37 +438,48 @@ too_bad:
  */
 void mark_files_ro(struct super_block *sb)
 {
-	struct file *f;
+	int i;
 
 retry:
-	spin_lock(&files_lock);
-	list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
-		struct vfsmount *mnt;
-		if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
-			continue;
-		if (!file_count(f))
-			continue;
-		if (!(f->f_mode & FMODE_WRITE))
-			continue;
-		spin_lock(&f->f_lock);
-		f->f_mode &= ~FMODE_WRITE;
-		spin_unlock(&f->f_lock);
-		if (file_check_writeable(f) != 0)
-			continue;
-		file_release_write(f);
-		mnt = mntget(f->f_path.mnt);
-		/* This can sleep, so we can't hold the spinlock. */
-		spin_unlock(&files_lock);
-		mnt_drop_write(mnt);
-		mntput(mnt);
-		goto retry;
+	file_list_lock_all();
+	for_each_possible_cpu(i) {
+		struct file *f;
+		struct list_head *list;
+
+#ifdef CONFIG_SMP
+		list = per_cpu_ptr(sb->s_files, i);
+#else
+		list = &sb->s_files;
+#endif
+		list_for_each_entry(f, list, f_u.fu_list) {
+			struct vfsmount *mnt;
+			if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
+				continue;
+			if (!file_count(f))
+				continue;
+			if (!(f->f_mode & FMODE_WRITE))
+				continue;
+			spin_lock(&f->f_lock);
+			f->f_mode &= ~FMODE_WRITE;
+			spin_unlock(&f->f_lock);
+			if (file_check_writeable(f) != 0)
+				continue;
+			file_release_write(f);
+			mnt = mntget(f->f_path.mnt);
+			/* This can sleep, so we can't hold the spinlock. */
+			file_list_unlock_all();
+			mnt_drop_write(mnt);
+			mntput(mnt);
+			goto retry;
+		}
 	}
-	spin_unlock(&files_lock);
+	file_list_unlock_all();
 }
 
 void __init files_init(unsigned long mempages)
 {
 	int n;
+	int i;
 
 	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
 			SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
@@ -423,5 +494,7 @@ void __init files_init(unsigned long mem
 	if (files_stat.max_files < NR_FILE)
 		files_stat.max_files = NR_FILE;
 	files_defer_init();
+	for_each_possible_cpu(i)
+		spin_lock_init(&per_cpu(files_cpulock, i));
 	percpu_counter_init(&nr_files, 0);
 }
Index: linux-2.6/fs/super.c
===================================================================
--- linux-2.6.orig/fs/super.c
+++ linux-2.6/fs/super.c
@@ -62,7 +62,22 @@ static struct super_block *alloc_super(s
 		s = NULL;
 		goto out;
 	}
+#ifdef CONFIG_SMP
+	s->s_files = alloc_percpu(struct list_head);
+	if (!s->s_files) {
+		security_sb_free(s);
+		kfree(s);
+		s = NULL;
+		goto out;
+	} else {
+		int i;
+
+		for_each_possible_cpu(i)
+			INIT_LIST_HEAD(per_cpu_ptr(s->s_files, i));
+	}
+#else
 	INIT_LIST_HEAD(&s->s_files);
+#endif
 	INIT_LIST_HEAD(&s->s_instances);
 	INIT_HLIST_HEAD(&s->s_anon);
 	INIT_LIST_HEAD(&s->s_inodes);
@@ -117,6 +132,9 @@ out:
  */
 static inline void destroy_super(struct super_block *s)
 {
+#ifdef CONFIG_SMP
+	free_percpu(s->s_files);
+#endif
 	security_sb_free(s);
 	kfree(s->s_subtype);
 	kfree(s->s_options);
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h
+++ linux-2.6/include/linux/fs.h
@@ -924,6 +924,9 @@ struct file {
 #define f_vfsmnt	f_path.mnt
 	const struct file_operations	*f_op;
 	spinlock_t		f_lock;  /* f_ep_links, f_flags, no IRQ */
+#ifdef CONFIG_SMP
+	int			f_sb_list_cpu;
+#endif
 	atomic_long_t		f_count;
 	unsigned int		f_flags;
 	fmode_t			f_mode;
@@ -1340,7 +1343,11 @@ struct super_block {
 	struct list_head	s_inodes;	/* all inodes */
 	struct hlist_head	s_anon;		/* anonymous dentries for (nfs) exporting */
+#ifdef CONFIG_SMP
+	struct list_head	*s_files;
+#else
 	struct list_head	s_files;
+#endif
 	/* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */
 	struct list_head	s_dentry_lru;	/* unused dentry lru */
 	int			s_nr_dentry_unused;	/* # of dentry on lru */
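
For reference, the locking scheme can be sketched stand-alone in user
space. This is an illustration only, not part of the patch: pthread
mutexes stand in for the per-cpu spinlocks, a fixed NCPU array stands in
for the per-cpu data, and a caller-supplied cpu index stands in for
smp_processor_id(). All names here (NCPU, struct node, node_add, node_del,
lists_lock_all, lists_unlock_all) are made up for the example.

/*
 * User-space sketch of per-cpu lists with per-cpu locks ("big-writer"
 * lock): fast paths take one lock, rare global walks take them all.
 */
#include <pthread.h>

#define NCPU 8

struct node {
	struct node *next, *prev;
	int list_cpu;			/* like file->f_sb_list_cpu */
};

static struct percpu_list {
	pthread_mutex_t lock;
	struct node head;		/* circular doubly-linked list */
} lists[NCPU];

static void lists_init(void)
{
	int i;

	for (i = 0; i < NCPU; i++) {
		pthread_mutex_init(&lists[i].lock, NULL);
		lists[i].head.next = &lists[i].head;
		lists[i].head.prev = &lists[i].head;
	}
}

/* Fast path: add to the local cpu's list, remembering where we put it. */
static void node_add(struct node *n, int cpu)
{
	struct percpu_list *l = &lists[cpu];

	n->list_cpu = cpu;
	pthread_mutex_lock(&l->lock);
	n->next = l->head.next;
	n->prev = &l->head;
	l->head.next->prev = n;
	l->head.next = n;
	pthread_mutex_unlock(&l->lock);
}

/* Fast path: remove from whichever list we were added to (maybe remote). */
static void node_del(struct node *n)
{
	struct percpu_list *l = &lists[n->list_cpu];

	pthread_mutex_lock(&l->lock);
	n->prev->next = n->next;
	n->next->prev = n->prev;
	pthread_mutex_unlock(&l->lock);
}

/*
 * Slow path, the "write" side of the big-writer lock: take every per-cpu
 * lock in a fixed order before walking all lists, as fs_may_remount_ro()
 * and mark_files_ro() do via file_list_lock_all() above.
 */
static void lists_lock_all(void)
{
	int i;

	for (i = 0; i < NCPU; i++)
		pthread_mutex_lock(&lists[i].lock);
}

static void lists_unlock_all(void)
{
	int i;

	for (i = NCPU - 1; i >= 0; i--)
		pthread_mutex_unlock(&lists[i].lock);
}

int main(void)
{
	struct node n;

	lists_init();
	node_add(&n, 0);	/* added on "cpu" 0 */
	node_del(&n);		/* removed, here by the same cpu */

	lists_lock_all();	/* rare whole-structure operation */
	lists_unlock_all();
	return 0;
}

The point of the fixed locking order in lists_lock_all() is deadlock
avoidance between concurrent "lock all" callers; in the patch this is also
what the spin_lock_nested() annotation per lock is for. A node added on
cpu 0 but freed from cpu 5 simply contends on lists[0].lock, which is the
cross-CPU removal case discussed in the changelog; the common same-CPU
case touches only local cachelines.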