This uses a statically-allocated percpu variable.  All operations are
local to a cpu as long as that cpu operates on the same mount and there
are no writer count imbalances.  Writer count imbalances happen when a
write is taken on one cpu and released on another, for instance when an
open/close pair is performed on two different cpus because the task
moved.

I've written a little benchmark that sits in a loop for a couple of
seconds on several cpus in parallel, doing open/write/close loops (a
rough sketch of this kind of loop is appended at the end of this mail):

	http://sr71.net/~dave/linux/openbench.c

That benchmark is a worst-possible case for this patch.  It does opens
on a _pair_ of files in two different mounts in parallel, which should
cause my code to lose its "operate on the same mount" optimization
completely.  This worst-case scenario causes a 3% degradation in the
benchmark.

I could probably get rid of even this 3%, but it would be more complex
than what I have here, and I think this is getting into acceptable
territory.  In practice, I expect writing more than 3 bytes to a file,
as well as disk I/O, to mask any effects this has.

---

A little history:

These patches used to use a single atomic_t in each vfsmount to track
the number of writers to the mount at any given time.  Open a file for
write anywhere on the mount, and you increment the atomic_t.  This
didn't scale on NUMA or SMP: it caused something like a 4x slowdown in
open/write/close operations and bounced cachelines around like mad.

I tried out a new scheme with a per-node spinlock and atomic_t in each
vfsmount to replace the old single atomic_t.  It worked _much_ better
on NUMA, but still caused a ~6% regression on the same open/write/close
operation set on a normal 4-way SMP machine, because it was still
contended within a node.

We could generalize this lock, but this is an awfully specialized
situation, and I'd be worried that people would try to use it when it
isn't absolutely necessary.

Signed-off-by: Dave Hansen <haveblue@xxxxxxxxxx>
---

 lxc-dave/fs/namei.c            |    6 +
 lxc-dave/fs/namespace.c        |  140 ++++++++++++++++++++++++++++++++++++++++-
 lxc-dave/include/linux/mount.h |    9 ++
 3 files changed, 150 insertions(+), 5 deletions(-)

diff -puN fs/namei.c~numa_mnt_want_write fs/namei.c
--- lxc/fs/namei.c~numa_mnt_want_write	2007-06-22 10:12:46.000000000 -0700
+++ lxc-dave/fs/namei.c	2007-06-22 10:12:46.000000000 -0700
@@ -231,10 +231,12 @@ int permission(struct inode *inode, int
 	int retval, submask;
 
 	if (mask & MAY_WRITE) {
-		/*
-		 * Nobody gets write access to a read-only fs.
+		/*
+		 * If this WARN_ON() is hit, it likely means that
+		 * there was a missed mnt_want_write() on the path
+		 * leading here.
 		 */
+		WARN_ON(__mnt_is_readonly(nd->mnt));
 		if (IS_RDONLY(inode) &&
 		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
 			return -EROFS;
diff -puN fs/namespace.c~numa_mnt_want_write fs/namespace.c
--- lxc/fs/namespace.c~numa_mnt_want_write	2007-06-22 10:12:46.000000000 -0700
+++ lxc-dave/fs/namespace.c	2007-06-22 10:21:39.000000000 -0700
@@ -17,6 +17,7 @@
 #include <linux/quotaops.h>
 #include <linux/acct.h>
 #include <linux/capability.h>
+#include <linux/cpumask.h>
 #include <linux/module.h>
 #include <linux/sysfs.h>
 #include <linux/seq_file.h>
@@ -51,6 +52,8 @@ static inline unsigned long hash(struct
 	return tmp & hash_mask;
 }
 
+#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
+
 struct vfsmount *alloc_vfsmnt(const char *name)
 {
 	struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -64,6 +67,7 @@ struct vfsmount *alloc_vfsmnt(const char
 		INIT_LIST_HEAD(&mnt->mnt_share);
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
+		atomic_set(&mnt->__mnt_writers, 0);
 		if (name) {
 			int size = strlen(name) + 1;
 			char *newname = kmalloc(size, GFP_KERNEL);
@@ -76,19 +80,149 @@ struct vfsmount *alloc_vfsmnt(const char
 	return mnt;
 }
 
-int mnt_want_write(struct vfsmount *mnt)
+
+struct mnt_writer {
+	/*
+	 * If holding multiple instances of this lock, they
+	 * must be ordered by cpu number.
+	 */
+	spinlock_t lock;
+	unsigned long count;
+	struct vfsmount *mnt;
+} ____cacheline_aligned_in_smp;
+DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+
+static int __init init_mnt_writers(void)
 {
-	if (__mnt_is_readonly(mnt))
-		return -EROFS;
+	int cpu;
+	for_each_possible_cpu(cpu) {
+		struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
+		spin_lock_init(&writer->lock);
+		writer->count = 0;
+	}
 	return 0;
 }
+fs_initcall(init_mnt_writers);
+
+static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
+{
+	if (!cpu_writer->mnt)
+		return;
+	atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
+	cpu_writer->count = 0;
+}
+
+int mnt_want_write(struct vfsmount *mnt)
+{
+	int ret = 0;
+	struct mnt_writer *cpu_writer;
+
+	cpu_writer = &get_cpu_var(mnt_writers);
+	spin_lock(&cpu_writer->lock);
+	if (__mnt_is_readonly(mnt)) {
+		ret = -EROFS;
+		goto out;
+	}
+	if (cpu_writer->mnt != mnt) {
+		__clear_mnt_count(cpu_writer);
+		cpu_writer->mnt = mnt;
+	}
+	cpu_writer->count++;
+out:
+	spin_unlock(&cpu_writer->lock);
+	put_cpu_var(mnt_writers);
+	return ret;
+}
 EXPORT_SYMBOL_GPL(mnt_want_write);
 
+static void lock_and_coalesce_cpu_mnt_writer_counts(void)
+{
+	int cpu;
+	struct mnt_writer *cpu_writer;
+
+	for_each_possible_cpu(cpu) {
+		cpu_writer = &per_cpu(mnt_writers, cpu);
+		spin_lock(&cpu_writer->lock);
+		__clear_mnt_count(cpu_writer);
+		cpu_writer->mnt = NULL;
+	}
+}
+
+static void mnt_unlock_cpus(void)
+{
+	int cpu;
+	struct mnt_writer *cpu_writer;
+
+	for_each_possible_cpu(cpu) {
+		cpu_writer = &per_cpu(mnt_writers, cpu);
+		spin_unlock(&cpu_writer->lock);
+	}
+}
+
+/*
+ * These per-cpu write counts are not guaranteed to have
+ * matched increments and decrements on any given cpu.
+ * A file open()ed for write on one cpu and close()d on
+ * another cpu will imbalance this count.  Make sure it
+ * does not get too far out of whack.
+ */
+static void handle_write_count_underflow(struct vfsmount *mnt)
+{
+	/*
+	 * This should be fast the vast majority
+	 * of the time because everyone will only
+	 * be reading it and will share cachelines.
+	 */
+	while (atomic_read(&mnt->__mnt_writers) <
+	       MNT_WRITER_UNDERFLOW_LIMIT) {
+		/*
+		 * It isn't necessary to hold all of the locks
+		 * at the same time, but doing it this way makes
+		 * us share a lot more code.
+		 */
+		lock_and_coalesce_cpu_mnt_writer_counts();
+		mnt_unlock_cpus();
+	}
+}
+
 void mnt_drop_write(struct vfsmount *mnt)
 {
+	struct mnt_writer *cpu_writer;
+
+	cpu_writer = &get_cpu_var(mnt_writers);
+	spin_lock(&cpu_writer->lock);
+	put_cpu_var(mnt_writers);
+	if (cpu_writer->count > 0)
+		cpu_writer->count--;
+	else
+		atomic_dec(&mnt->__mnt_writers);
+	spin_unlock(&cpu_writer->lock);
+	handle_write_count_underflow(mnt);
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
 
+int mnt_make_readonly(struct vfsmount *mnt)
+{
+	int ret = 0;
+
+	lock_and_coalesce_cpu_mnt_writer_counts();
+	/*
+	 * With all the locks held, this value is stable
+	 */
+	if (atomic_read(&mnt->__mnt_writers) > 0) {
+		ret = -EBUSY;
+		goto out;
+	}
+	/*
+	 * actually set mount's r/o flag here to make
+	 * __mnt_is_readonly() true, which keeps anyone
+	 * from doing a successful mnt_want_write().
	 */
+out:
+	mnt_unlock_cpus();
+	return ret;
+}
+
 int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
 {
 	mnt->mnt_sb = sb;
diff -puN include/linux/mount.h~numa_mnt_want_write include/linux/mount.h
--- lxc/include/linux/mount.h~numa_mnt_want_write	2007-06-22 10:12:46.000000000 -0700
+++ lxc-dave/include/linux/mount.h	2007-06-22 10:21:39.000000000 -0700
@@ -14,6 +14,7 @@
 
 #include <linux/types.h>
 #include <linux/list.h>
+#include <linux/nodemask.h>
 #include <linux/spinlock.h>
 #include <asm/atomic.h>
 
@@ -61,6 +62,14 @@ struct vfsmount {
 	atomic_t mnt_count;
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	int mnt_pinned;
+
+	/*
+	 * This value is not stable unless all of the
+	 * mnt_writers[] spinlocks are held, and all
+	 * mnt_writer[]s on this mount have 0 as
+	 * their ->count
+	 */
+	atomic_t __mnt_writers;
 };
 
 static inline struct vfsmount *mntget(struct vfsmount *mnt)
_
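
For reference, here is the rough sketch promised above of the kind of
worst-case loop the benchmark runs.  This is only an illustration of
what the text describes, not the actual openbench.c: the mount points
(/mnt/a, /mnt/b), the iteration count, and the fork()/sched_setaffinity()
plumbing are all made up here, and the real benchmark runs for a couple
of seconds rather than a fixed number of iterations.

/*
 * Illustrative sketch only -- not the real openbench.c.  Two children,
 * pinned to different cpus, each do open/write/close in a loop on a
 * file that is assumed to live on a *different* mount, which defeats
 * the "same mount on this cpu" optimization described above.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/wait.h>

static void open_write_close_loop(const char *path, int cpu, long iterations)
{
	cpu_set_t mask;
	long i;

	/* pin this child to one cpu so the loop stays cpu-local */
	CPU_ZERO(&mask);
	CPU_SET(cpu, &mask);
	if (sched_setaffinity(0, sizeof(mask), &mask) < 0)
		perror("sched_setaffinity");

	for (i = 0; i < iterations; i++) {
		int fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
		if (fd < 0) {
			perror(path);
			exit(1);
		}
		/* the "3 bytes" mentioned above */
		if (write(fd, "foo", 3) != 3)
			perror("write");
		close(fd);
	}
}

int main(void)
{
	/* hypothetical paths: point these at files in two different mounts */
	const char *files[2] = { "/mnt/a/testfile", "/mnt/b/testfile" };
	int cpu;

	for (cpu = 0; cpu < 2; cpu++) {
		if (fork() == 0) {
			open_write_close_loop(files[cpu], cpu, 1000000);
			exit(0);
		}
	}
	while (wait(NULL) > 0)
		;
	return 0;
}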