This uses a statically-allocated percpu variable.  All operations are
local to a cpu as long as that cpu operates on the same mount and there
are no writer count imbalances.  Writer count imbalances happen when a
write is taken on one cpu and released on another, for instance when an
open/close pair is performed on two different cpus because the task
moved.

I've written a little benchmark that sits in a loop for a couple of
seconds on several cpus in parallel, doing open/write/close loops (a
rough sketch of this kind of loop is appended at the end of this mail):

	http://sr71.net/~dave/linux/openbench.c

That benchmark is a worst-possible case for this patch.  It does opens
on a _pair_ of files in two different mounts in parallel, which should
cause my code to lose its "operate on the same mount" optimization
completely.  This worst-case scenario causes a 3% degradation in the
benchmark.

I could probably get rid of even this 3%, but it would be more complex
than what I have here, and I think this is getting into acceptable
territory.  In practice, I expect writing more than 3 bytes to a file,
as well as disk I/O, to mask any effects this has.

---

A little history:

These patches used to use a single atomic_t in each vfsmount to track
the number of writers to the mount at any given time.  Open a file for
write anywhere on the mount, and you increment the atomic_t.  This
didn't scale on NUMA or SMP: it caused something like a 4x slowdown in
open/write/close operations and bounced cachelines around like mad.

I tried out a new scheme with a per-node spinlock and atomic_t in each
vfsmount to replace the old single atomic_t.  It worked _much_ better
on NUMA, but still caused a ~6% regression on the same open/write/close
operation set on a normal 4-way SMP machine, because it was still
contended within a node.

We could generalize this lock, but this is an awfully specialized
situation, and I'd be worried that people would try to use it when it
isn't absolutely necessary.

Signed-off-by: Dave Hansen <haveblue@xxxxxxxxxx>
---

 lxc-dave/fs/namei.c            |    6 +
 lxc-dave/fs/namespace.c        |  140 ++++++++++++++++++++++++++++++++++++++++-
 lxc-dave/include/linux/mount.h |    9 ++
 3 files changed, 150 insertions(+), 5 deletions(-)

diff -puN fs/namei.c~numa_mnt_want_write fs/namei.c
--- lxc/fs/namei.c~numa_mnt_want_write	2007-06-22 10:12:46.000000000 -0700
+++ lxc-dave/fs/namei.c	2007-06-22 10:12:46.000000000 -0700
@@ -231,10 +231,12 @@ int permission(struct inode *inode, int
 	int retval, submask;
 
 	if (mask & MAY_WRITE) {
-		/*
-		 * Nobody gets write access to a read-only fs.
+		/*
+		 * If this WARN_ON() is hit, it likely means that
+		 * there was a missed mnt_want_write() on the path
+		 * leading here.
 		 */
+		WARN_ON(__mnt_is_readonly(nd->mnt));
 		if (IS_RDONLY(inode) &&
 		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
 			return -EROFS;
diff -puN fs/namespace.c~numa_mnt_want_write fs/namespace.c
--- lxc/fs/namespace.c~numa_mnt_want_write	2007-06-22 10:12:46.000000000 -0700
+++ lxc-dave/fs/namespace.c	2007-06-22 10:21:39.000000000 -0700
@@ -17,6 +17,7 @@
 #include <linux/quotaops.h>
 #include <linux/acct.h>
 #include <linux/capability.h>
+#include <linux/cpumask.h>
 #include <linux/module.h>
 #include <linux/sysfs.h>
 #include <linux/seq_file.h>
@@ -51,6 +52,8 @@ static inline unsigned long hash(struct
 	return tmp & hash_mask;
 }
 
+#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
+
 struct vfsmount *alloc_vfsmnt(const char *name)
 {
 	struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -64,6 +67,7 @@ struct vfsmount *alloc_vfsmnt(const char
 		INIT_LIST_HEAD(&mnt->mnt_share);
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
+		atomic_set(&mnt->__mnt_writers, 0);
 		if (name) {
 			int size = strlen(name) + 1;
 			char *newname = kmalloc(size, GFP_KERNEL);
@@ -76,19 +80,149 @@ struct vfsmount *alloc_vfsmnt(const char
 	return mnt;
 }
 
-int mnt_want_write(struct vfsmount *mnt)
+
+struct mnt_writer {
+	/*
+	 * If holding multiple instances of this lock, they
+	 * must be ordered by cpu number.
+	 */
+	spinlock_t lock;
+	unsigned long count;
+	struct vfsmount *mnt;
+} ____cacheline_aligned_in_smp;
+DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+
+static int __init init_mnt_writers(void)
 {
-	if (__mnt_is_readonly(mnt))
-		return -EROFS;
+	int cpu;
+	for_each_possible_cpu(cpu) {
+		struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
+		spin_lock_init(&writer->lock);
+		writer->count = 0;
+	}
 	return 0;
 }
+fs_initcall(init_mnt_writers);
+
+static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
+{
+	if (!cpu_writer->mnt)
+		return;
+	atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
+	cpu_writer->count = 0;
+}
+
+int mnt_want_write(struct vfsmount *mnt)
+{
+	int ret = 0;
+	struct mnt_writer *cpu_writer;
+
+	cpu_writer = &get_cpu_var(mnt_writers);
+	spin_lock(&cpu_writer->lock);
+	if (__mnt_is_readonly(mnt)) {
+		ret = -EROFS;
+		goto out;
+	}
+	if (cpu_writer->mnt != mnt) {
+		__clear_mnt_count(cpu_writer);
+		cpu_writer->mnt = mnt;
+	}
+	cpu_writer->count++;
+out:
+	spin_unlock(&cpu_writer->lock);
+	put_cpu_var(mnt_writers);
+	return ret;
+}
 EXPORT_SYMBOL_GPL(mnt_want_write);
 
+static void lock_and_coalesce_cpu_mnt_writer_counts(void)
+{
+	int cpu;
+	struct mnt_writer *cpu_writer;
+
+	for_each_possible_cpu(cpu) {
+		cpu_writer = &per_cpu(mnt_writers, cpu);
+		spin_lock(&cpu_writer->lock);
+		__clear_mnt_count(cpu_writer);
+		cpu_writer->mnt = NULL;
+	}
+}
+
+static void mnt_unlock_cpus(void)
+{
+	int cpu;
+	struct mnt_writer *cpu_writer;
+
+	for_each_possible_cpu(cpu) {
+		cpu_writer = &per_cpu(mnt_writers, cpu);
+		spin_unlock(&cpu_writer->lock);
+	}
+}
+
+/*
+ * These per-cpu write counts are not guaranteed to have
+ * matched increments and decrements on any given cpu.
+ * A file open()ed for write on one cpu and close()d on
+ * another cpu will imbalance this count.  Make sure it
+ * does not get too far out of whack.
+ */
+static void handle_write_count_underflow(struct vfsmount *mnt)
+{
+	/*
+	 * This should be fast the vast majority
+	 * of the time because everyone will only
+	 * be reading it and will share cachelines.
+	 */
+	while (atomic_read(&mnt->__mnt_writers) <
+	       MNT_WRITER_UNDERFLOW_LIMIT) {
+		/*
+		 * It isn't necessary to hold all of the locks
+		 * at the same time, but doing it this way makes
+		 * us share a lot more code.
+		 */
+		lock_and_coalesce_cpu_mnt_writer_counts();
+		mnt_unlock_cpus();
+	}
+}
+
 void mnt_drop_write(struct vfsmount *mnt)
 {
+	struct mnt_writer *cpu_writer;
+
+	cpu_writer = &get_cpu_var(mnt_writers);
+	spin_lock(&cpu_writer->lock);
+	put_cpu_var(mnt_writers);
+	if (cpu_writer->count > 0)
+		cpu_writer->count--;
+	else
+		atomic_dec(&mnt->__mnt_writers);
+	spin_unlock(&cpu_writer->lock);
+	handle_write_count_underflow(mnt);
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
 
+int mnt_make_readonly(struct vfsmount *mnt)
+{
+	int ret = 0;
+
+	lock_and_coalesce_cpu_mnt_writer_counts();
+	/*
+	 * With all the locks held, this value is stable
+	 */
+	if (atomic_read(&mnt->__mnt_writers) > 0) {
+		ret = -EBUSY;
+		goto out;
+	}
+	/*
+	 * actually set mount's r/o flag here to make
+	 * __mnt_is_readonly() true, which keeps anyone
+	 * from doing a successful mnt_want_write().
	 */
+out:
+	mnt_unlock_cpus();
+	return ret;
+}
+
 int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
 {
 	mnt->mnt_sb = sb;
diff -puN include/linux/mount.h~numa_mnt_want_write include/linux/mount.h
--- lxc/include/linux/mount.h~numa_mnt_want_write	2007-06-22 10:12:46.000000000 -0700
+++ lxc-dave/include/linux/mount.h	2007-06-22 10:21:39.000000000 -0700
@@ -14,6 +14,7 @@
 
 #include <linux/types.h>
 #include <linux/list.h>
+#include <linux/nodemask.h>
 #include <linux/spinlock.h>
 #include <asm/atomic.h>
 
@@ -61,6 +62,14 @@ struct vfsmount {
 	atomic_t mnt_count;
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	int mnt_pinned;
+
+	/*
+	 * This value is not stable unless all of the
+	 * mnt_writers[] spinlocks are held, and all
+	 * mnt_writer[]s on this mount have 0 as
+	 * their ->count
+	 */
+	atomic_t __mnt_writers;
 };
 
 static inline struct vfsmount *mntget(struct vfsmount *mnt)
_
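
For reference, here is the rough sketch promised above of the kind of
worst-case loop the benchmark runs.  This is only an illustration of
what the text describes, not the actual openbench.c: the mount points
(/mnt/a, /mnt/b), the iteration count, and the fork()/sched_setaffinity()
plumbing are all made up here, and the real benchmark runs for a couple
of seconds rather than a fixed number of iterations.

/*
 * Illustrative sketch only -- not the real openbench.c.  Two children,
 * pinned to different cpus, each do open/write/close in a loop on a
 * file that is assumed to live on a *different* mount, which defeats
 * the "same mount on this cpu" optimization described above.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/wait.h>

static void open_write_close_loop(const char *path, int cpu, long iterations)
{
	cpu_set_t mask;
	long i;

	/* pin this child to one cpu so the loop stays cpu-local */
	CPU_ZERO(&mask);
	CPU_SET(cpu, &mask);
	if (sched_setaffinity(0, sizeof(mask), &mask) < 0)
		perror("sched_setaffinity");

	for (i = 0; i < iterations; i++) {
		int fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
		if (fd < 0) {
			perror(path);
			exit(1);
		}
		/* the "3 bytes" mentioned above */
		if (write(fd, "foo", 3) != 3)
			perror("write");
		close(fd);
	}
}

int main(void)
{
	/* hypothetical paths: point these at files in two different mounts */
	const char *files[2] = { "/mnt/a/testfile", "/mnt/b/testfile" };
	int cpu;

	for (cpu = 0; cpu < 2; cpu++) {
		if (fork() == 0) {
			open_write_close_loop(files[cpu], cpu, 1000000);
			exit(0);
		}
	}
	while (wait(NULL) > 0)
		;
	return 0;
}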