Re: Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?)

Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> · Sun, 4 Mar 2012 21:38:31 -0800

On Sun, Mar 4, 2012 at 7:58 PM, Jason Garrett-Glaser <jason@xxxxxxxx> wrote:
>
> There is an improvement you can make to this.  "bsf" is microcoded in
> many future CPUs (e.g. Piledriver) in favor of tzcnt, which has
> slightly different flag behavior and no undefined behavior and is part
> of BMI1.

So I've gotten rid of 'bsf' because it really does have problems on
many CPU's. It's disgustingly slow on some older CPU's.

I asked around on G+ to see if that would be useful, and there's a
nice simple four-instruction sequence for the 32-bit case using just
trivial operations (one shift, one and, a couple of adds).

For the 64-bit case, the bsf can be replaced with a single multiply
and shift. The bsf is still better on some CPU's, but the single
multiply and shift is more consistently good - and as long as it
doesn't stall the CPU, we're good, because the end result of it all
won't be used until several cycles later.

So my current patch is attached - it does depend on the current -git
tree having moved dentry_cmp() into fs/dcache.c, so it's on top of
*tonights* -git tree, but this is something I'm pretty happy with, and
was planning on actually committing early in the 3.4 merge window.

My profiling seems to show that the multiply is pretty much free on
64-bit at least on the cpu's I have access to - it's not like a
multiply is free, but I do suspect it gets hidden very well by any OoO
instruction scheduling.

A bit-count instructions (popcount or bsf or tzcnt is obviously in
*theory* less work than a 64-bit multiply, but the multiply is
"portable". Even if it isn't optimal, it shouldn't be horrible on any
64-bit capable x86 CPU, and it also means (for example) that the code
might even work on non-x86 chips.

I did only very limited profiling of the 32-bit case, but it's really
just four cheap ALU instructions there and didn't really show up at
all in the limited profiles I did. And at least I checked that the
code worked. I have to say that the advantage of "vectorizing" this
code is obviously much less if you can only do 4-byte "vectors", so I
didn't actually time whether the patch *improves* anything on x86-32.

                                     Linus
 arch/x86/Kconfig |    1 +
 fs/Kconfig       |    4 ++
 fs/dcache.c      |   23 ++++++++++
 fs/namei.c       |  129 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 157 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5bed94e189fa..09675d3e0ac3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -82,6 +82,7 @@ config X86
 	select CLKEVT_I8253
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select GENERIC_IOMAP
+	select DCACHE_WORD_ACCESS if !DEBUG_PAGEALLOC
 
 config INSTRUCTION_DECODER
 	def_bool (KPROBES || PERF_EVENTS)
diff --git a/fs/Kconfig b/fs/Kconfig
index d621f02a3f9e..aa195265362f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -4,6 +4,10 @@
 
 menu "File systems"
 
+# Use unaligned word dcache accesses
+config DCACHE_WORD_ACCESS
+       bool
+
 if BLOCK
 
 source "fs/ext2/Kconfig"
diff --git a/fs/dcache.c b/fs/dcache.c
index bcbdb33fcc20..ffd47a16d870 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -144,6 +144,28 @@ int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
 static inline int dentry_cmp(const unsigned char *cs, size_t scount,
 				const unsigned char *ct, size_t tcount)
 {
+#ifdef CONFIG_DCACHE_WORD_ACCESS
+	unsigned long a,b,mask;
+
+	if (unlikely(scount != tcount))
+		return 1;
+
+	for (;;) {
+		a = *(unsigned long *)cs;
+		b = *(unsigned long *)ct;
+		if (tcount < sizeof(unsigned long))
+			break;
+		if (unlikely(a != b))
+			return 1;
+		cs += sizeof(unsigned long);
+		ct += sizeof(unsigned long);
+		tcount -= sizeof(unsigned long);
+		if (!tcount)
+			return 0;
+	}
+	mask = ~(~0ul << tcount*8);
+	return unlikely(!!((a ^ b) & mask));
+#else
 	if (scount != tcount)
 		return 1;
 
@@ -155,6 +177,7 @@ static inline int dentry_cmp(const unsigned char *cs, size_t scount,
 		tcount--;
 	} while (tcount);
 	return 0;
+#endif
 }
 
 static void __d_free(struct rcu_head *head)
diff --git a/fs/namei.c b/fs/namei.c
index e2ba62820a0f..556778cd8b87 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1374,6 +1374,133 @@ static inline int can_lookup(struct inode *inode)
 	return 1;
 }
 
+/*
+ * We can do the critical dentry name comparison and hashing
+ * operations one word at a time, but we are limited to:
+ *
+ * - Architectures with fast unaligned word accesses. We could
+ *   do a "get_unaligned()" if this helps and is sufficiently
+ *   fast.
+ *
+ * - Little-endian machines (so that we can generate the mask
+ *   of low bytes efficiently). Again, we *could* do a byte
+ *   swapping load on big-endian architectures if that is not
+ *   expensive enough to make the optimization worthless.
+ *
+ * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
+ *   do not trap on the (extremely unlikely) case of a page
+ *   crossing operation.
+ *
+ * - Furthermore, we need an efficient 64-bit compile for the
+ *   64-bit case in order to generate the "number of bytes in
+ *   the final mask". Again, that could be replaced with a
+ *   efficient population count instruction or similar.
+ */
+#ifdef CONFIG_DCACHE_WORD_ACCESS
+
+#ifdef CONFIG_64BIT
+
+/*
+ * Jan Achrenius on G+: microoptimized version of
+ * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56"
+ * that works for the bytemasks without having to
+ * mask them first.
+ */
+static inline long count_masked_bytes(unsigned long mask)
+{
+	return mask*0x0001020304050608 >> 56;
+}
+
+static inline unsigned int fold_hash(unsigned long hash)
+{
+	hash += hash >> (8*sizeof(int));
+	return hash;
+}
+
+#else	/* 32-bit case */
+
+/* Modified Carl Chatfield G+ version for 32-bit */
+static inline long count_masked_bytes(long mask)
+{
+	/*
+	 * (a) gives us
+	 *   -1 (0, ff), 0 (ffff) or 1 (ffffff)
+	 * (b) gives us
+	 *   0 for 0, 1 for (ff ffff ffffff)
+	 * (a+b+1) gives us
+	 *   correct 0-3 bytemask count result
+	 */
+	long a = (mask-256) >> 23;
+	long b = mask & 1;
+	return a + b + 1;
+}
+
+#define fold_hash(x) (x)
+
+#endif
+
+unsigned int full_name_hash(const unsigned char *name, unsigned int len)
+{
+	unsigned long a, mask;
+	unsigned long hash = 0;
+
+	for (;;) {
+		a = *(unsigned long *)name;
+		hash *= 9;
+		if (len < sizeof(unsigned long))
+			break;
+		hash += a;
+		name += sizeof(unsigned long);
+		len -= sizeof(unsigned long);
+		if (!len)
+			goto done;
+	}
+	mask = ~(~0ul << len*8);
+	hash += mask & a;
+done:
+	return fold_hash(hash);
+}
+EXPORT_SYMBOL(full_name_hash);
+
+#define ONEBYTES	0x0101010101010101ul
+#define SLASHBYTES	0x2f2f2f2f2f2f2f2ful
+#define HIGHBITS	0x8080808080808080ul
+
+/* Return the high bit set in the first byte that is a zero */
+static inline unsigned long has_zero(unsigned long a)
+{
+	return ((a - ONEBYTES) & ~a) & HIGHBITS;
+}
+
+/*
+ * Calculate the length and hash of the path component, and
+ * return the length of the component;
+ */
+static inline unsigned long hash_name(const char *name, unsigned int *hashp)
+{
+	unsigned long a, mask, hash, len;
+
+	hash = a = 0;
+	len = -sizeof(unsigned long);
+	do {
+		hash = (hash + a) * 9;
+		len += sizeof(unsigned long);
+		a = *(unsigned long *)(name+len);
+		/* Do we have any NUL or '/' bytes in this word? */
+		mask = has_zero(a) | has_zero(a ^ SLASHBYTES);
+	} while (!mask);
+
+	/* The mask *below* the first high bit set */
+	mask = (mask - 1) & ~mask;
+	mask >>= 7;
+	hash += a & mask;
+	*hashp = fold_hash(hash);
+
+	return len + count_masked_bytes(mask);
+}
+
+#else
+
 unsigned int full_name_hash(const unsigned char *name, unsigned int len)
 {
 	unsigned long hash = init_name_hash();
@@ -1402,6 +1529,8 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp)
 	return len;
 }
 
+#endif
+
 /*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into