Horrendous "runtime constant" hack - current patch x86-64 only

Ok, this attached patch is so absolutely disgusting that it is almost
a work of art.

I spent some time last week doing arm64 profiles, and on the loads I
tested, I saw my old enemy __d_lookup_rcu(). The hash table lookup
ends up being expensive. Not a huge surprise.

That said, the expense of the hash table lookup is only partly the
memory accesses of the hash table itself. A noticeable part of the
cost is simply looking up the address of the hash table.

That annoys me. It has annoyed me before. It's a "runtime constant".
In fact, it's two runtime constants: the address of the hash table,
and the shift count that turns the dentry name hash into the index
into the hash table (approximates a mask).

It's disgusting having the profile point to the "load constant from memory".
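
To make it concrete, the current code is just

	static inline struct hlist_bl_head *d_hash(unsigned int hash)
	{
		/* one load for 'd_hash_shift', one for 'dentry_hashtable':
		 * both are __ro_after_init, but still real memory accesses */
		return dentry_hashtable + (hash >> d_hash_shift);
	}

and the profile ends up pointing right at those two constant loads.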

Peter Anvin at some point had a rather complex patch to do
"constant alternatives". I couldn't find it, but I didn't search very
hard because I remembered it being pretty significant in size, and I
went "how hard can it be".

Now, I did the profiling on arm64, but when it came to rewriting
instructions I went back to x86-64, simply because while I'm trying to
get better at reading arm64 asm, I didn't want to deal with the pain
of huge constants (and a very slow boot for testing).

I'm posting this disgusting patch here because I need to take a break
from this insanity, and maybe somebody else is interested.

And yes, this needs to be behind some "CONFIG_RUNTIME_CONSTANTS"
config variable, with a fallback to the same old code.
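
A minimal sketch of what that fallback could look like (hypothetical,
not part of this patch - only the names come from it):

	/* !CONFIG_RUNTIME_CONSTANTS: plain C with the same semantics */
	#define static_ptr(x)		(x)
	#define static_shift_right_32(val, sym) \
		((unsigned long)((u32)(val) >> (sym)))
	#define static_const_init(sz, symbol)	do { } while (0)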

And yes, that static_shift_right_32() thing is odd. It takes and
returns an 'unsigned long', but then operates on the low 32 bits of
it, and clears the upper 32 bits (on 64-bit architectures). That's
purely because that is what x86-64 code generation wants in order to
turn the whole op into just a single instruction.
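
Semantically it behaves roughly like this plain-C equivalent (my
paraphrase of the asm, not code from the patch):

	static inline unsigned long shift_right_32(unsigned long val,
						   unsigned int shift)
	{
		/* only the low 32 bits matter; the result zero-extends */
		return (u32)val >> shift;
	}

and on x86-64 that really is a single 'shrl', because 32-bit
operations implicitly clear the upper half of the register.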

The static_const_init() sizes are also hardcoded, "knowing" what the layout is.
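
For this use the two fixup records are (see the patch below)

	static_const_init(1, d_hash_shift);	/* 1-byte imm of the 'shrl' */
	static_const_init(8, dentry_hashtable);	/* 8-byte imm of the 'movq' */

where the size argument has to match what the instruction encoding
actually has at that spot.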

So this is all just a truly disgusting tech demo, but it generates
very pretty code in __d_lookup_rcu().
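
Roughly speaking (hand-written illustration, not exact compiler
output), the hash head computation ends up as

	movq $0x0123456789abcdef,%rax	# imm64 patched to dentry_hashtable
	shrl $12,%esi			# imm8 patched to d_hash_shift
	leaq (%rax,%rsi,8),%rbx		# the hash chain head

with no data loads at all for the two runtime constants.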

Tested in the sense that it works for me in one particular
configuration using clang. The code from gcc looks fine to me too,
but that's just from a quick "let's check".

Actually extending this to arm64 (and possibly other architectures)
would need some more cleanups and more abstraction. I didn't look at
whether other core kernel code might want to use this; I was literally
just concentrating on making __d_lookup_rcu() look pretty (and you
need to get rid of the debug build options for it to do that).

               Linus
From 7e90fdee6dba0679b9a9010badbebe5850d3e2f3 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Date: Tue, 4 Jun 2024 12:30:02 -0700
Subject: [PATCH] x86: start on 'static_const' infrastructure

This is very hacky indeed
---
 arch/x86/include/asm/static_const.h | 32 +++++++++++++++++
 fs/dcache.c                         | 55 +++++++++++++++++++++++++----
 include/asm-generic/vmlinux.lds.h   | 13 +++++++
 3 files changed, 93 insertions(+), 7 deletions(-)
 create mode 100644 arch/x86/include/asm/static_const.h

diff --git a/arch/x86/include/asm/static_const.h b/arch/x86/include/asm/static_const.h
new file mode 100644
index 000000000000..9b1560f32955
--- /dev/null
+++ b/arch/x86/include/asm/static_const.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_STATIC_CONST_H
+#define _ASM_STATIC_CONST_H
+
+#define static_long(sym) ({					\
+	long __ret;						\
+	asm("movq %1,%0\n1:\n"					\
+		".pushsection .static_const.sz%c2,\"a\"\n\t"	\
+		".long 1b - %c2 - .\n\t"			\
+		".long (" #sym ") - .\n"			\
+		".popsection"					\
+		:"=r" (__ret)					\
+		:"i" ((unsigned long)0x0123456789abcdefull),	\
+		 "i" (sizeof(long)));				\
+	__ret; })
+
+#define static_ptr(x) ((typeof(x))static_long(x))
+
+// The 'typeof' will create at _least_ a 32-bit type, but
+// will happily also take a bigger type and the 'shrl' will
+// clear the upper bits
+#define static_shift_right_32(val, sym) ({			\
+	typeof(0u+(val)) __ret = (val);				\
+	asm("shrl $12,%k0\n1:\n"				\
+		".pushsection .static_const.sz1,\"a\" \n\t"	\
+		".long 1b - 1 - .\n\t"				\
+		".long (" #sym ") - .\n"			\
+		".popsection"					\
+		:"+r" (__ret));					\
+	__ret; })
+
+#endif
diff --git a/fs/dcache.c b/fs/dcache.c
index 407095188f83..b960b83dfa8a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -100,9 +100,12 @@ static unsigned int d_hash_shift __ro_after_init;
 
 static struct hlist_bl_head *dentry_hashtable __ro_after_init;
 
-static inline struct hlist_bl_head *d_hash(unsigned int hash)
+#include <asm/static_const.h>
+
+static inline struct hlist_bl_head *d_hash(unsigned long hashlen)
 {
-	return dentry_hashtable + (hash >> d_hash_shift);
+	return static_ptr(dentry_hashtable) +
+		static_shift_right_32(hashlen, d_hash_shift);
 }
 
 #define IN_LOOKUP_SHIFT 10
@@ -495,7 +498,7 @@ static void ___d_drop(struct dentry *dentry)
 	if (unlikely(IS_ROOT(dentry)))
 		b = &dentry->d_sb->s_roots;
 	else
-		b = d_hash(dentry->d_name.hash);
+		b = d_hash(dentry->d_name.hash_len);
 
 	hlist_bl_lock(b);
 	__hlist_bl_del(&dentry->d_hash);
@@ -2104,7 +2107,7 @@ static noinline struct dentry *__d_lookup_rcu_op_compare(
 	unsigned *seqp)
 {
 	u64 hashlen = name->hash_len;
-	struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen));
+	struct hlist_bl_head *b = d_hash(hashlen);
 	struct hlist_bl_node *node;
 	struct dentry *dentry;
 
@@ -2171,7 +2174,7 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent,
 {
 	u64 hashlen = name->hash_len;
 	const unsigned char *str = name->name;
-	struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen));
+	struct hlist_bl_head *b = d_hash(hashlen);
 	struct hlist_bl_node *node;
 	struct dentry *dentry;
 
@@ -2277,7 +2280,7 @@ EXPORT_SYMBOL(d_lookup);
 struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
 {
 	unsigned int hash = name->hash;
-	struct hlist_bl_head *b = d_hash(hash);
+	struct hlist_bl_head *b = d_hash(name->hash_len);
 	struct hlist_bl_node *node;
 	struct dentry *found = NULL;
 	struct dentry *dentry;
@@ -2397,7 +2400,7 @@ EXPORT_SYMBOL(d_delete);
 
 static void __d_rehash(struct dentry *entry)
 {
-	struct hlist_bl_head *b = d_hash(entry->d_name.hash);
+	struct hlist_bl_head *b = d_hash(entry->d_name.hash_len);
 
 	hlist_bl_lock(b);
 	hlist_bl_add_head_rcu(&entry->d_hash, b);
@@ -3110,6 +3113,38 @@ static int __init set_dhash_entries(char *str)
 }
 __setup("dhash_entries=", set_dhash_entries);
 
+#define static_const_init(sz, symbol) \
+	static_const_init_value(sz, &symbol, (unsigned long)(symbol))
+
+#define static_const_init_value(sz, addr, val) do {	\
+	extern s32 __static_const_sz##sz;		\
+	extern s32 __static_const_end_sz##sz;		\
+	static_const_fixup(sz, addr, val, 		\
+		&__static_const_sz##sz,			\
+		&__static_const_end_sz##sz);		\
+} while (0)
+
+#include <asm/text-patching.h>
+
+static void static_const_fixup(unsigned size,
+	void *addr, unsigned long val,
+	s32 *start, s32 *end)
+{
+	while (start < end) {
+		unsigned long where, sym;
+
+		where = start[0] + (unsigned long)(start+0);
+		sym = start[1] + (unsigned long)(start+1);
+		start += 2;
+
+		if (sym != (unsigned long)addr)
+			continue;
+
+		// HACK HACK HACK. This is little-endian only
+		text_poke_early((void *)where, &val, size);
+	}
+}
+
 static void __init dcache_init_early(void)
 {
 	/* If hashes are distributed across NUMA nodes, defer
@@ -3129,6 +3164,9 @@ static void __init dcache_init_early(void)
 					0,
 					0);
 	d_hash_shift = 32 - d_hash_shift;
+
+	static_const_init(1, d_hash_shift);
+	static_const_init(8, dentry_hashtable);
 }
 
 static void __init dcache_init(void)
@@ -3157,6 +3195,9 @@ static void __init dcache_init(void)
 					0,
 					0);
 	d_hash_shift = 32 - d_hash_shift;
+
+	static_const_init(1, d_hash_shift);
+	static_const_init(8, dentry_hashtable);
 }
 
 /* SLAB cache for __getname() consumers */
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 5703526d6ebf..31c167b4252d 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -944,6 +944,18 @@
 #define CON_INITCALL							\
 	BOUNDED_SECTION_POST_LABEL(.con_initcall.init, __con_initcall, _start, _end)
 
+#define STATIC_CONST_SIZE(s)						\
+	__static_const_sz##s = .;					\
+	KEEP(*(.static_const.sz##s))					\
+	__static_const_end_sz##s = .;
+
+#define STATIC_CONSTS							\
+	. = ALIGN(8);							\
+	STATIC_CONST_SIZE(1)						\
+	STATIC_CONST_SIZE(2)						\
+	STATIC_CONST_SIZE(4)						\
+	STATIC_CONST_SIZE(8)
+
 /* Alignment must be consistent with (kunit_suite *) in include/kunit/test.h */
 #define KUNIT_TABLE()							\
 		. = ALIGN(8);						\
@@ -1160,6 +1172,7 @@
 	.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {		\
 		INIT_DATA						\
 		INIT_SETUP(initsetup_align)				\
+		STATIC_CONSTS						\
 		INIT_CALLS						\
 		CON_INITCALL						\
 		INIT_RAM_FS						\
-- 
2.45.1.209.gc6f12300df

