Ok, this attached patch is so absolutely disgusting that it is almost a work of art.

I spent some time last week doing arm64 profiles, and on the loads I tested I saw my old enemy __d_lookup_rcu(). The hash table lookup ends up being expensive. Not a huge surprise.

That said, the expense of the hash table lookup is only partially the memory accesses of the hashtable itself. A noticeable part of the cost is in looking up the address of the hash table. That annoys me. It has annoyed me before. It's a "runtime constant". In fact, it's two runtime constants: the address of the hash table, and the shift count that turns the dentry name hash into the index into that hash table (approximates a mask). It's disgusting having the profile point at the "load constant from memory".

Peter Anvin at some point had a rather complex patch to do "constant alternatives". I couldn't find it, but I didn't search very hard, because I remembered it being pretty significant in size, and I went "how hard can it be".

Now, I did the profiling on arm64, but when it came to rewriting instructions I went back to x86-64, just because while I'm trying to get better at reading arm64 asm, I don't want to deal with the pain of huge constants (and a very slow boot for testing).

I'm posting this disgusting patch here because I need to take a break from this insanity, and maybe somebody else is interested.

And yes, this needs to be behind some "CONFIG_RUNTIME_CONSTANTS" config variable, with a fallback to the same old code.

And yes, that static_shift_right_32() thing is odd. It takes and returns an 'unsigned long', but then operates on the low 32 bits of it and clears the upper 32 bits (on 64-bit architectures). That's purely because those are the semantics that let x86-64 code generation turn the whole op into just a single instruction.

The static_const_init() sizes are also hardcoded, "knowing" what the layout is.

So this is all just a truly disgusting tech demo, but it generates very pretty code in __d_lookup_rcu(). Tested in the sense that it works for me in one particular configuration using clang. The code from gcc looks fine to me too, but that's from just a quick "let's check".

Actually extending this to arm64 (and possibly other architectures) would need some more cleanups, and abstracting this all a bit more.

I didn't look at whether other core kernel code might want to use this; I was literally just concentrating on making __d_lookup_rcu() look pretty (and you need to get rid of debug build options for it to do that).
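Just to show what I mean by "pretty": here's roughly the shape of what d_hash() generates on x86-64 before and after. This is a hand-waved sketch, not actual compiler output, so don't trust the exact registers or instruction scheduling:

	# Before: two loads from memory just to find the hash chain
	movl d_hash_shift(%rip),%ecx		# load the shift count
	shrl %cl,%edi				# hash >>= d_hash_shift
	movq dentry_hashtable(%rip),%rax	# load the hashtable address
	leaq (%rax,%rdi,8),%rax			# chain head

	# After: both constants are immediates, rewritten at boot
	movabsq $0x0123456789abcdef,%rax	# imm64 patched to dentry_hashtable
	shrl $12,%edi				# the '$12' byte patched to d_hash_shift
	leaq (%rax,%rdi,8),%rax			# chain head

And for that CONFIG_RUNTIME_CONSTANTS fallback, the generic wrappers really would be trivial - something like this (entirely untested sketch, with the u32 cast there to match the "operate on the low 32 bits, clear the upper bits" semantics described above):

	/* Generic fallback: just do the memory loads like the old code */
	#define static_long(sym)	((long)(sym))
	#define static_ptr(x)		(x)
	#define static_shift_right_32(val, sym) \
		((unsigned long)((u32)(val) >> (sym)))

              Linus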
From 7e90fdee6dba0679b9a9010badbebe5850d3e2f3 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Date: Tue, 4 Jun 2024 12:30:02 -0700
Subject: [PATCH] x86: start on 'static_const' infrastructure

This is very hacky indeed
---
 arch/x86/include/asm/static_const.h | 32 +++++++++++++++++
 fs/dcache.c                         | 55 +++++++++++++++++++++++++----
 include/asm-generic/vmlinux.lds.h   | 13 +++++++
 3 files changed, 93 insertions(+), 7 deletions(-)
 create mode 100644 arch/x86/include/asm/static_const.h

diff --git a/arch/x86/include/asm/static_const.h b/arch/x86/include/asm/static_const.h
new file mode 100644
index 000000000000..9b1560f32955
--- /dev/null
+++ b/arch/x86/include/asm/static_const.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_STATIC_CONST_H
+#define _ASM_STATIC_CONST_H
+
+#define static_long(sym) ({ \
+	long __ret; \
+	asm("movq %1,%0\n1:\n" \
+		".pushsection .static_const.sz%c2,\"a\"\n\t" \
+		".long 1b - %c2 - .\n\t" \
+		".long (" #sym ") - .\n" \
+		".popsection" \
+		:"=r" (__ret) \
+		:"i" ((unsigned long)0x0123456789abcdefull), \
+		 "i" (sizeof(long))); \
+	__ret; })
+
+#define static_ptr(x) ((typeof(x))static_long(x))
+
+// The 'typeof' will create at _least_ a 32-bit type, but
+// will happily also take a bigger type and the 'shrl' will
+// clear the upper bits
+#define static_shift_right_32(val, sym) ({ \
+	typeof(0u+(val)) __ret = (val); \
+	asm("shrl $12,%k0\n1:\n" \
+		".pushsection .static_const.sz1,\"a\" \n\t" \
+		".long 1b - 1 - .\n\t" \
+		".long (" #sym ") - .\n" \
+		".popsection" \
+		:"+r" (__ret)); \
+	__ret; })
+
+#endif
diff --git a/fs/dcache.c b/fs/dcache.c
index 407095188f83..b960b83dfa8a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -100,9 +100,12 @@ static unsigned int d_hash_shift __ro_after_init;
 
 static struct hlist_bl_head *dentry_hashtable __ro_after_init;
 
-static inline struct hlist_bl_head *d_hash(unsigned int hash)
+#include <asm/static_const.h>
+
+static inline struct hlist_bl_head *d_hash(unsigned long hashlen)
 {
-	return dentry_hashtable + (hash >> d_hash_shift);
+	return static_ptr(dentry_hashtable) +
+		static_shift_right_32(hashlen, d_hash_shift);
 }
 
 #define IN_LOOKUP_SHIFT 10
@@ -495,7 +498,7 @@ static void ___d_drop(struct dentry *dentry)
 	if (unlikely(IS_ROOT(dentry)))
 		b = &dentry->d_sb->s_roots;
 	else
-		b = d_hash(dentry->d_name.hash);
+		b = d_hash(dentry->d_name.hash_len);
 
 	hlist_bl_lock(b);
 	__hlist_bl_del(&dentry->d_hash);
@@ -2104,7 +2107,7 @@ static noinline struct dentry *__d_lookup_rcu_op_compare(
 	unsigned *seqp)
 {
 	u64 hashlen = name->hash_len;
-	struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen));
+	struct hlist_bl_head *b = d_hash(hashlen);
 	struct hlist_bl_node *node;
 	struct dentry *dentry;
 
@@ -2171,7 +2174,7 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent,
 {
 	u64 hashlen = name->hash_len;
 	const unsigned char *str = name->name;
-	struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen));
+	struct hlist_bl_head *b = d_hash(hashlen);
 	struct hlist_bl_node *node;
 	struct dentry *dentry;
 
@@ -2277,7 +2280,7 @@ EXPORT_SYMBOL(d_lookup);
 struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
 {
 	unsigned int hash = name->hash;
-	struct hlist_bl_head *b = d_hash(hash);
+	struct hlist_bl_head *b = d_hash(name->hash_len);
 	struct hlist_bl_node *node;
 	struct dentry *found = NULL;
 	struct dentry *dentry;
@@ -2397,7 +2400,7 @@ EXPORT_SYMBOL(d_delete);
 
 static void __d_rehash(struct dentry *entry)
 {
-	struct hlist_bl_head *b = d_hash(entry->d_name.hash);
+	struct hlist_bl_head *b = d_hash(entry->d_name.hash_len);
 
 	hlist_bl_lock(b);
 	hlist_bl_add_head_rcu(&entry->d_hash, b);
@@ -3110,6 +3113,38 @@ static int __init set_dhash_entries(char *str)
 }
 __setup("dhash_entries=", set_dhash_entries);
 
+#define static_const_init(sz, symbol) \
+	static_const_init_value(sz, &symbol, (unsigned long)(symbol))
+
+#define static_const_init_value(sz, addr, val) do { \
+	extern s32 __static_const_sz##sz; \
+	extern s32 __static_const_end_sz##sz; \
+	static_const_fixup(sz, addr, val, \
+		&__static_const_sz##sz, \
+		&__static_const_end_sz##sz); \
+} while (0)
+
+#include <asm/text-patching.h>
+
+static void static_const_fixup(unsigned size,
+	void *addr, unsigned long val,
+	s32 *start, s32 *end)
+{
+	while (start < end) {
+		unsigned long where, sym;
+
+		where = start[0] + (unsigned long)(start+0);
+		sym = start[1] + (unsigned long)(start+1);
+		start += 2;
+
+		if (sym != (unsigned long)addr)
+			continue;
+
+		// HACK HACK HACK. This is little-endian only
+		text_poke_early((void *)where, &val, size);
+	}
+}
+
 static void __init dcache_init_early(void)
 {
 	/* If hashes are distributed across NUMA nodes, defer
@@ -3129,6 +3164,9 @@ static void __init dcache_init_early(void)
 					0,
 					0);
 	d_hash_shift = 32 - d_hash_shift;
+
+	static_const_init(1, d_hash_shift);
+	static_const_init(8, dentry_hashtable);
 }
 
 static void __init dcache_init(void)
@@ -3157,6 +3195,9 @@ static void __init dcache_init(void)
 					0,
 					0);
 	d_hash_shift = 32 - d_hash_shift;
+
+	static_const_init(1, d_hash_shift);
+	static_const_init(8, dentry_hashtable);
 }
 
 /* SLAB cache for __getname() consumers */
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 5703526d6ebf..31c167b4252d 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -944,6 +944,18 @@
 #define CON_INITCALL \
 	BOUNDED_SECTION_POST_LABEL(.con_initcall.init, __con_initcall, _start, _end)
 
+#define STATIC_CONST_SIZE(s) \
+	__static_const_sz##s = .; \
+	KEEP(*(.static_const.sz##s)) \
+	__static_const_end_sz##s = .;
+
+#define STATIC_CONSTS \
+	. = ALIGN(8); \
+	STATIC_CONST_SIZE(1) \
+	STATIC_CONST_SIZE(2) \
+	STATIC_CONST_SIZE(4) \
+	STATIC_CONST_SIZE(8)
+
 /* Alignment must be consistent with (kunit_suite *) in include/kunit/test.h */
 #define KUNIT_TABLE()						\
 	. = ALIGN(8);						\
@@ -1160,6 +1172,7 @@
 	.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {	\
 		INIT_DATA					\
 		INIT_SETUP(initsetup_align)			\
+		STATIC_CONSTS					\
 		INIT_CALLS					\
 		CON_INITCALL					\
 		INIT_RAM_FS					\
-- 
2.45.1.209.gc6f12300df