On 2024/2/6 02:34, Yonghong Song wrote:
>
> On 2/5/24 10:18 AM, Andrii Nakryiko wrote:
>> On Sun, Feb 4, 2024 at 11:20 AM Yonghong Song
>> <yonghong.song@xxxxxxxxx> wrote:
>>>
>>> On 2/2/24 2:18 PM, Andrii Nakryiko wrote:
>>>> On Wed, Jan 31, 2024 at 7:56 AM Leon Hwang <hffilwlqm@xxxxxxxxx> wrote:
>>>>> This patchset introduces a new generic kfunc bpf_ffs64(). This kfunc
>>>>> allows bpf to reuse kernel's __ffs64() function to improve ffs
>>>>> performance in bpf.
>>>>>
>>>> The downside of using kfunc for this is that the compiler will assume
>>>> that R1-R5 have to be spilled/filled, because that's function call
>>>> convention in BPF.
>>>>
>>>> If this was an instruction, though, it would be much more efficient
>>>> and would avoid this problem. But I see how something like ffs64 is
>>>> useful. I think it would be good to also have popcnt instruction and a
>>>> few other fast bit manipulation operations as well.
>>>>
>>>> Perhaps we should think about another BPF ISA extension to add fast
>>>> bit manipulation instructions?
>>> Sounds a good idea to start the conversion. Besides popcnt, lzcnt
>>> is also a candidate. From llvm perspective, it would be hard to
>>> generate ffs64/popcnt/lzcnt etc. from source generic implementation.
>> I'm curious why? I assumed that if a user used __builtin_popcount()
>> Clang could just generate BPF's popcnt instruction (assuming the right
>> BPF cpu version is enabled, of course).
>
> Not aware of __builtin_popcount(). Yes, BPF backend should be able easily
> converts __builtin_popcount() to a BPF insn.
>
>>
>>> So most likely, inline asm will be used. libbpf could define
>>> some macros to make adoption easier. Verifier and JIT will do
>>> proper thing, either using corresponding arch insns directly or
>>> verifier will rewrite so JIT won't be aware of these insns.
> [...]

Sorry for the late reply. I was busy trying to fix a tailcall issue[0],
but, unfortunately, it seems hopeless to achieve that fix.

[0] bpf, x64: Fix tailcall hierarchy
https://lore.kernel.org/bpf/20240222085232.62483-1-hffilwlqm@xxxxxxxxx/

Adding fast bit manipulation instructions in another BPF ISA extension
sounds great to me. Assuming the right BPF cpu version is enabled, clang
should be able to generate ffs64/popcnt/lzcnt BPF insns for
__builtin_ffs64()/__builtin_popcount()/__builtin_clz().

In the meantime, I did a POC that JITs this kfunc bpf_ffs64(). JITing the
kfunc bpf_ffs64() works by making the JIT aware of the CPU features and
adding a 'BPF_ALU64|BPF_BITOPS' instruction.
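For completeness, the BPF-program side I have in mind looks roughly like
the sketch below. It is only illustrative: the section name, program type,
return codes and the zero guard are my assumptions, not part of the POC;
the only thing taken from the POC is the bpf_ffs64() kfunc itself.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* kfunc prototype is resolved via BTF; the declaration style here is
 * just illustrative
 */
extern unsigned long bpf_ffs64(__u64 word) __ksym;

SEC("tc")
int ffs64_demo(struct __sk_buff *skb)
{
        __u64 word = skb->len;
        unsigned long first_bit;

        /* __ffs64() is undefined for 0, so guard against it */
        if (!word)
                return 0; /* TC_ACT_OK */

        /* With the POC applied, the verifier rewrites this kfunc call
         * into a single BPF_ALU64 | BPF_BITOPS insn, so the x86 JIT
         * emits 'rep bsf' (tzcnt) directly instead of a real call.
         */
        first_bit = bpf_ffs64(word);

        return first_bit < 8 ? 0 /* TC_ACT_OK */ : 2 /* TC_ACT_SHOT */;
}

char LICENSE[] SEC("license") = "GPL";

The program itself only ever sees the kfunc; the new insn stays an
implementation detail of the verifier and the JIT.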
Here's the diff of the POC:

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index e1390d1e3..9cd552dd7 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -18,6 +18,7 @@
 #include <asm/text-patching.h>
 #include <asm/unwind.h>
 #include <asm/cfi.h>
+#include <asm/cpufeatures.h>
 
 static bool all_callee_regs_used[4] = {true, true, true, true};
 
@@ -1131,6 +1132,39 @@ static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
 	*pprog = prog;
 }
 
+static int emit_bitops(u8 **pprog, u32 bitops)
+{
+	u8 *prog = *pprog;
+
+	switch (bitops) {
+#ifdef X86_FEATURE_AVX2
+	case BPF_FFS64:
+		/* identical to tzcnt rax, rdi */
+		/* rep bsf rax, rdi */
+		EMIT1(0xF3);
+		EMIT4(0x48, 0x0F, 0xBC, 0xC7);
+		break;
+#endif
+#ifdef X86_FEATURE_XMM4_1
+	case BPF_POPCNT:
+		/* popcnt rax, rdi */
+		EMIT1(0xF3);
+		EMIT4(0x48, 0x0F, 0xB8, 0xC7);
+		break;
+	case BPF_LZCNT:
+		/* lzcnt rax, rdi */
+		EMIT1(0xF3);
+		EMIT4(0x48, 0x0F, 0xBD, 0xC7);
+		break;
+#endif
+	default:
+		return -EINVAL;
+	}
+
+	*pprog = prog;
+	return 0;
+}
+
 #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
 
 /* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */
@@ -1521,6 +1555,12 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 			}
 			break;
 
+		case BPF_ALU64 | BPF_BITOPS:
+			err = emit_bitops(&prog, insn->imm);
+			if (err)
+				return err;
+			break;
+
 			/* speculation barrier */
 		case BPF_ST | BPF_NOSPEC:
 			EMIT_LFENCE();
@@ -3145,6 +3185,11 @@ bool bpf_jit_supports_subprog_tailcalls(void)
 	return true;
 }
 
+bool bpf_jit_supports_bitops(void)
+{
+	return true;
+}
+
 void bpf_jit_free(struct bpf_prog *prog)
 {
 	if (prog->jited) {
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 36cc29a29..27ad34e20 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -959,6 +959,7 @@ bool bpf_jit_supports_kfunc_call(void);
 bool bpf_jit_supports_far_kfunc_call(void);
 bool bpf_jit_supports_exceptions(void);
 bool bpf_jit_supports_ptr_xchg(void);
+bool bpf_jit_supports_bitops(void);
 void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp),
 			 void *cookie);
 bool bpf_helper_changes_pkt_data(void *func);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d96708380..0391e2d94 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -34,6 +34,12 @@
 #define BPF_FROM_LE	BPF_TO_LE
 #define BPF_FROM_BE	BPF_TO_BE
 
+/* bitops on a register */
+#define BPF_BITOPS	0xe0	/* flags for bitops */
+#define BPF_FFS64	0x00	/* opcode for ffs64 */
+#define BPF_POPCNT	0x01	/* opcode for popcnt */
+#define BPF_LZCNT	0x02	/* opcode for lzcnt */
+
 /* jmp encodings */
 #define BPF_JNE		0x50	/* jump != */
 #define BPF_JLT		0xa0	/* LT is unsigned, '<' */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 71c459a51..d90163ede 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2936,6 +2936,12 @@ bool __weak bpf_jit_supports_ptr_xchg(void)
 	return false;
 }
 
+/* return TRUE if the JIT backend supports bitops with few instructions. */
+bool __weak bpf_jit_supports_bitops(void)
+{
+	return false;
+}
+
 /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
  * skb_copy_bits(), so provide a weak definition of it for NET-less config.
 */
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 93edf730d..f5123e92f 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -23,6 +23,7 @@
 #include <linux/btf_ids.h>
 #include <linux/bpf_mem_alloc.h>
 #include <linux/kasan.h>
+#include <linux/bitops.h>
 
 #include "../../lib/kstrtox.h"
 
@@ -2542,6 +2543,11 @@ __bpf_kfunc void bpf_throw(u64 cookie)
 	WARN(1, "A call to BPF exception callback should never return\n");
 }
 
+__bpf_kfunc unsigned long bpf_ffs64(u64 word)
+{
+	return __ffs64(word);
+}
+
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(generic_btf_ids)
@@ -2573,6 +2579,7 @@ BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
 #endif
 BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_throw)
+BTF_ID_FLAGS(func, bpf_ffs64)
 BTF_KFUNCS_END(generic_btf_ids)
 
 static const struct btf_kfunc_id_set generic_kfunc_set = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 011d54a1d..a5965e1b7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -10926,6 +10926,7 @@ enum special_kfunc_type {
 	KF_bpf_percpu_obj_drop_impl,
 	KF_bpf_throw,
 	KF_bpf_iter_css_task_new,
+	KF_bpf_ffs64,
 };
 
 BTF_SET_START(special_kfunc_set)
@@ -10952,6 +10953,7 @@ BTF_ID(func, bpf_throw)
 #ifdef CONFIG_CGROUPS
 BTF_ID(func, bpf_iter_css_task_new)
 #endif
+BTF_ID(func, bpf_ffs64)
 BTF_SET_END(special_kfunc_set)
 
 BTF_ID_LIST(special_kfunc_list)
@@ -10982,6 +10984,7 @@ BTF_ID(func, bpf_iter_css_task_new)
 #else
 BTF_ID_UNUSED
 #endif
+BTF_ID(func, bpf_ffs64)
 
 static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
 {
@@ -19349,6 +19352,10 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		   desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
 		insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
 		*cnt = 1;
+	} else if (desc->func_id == special_kfunc_list[KF_bpf_ffs64]) {
+		insn_buf[0].code = BPF_ALU64 | BPF_BITOPS;
+		insn_buf[0].imm = BPF_FFS64;
+		*cnt = 1;
 	}
 	return 0;
 }

Thanks,
Leon