On 2024/2/6 02:34, Yonghong Song wrote:
>
> On 2/5/24 10:18 AM, Andrii Nakryiko wrote:
>> On Sun, Feb 4, 2024 at 11:20 AM Yonghong Song
>> <yonghong.song@xxxxxxxxx> wrote:
>>>
>>> On 2/2/24 2:18 PM, Andrii Nakryiko wrote:
>>>> On Wed, Jan 31, 2024 at 7:56 AM Leon Hwang <hffilwlqm@xxxxxxxxx> wrote:
>>>>> This patchset introduces a new generic kfunc bpf_ffs64(). This kfunc
>>>>> allows bpf to reuse kernel's __ffs64() function to improve ffs
>>>>> performance in bpf.
>>>>>
>>>> The downside of using kfunc for this is that the compiler will assume
>>>> that R1-R5 have to be spilled/filled, because that's function call
>>>> convention in BPF.
>>>>
>>>> If this was an instruction, though, it would be much more efficient
>>>> and would avoid this problem. But I see how something like ffs64 is
>>>> useful. I think it would be good to also have popcnt instruction and a
>>>> few other fast bit manipulation operations as well.
>>>>
>>>> Perhaps we should think about another BPF ISA extension to add fast
>>>> bit manipulation instructions?
>>> Sounds a good idea to start the conversion. Besides popcnt, lzcnt
>>> is also a candidate. From llvm perspective, it would be hard to
>>> generate ffs64/popcnt/lzcnt etc. from source generic implementation.
>> I'm curious why? I assumed that if a user used __builtin_popcount()
>> Clang could just generate BPF's popcnt instruction (assuming the right
>> BPF cpu version is enabled, of course).
>
> Not aware of __builtin_popcount(). Yes, BPF backend should be able easily
> converts __builtin_popcount() to a BPF insn.
>
>>
>>> So most likely, inline asm will be used. libbpf could define
>>> some macros to make adoption easier. Verifier and JIT will do
>>> proper thing, either using corresponding arch insns directly or
>>> verifier will rewrite so JIT won't be aware of these insns.
> [...]

Sorry for the late reply. I was busy trying to fix a tailcall issue[0],
but, unfortunately, it seems hopeless to achieve that fix.

[0] bpf, x64: Fix tailcall hierarchy
https://lore.kernel.org/bpf/20240222085232.62483-1-hffilwlqm@xxxxxxxxx/

Adding fast bit manipulation instructions in another BPF ISA extension
sounds great to me. Assuming the right BPF cpu version is enabled, clang
should be able to generate ffs64/popcnt/lzcnt BPF insns for
__builtin_ffs64()/__builtin_popcount()/__builtin_clz().

In the meantime, I did a POC that JITs this kfunc bpf_ffs64(). JITing the
kfunc bpf_ffs64() works by making the JIT aware of the CPU features and
adding a 'BPF_ALU64|BPF_BITOPS' instruction.
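For completeness, the BPF-program side I have in mind looks roughly like
the sketch below. It is only illustrative: the section name, program type,
return codes and the zero guard are my assumptions, not part of the POC;
the only thing taken from the POC is the bpf_ffs64() kfunc itself.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* kfunc prototype is resolved via BTF; the declaration style here is
 * just illustrative
 */
extern unsigned long bpf_ffs64(__u64 word) __ksym;

SEC("tc")
int ffs64_demo(struct __sk_buff *skb)
{
        __u64 word = skb->len;
        unsigned long first_bit;

        /* __ffs64() is undefined for 0, so guard against it */
        if (!word)
                return 0; /* TC_ACT_OK */

        /* With the POC applied, the verifier rewrites this kfunc call
         * into a single BPF_ALU64 | BPF_BITOPS insn, so the x86 JIT
         * emits 'rep bsf' (tzcnt) directly instead of a real call.
         */
        first_bit = bpf_ffs64(word);

        return first_bit < 8 ? 0 /* TC_ACT_OK */ : 2 /* TC_ACT_SHOT */;
}

char LICENSE[] SEC("license") = "GPL";

The program itself only ever sees the kfunc; the new insn stays an
implementation detail of the verifier and the JIT.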
Here's the diff of the POC:

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index e1390d1e3..9cd552dd7 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -18,6 +18,7 @@
 #include <asm/text-patching.h>
 #include <asm/unwind.h>
 #include <asm/cfi.h>
+#include <asm/cpufeatures.h>
 
 static bool all_callee_regs_used[4] = {true, true, true, true};
 
@@ -1131,6 +1132,39 @@ static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
 	*pprog = prog;
 }
 
+static int emit_bitops(u8 **pprog, u32 bitops)
+{
+	u8 *prog = *pprog;
+
+	switch (bitops) {
+#ifdef X86_FEATURE_AVX2
+	case BPF_FFS64:
+		/* identical to tzcnt rax, rdi */
+		/* rep bsf rax, rdi */
+		EMIT1(0xF3);
+		EMIT4(0x48, 0x0F, 0xBC, 0xC7);
+		break;
+#endif
+#ifdef X86_FEATURE_XMM4_1
+	case BPF_POPCNT:
+		/* popcnt rax, rdi */
+		EMIT1(0xF3);
+		EMIT4(0x48, 0x0F, 0xB8, 0xC7);
+		break;
+	case BPF_LZCNT:
+		/* lzcnt rax, rdi */
+		EMIT1(0xF3);
+		EMIT4(0x48, 0x0F, 0xBD, 0xC7);
+		break;
+#endif
+	default:
+		return -EINVAL;
+	}
+
+	*pprog = prog;
+	return 0;
+}
+
 #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
 
 /* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */
@@ -1521,6 +1555,12 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 			}
 			break;
 
+		case BPF_ALU64 | BPF_BITOPS:
+			err = emit_bitops(&prog, insn->imm);
+			if (err)
+				return err;
+			break;
+
 			/* speculation barrier */
 		case BPF_ST | BPF_NOSPEC:
 			EMIT_LFENCE();
@@ -3145,6 +3185,11 @@ bool bpf_jit_supports_subprog_tailcalls(void)
 	return true;
 }
 
+bool bpf_jit_supports_bitops(void)
+{
+	return true;
+}
+
 void bpf_jit_free(struct bpf_prog *prog)
 {
 	if (prog->jited) {
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 36cc29a29..27ad34e20 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -959,6 +959,7 @@ bool bpf_jit_supports_kfunc_call(void);
 bool bpf_jit_supports_far_kfunc_call(void);
 bool bpf_jit_supports_exceptions(void);
 bool bpf_jit_supports_ptr_xchg(void);
+bool bpf_jit_supports_bitops(void);
 void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp),
 			 void *cookie);
 bool bpf_helper_changes_pkt_data(void *func);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d96708380..0391e2d94 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -34,6 +34,12 @@
 #define BPF_FROM_LE	BPF_TO_LE
 #define BPF_FROM_BE	BPF_TO_BE
 
+/* bitops on a register */
+#define BPF_BITOPS	0xe0	/* flags for bitops */
+#define BPF_FFS64	0x00	/* opcode for ffs64 */
+#define BPF_POPCNT	0x01	/* opcode for popcnt */
+#define BPF_LZCNT	0x02	/* opcode for lzcnt */
+
 /* jmp encodings */
 #define BPF_JNE		0x50	/* jump != */
 #define BPF_JLT		0xa0	/* LT is unsigned, '<' */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 71c459a51..d90163ede 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2936,6 +2936,12 @@ bool __weak bpf_jit_supports_ptr_xchg(void)
 	return false;
 }
 
+/* return TRUE if the JIT backend supports bitops with few instructions. */
+bool __weak bpf_jit_supports_bitops(void)
+{
+	return false;
+}
+
 /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
  * skb_copy_bits(), so provide a weak definition of it for NET-less config.
 */
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 93edf730d..f5123e92f 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -23,6 +23,7 @@
 #include <linux/btf_ids.h>
 #include <linux/bpf_mem_alloc.h>
 #include <linux/kasan.h>
+#include <linux/bitops.h>
 
 #include "../../lib/kstrtox.h"
 
@@ -2542,6 +2543,11 @@ __bpf_kfunc void bpf_throw(u64 cookie)
 	WARN(1, "A call to BPF exception callback should never return\n");
 }
 
+__bpf_kfunc unsigned long bpf_ffs64(u64 word)
+{
+	return __ffs64(word);
+}
+
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(generic_btf_ids)
@@ -2573,6 +2579,7 @@ BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
 #endif
 BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_throw)
+BTF_ID_FLAGS(func, bpf_ffs64)
 BTF_KFUNCS_END(generic_btf_ids)
 
 static const struct btf_kfunc_id_set generic_kfunc_set = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 011d54a1d..a5965e1b7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -10926,6 +10926,7 @@ enum special_kfunc_type {
 	KF_bpf_percpu_obj_drop_impl,
 	KF_bpf_throw,
 	KF_bpf_iter_css_task_new,
+	KF_bpf_ffs64,
 };
 
 BTF_SET_START(special_kfunc_set)
@@ -10952,6 +10953,7 @@ BTF_ID(func, bpf_throw)
 #ifdef CONFIG_CGROUPS
 BTF_ID(func, bpf_iter_css_task_new)
 #endif
+BTF_ID(func, bpf_ffs64)
 BTF_SET_END(special_kfunc_set)
 
 BTF_ID_LIST(special_kfunc_list)
@@ -10982,6 +10984,7 @@ BTF_ID(func, bpf_iter_css_task_new)
 #else
 BTF_ID_UNUSED
 #endif
+BTF_ID(func, bpf_ffs64)
 
 static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
 {
@@ -19349,6 +19352,10 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		   desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
 		insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
 		*cnt = 1;
+	} else if (desc->func_id == special_kfunc_list[KF_bpf_ffs64]) {
+		insn_buf[0].code = BPF_ALU64 | BPF_BITOPS;
+		insn_buf[0].imm = BPF_FFS64;
+		*cnt = 1;
 	}
 	return 0;
 }

Thanks,
Leon