On Mon, Mar 17, 2025 at 3:50 PM Vadim Fedorenko <vadfed@xxxxxxxx> wrote:
>
> New kfunc to return ARCH-specific timecounter. The main reason to
> implement this kfunc is to avoid extra overhead of benchmark
> measurements, which are usually done by a pair of bpf_ktime_get_ns()
> at the beginning and at the end of the code block under benchmark.
> When fully JITed this function doesn't implement conversion to the
> monotonic clock and saves some CPU cycles by receiving timecounter
> values in single-digit amount of instructions. The delta values can be
> translated into nanoseconds using kfunc introduced in the next patch.
> For x86 BPF JIT converts this kfunc into rdtsc ordered call. Other
> architectures will get JIT implementation too if supported. The fallback
> is to get CLOCK_MONOTONIC_RAW value in ns.
>
> JIT version of the function uses "LFENCE; RDTSC" variant because it
> doesn't care about cookie value returned by "RDTSCP" and it doesn't want
> to trash RCX value. LFENCE option provides the same ordering guarantee as
> RDTSCP variant.
>
> The simplest use-case is added in 4th patch, where we calculate the time
> spent by bpf_get_ns_current_pid_tgid() kfunc. More complex example is to
> use session cookie to store timecounter value at kprobe/uprobe using
> kprobe.session/uprobe.session, and calculate the difference at
> kretprobe/uretprobe.
>
> Acked-by: Eduard Zingerman <eddyz87@xxxxxxxxx>
> Acked-by: Andrii Nakryiko <andrii@xxxxxxxxxx>
> Acked-by: Yonghong Song <yonghong.song@xxxxxxxxx>
> Signed-off-by: Vadim Fedorenko <vadfed@xxxxxxxx>
> ---
>  arch/x86/net/bpf_jit_comp.c   | 47 +++++++++++++++++++++++++++++++++++
>  arch/x86/net/bpf_jit_comp32.c | 33 ++++++++++++++++++++++++
>  include/linux/bpf.h           |  3 +++
>  include/linux/filter.h        |  1 +
>  kernel/bpf/core.c             | 11 ++++++++
>  kernel/bpf/helpers.c          |  6 +++++
>  kernel/bpf/verifier.c         | 41 +++++++++++++++++++++++++-----
>  7 files changed, 136 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index d3491cc0898b..92cd5945d630 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -15,6 +15,7 @@
>  #include <asm/ftrace.h>
>  #include <asm/set_memory.h>
>  #include <asm/nospec-branch.h>
> +#include <asm/timer.h>
>  #include <asm/text-patching.h>
>  #include <asm/unwind.h>
>  #include <asm/cfi.h>
> @@ -2254,6 +2255,40 @@ st:			if (is_imm8(insn->off))
>  		case BPF_JMP | BPF_CALL: {
>  			u8 *ip = image + addrs[i - 1];
>
> +			if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
> +			    IS_ENABLED(CONFIG_BPF_SYSCALL) &&

why?
It's true that JIT can be compiled in even when there is no sys_bpf,
but why gate this?

> +			    imm32 == BPF_CALL_IMM(bpf_get_cpu_time_counter) &&
> +			    cpu_feature_enabled(X86_FEATURE_TSC) &&
> +			    using_native_sched_clock() && sched_clock_stable()) {
> +				/* The default implementation of this kfunc uses
> +				 * ktime_get_raw_ns() which effectively is implemented as
> +				 * `(u64)rdtsc_ordered() & S64_MAX`. For JIT We skip
> +				 * masking part because we assume it's not needed in BPF
> +				 * use case (two measurements close in time).
> +				 * Original code for rdtsc_ordered() uses sequence:
> +				 * 'rdtsc; nop; nop; nop' to patch it into
> +				 * 'lfence; rdtsc' or 'rdtscp' depending on CPU features.
> +				 * JIT uses 'lfence; rdtsc' variant because BPF program
> +				 * doesn't care about cookie provided by rdtscp in RCX.
> +				 * Save RDX because RDTSC will use EDX:EAX to return u64
> +				 */
> +				emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
> +				if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
> +					EMIT_LFENCE();
> +				EMIT2(0x0F, 0x31);
> +
> +				/* shl RDX, 32 */
> +				maybe_emit_1mod(&prog, BPF_REG_3, true);
> +				EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
> +				/* or RAX, RDX */
> +				maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
> +				EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
> +				/* restore RDX from R11 */
> +				emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
> +
> +				break;
> +			}
> +
>  			func = (u8 *) __bpf_call_base + imm32;
>  			if (src_reg == BPF_PSEUDO_CALL && tail_call_reachable) {
>  				LOAD_TAIL_CALL_CNT_PTR(stack_depth);
> @@ -3865,3 +3900,15 @@ bool bpf_jit_supports_timed_may_goto(void)
>  {
>  	return true;
>  }
> +
> +/* x86-64 JIT can inline kfunc */
> +bool bpf_jit_inlines_kfunc_call(s32 imm)
> +{
> +	if (!IS_ENABLED(CONFIG_BPF_SYSCALL))
> +		return false;

This is certainly unnecessary.
Only the verifier calls this bpf_jit_inlines_kfunc_call() helper.

> +	if (imm == BPF_CALL_IMM(bpf_get_cpu_time_counter) &&
> +	    cpu_feature_enabled(X86_FEATURE_TSC) &&
> +	    using_native_sched_clock() && sched_clock_stable())

The duplication of the check is ugly.
Call this helper from an earlier bit ?

if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
    bpf_jit_inlines_kfunc_call(imm32))

?

> +		return true;
> +	return false;
> +}
> diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
> index de0f9e5f9f73..7f13509c66db 100644
> --- a/arch/x86/net/bpf_jit_comp32.c
> +++ b/arch/x86/net/bpf_jit_comp32.c
> @@ -16,6 +16,7 @@
>  #include <asm/set_memory.h>
>  #include <asm/nospec-branch.h>
>  #include <asm/asm-prototypes.h>
> +#include <asm/timer.h>
>  #include <linux/bpf.h>
>
>  /*
> @@ -2094,6 +2095,27 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
>  		if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
>  			int err;
>
> +			if (IS_ENABLED(CONFIG_BPF_SYSCALL) &&

same.

> +			    imm32 == BPF_CALL_IMM(bpf_get_cpu_time_counter) &&
> +			    cpu_feature_enabled(X86_FEATURE_TSC) &&
> +			    using_native_sched_clock() && sched_clock_stable()) {
> +				/* The default implementation of this kfunc uses
> +				 * ktime_get_raw_ns() which effectively is implemented as
> +				 * `(u64)rdtsc_ordered() & S64_MAX`. For JIT We skip
> +				 * masking part because we assume it's not needed in BPF
> +				 * use case (two measurements close in time).
> +				 * Original code for rdtsc_ordered() uses sequence:
> +				 * 'rdtsc; nop; nop; nop' to patch it into
> +				 * 'lfence; rdtsc' or 'rdtscp' depending on CPU features.
> +				 * JIT uses 'lfence; rdtsc' variant because BPF program
> +				 * doesn't care about cookie provided by rdtscp in ECX.
> +				 */
> +				if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
> +					EMIT3(0x0F, 0xAE, 0xE8);
> +				EMIT2(0x0F, 0x31);
> +				break;
> +			}
> +
>  			err = emit_kfunc_call(bpf_prog,
>  					      image + addrs[i],
>  					      insn, &prog);
> @@ -2621,3 +2643,14 @@ bool bpf_jit_supports_kfunc_call(void)
>  {
>  	return true;
>  }
> +
> +bool bpf_jit_inlines_kfunc_call(s32 imm)
> +{
> +	if (!IS_ENABLED(CONFIG_BPF_SYSCALL))
> +		return false;
> +	if (imm == BPF_CALL_IMM(bpf_get_cpu_time_counter) &&
> +	    cpu_feature_enabled(X86_FEATURE_TSC) &&
> +	    using_native_sched_clock() && sched_clock_stable())
> +		return true;

same issue.
> +	return false;
> +}
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 0d7b70124d81..a5e9b592d3e8 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -3387,6 +3387,9 @@ void bpf_user_rnd_init_once(void);
>  u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
>  u64 bpf_get_raw_cpu_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
>
> +/* Inlined kfuncs */
> +u64 bpf_get_cpu_time_counter(void);
> +
>  #if defined(CONFIG_NET)
>  bool bpf_sock_common_is_valid_access(int off, int size,
>  				     enum bpf_access_type type,
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 590476743f7a..2fbfa1bc3f49 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -1128,6 +1128,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
>  void bpf_jit_compile(struct bpf_prog *prog);
>  bool bpf_jit_needs_zext(void);
>  bool bpf_jit_inlines_helper_call(s32 imm);
> +bool bpf_jit_inlines_kfunc_call(s32 imm);
>  bool bpf_jit_supports_subprog_tailcalls(void);
>  bool bpf_jit_supports_percpu_insn(void);
>  bool bpf_jit_supports_kfunc_call(void);
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 62cb9557ad3b..1d811fc39eac 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -3035,6 +3035,17 @@ bool __weak bpf_jit_inlines_helper_call(s32 imm)
>  	return false;
>  }
>
> +/* Return true if the JIT inlines the call to the kfunc corresponding to
> + * the imm.
> + *
> + * The verifier will not patch the insn->imm for the call to the helper if
> + * this returns true.
> + */
> +bool __weak bpf_jit_inlines_kfunc_call(s32 imm)
> +{
> +	return false;
> +}
> +
>  /* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */
>  bool __weak bpf_jit_supports_subprog_tailcalls(void)
>  {
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index 5449756ba102..43bf35a15f78 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -3193,6 +3193,11 @@ __bpf_kfunc void bpf_local_irq_restore(unsigned long *flags__irq_flag)
>  	local_irq_restore(*flags__irq_flag);
>  }
>
> +__bpf_kfunc u64 bpf_get_cpu_time_counter(void)
> +{
> +	return ktime_get_raw_fast_ns();

Why 'raw' ? Is it faster than 'mono' ?
This needs a comment at least.

> +}
> +
>  __bpf_kfunc_end_defs();
>
>  BTF_KFUNCS_START(generic_btf_ids)
> @@ -3293,6 +3298,7 @@ BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLE
>  BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
>  BTF_ID_FLAGS(func, bpf_local_irq_save)
>  BTF_ID_FLAGS(func, bpf_local_irq_restore)
> +BTF_ID_FLAGS(func, bpf_get_cpu_time_counter, KF_FASTCALL)
>  BTF_KFUNCS_END(common_btf_ids)
>
>  static const struct btf_kfunc_id_set common_kfunc_set = {
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 3303a3605ee8..0c4ea977973c 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -17035,6 +17035,24 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm)
>  	}
>  }
>
> +/* True if fixup_kfunc_call() replaces calls to kfunc number 'imm',
> + * replacement patch is presumed to follow bpf_fastcall contract
> + * (see mark_fastcall_pattern_for_call() below).
> + */
> +static bool verifier_inlines_kfunc_call(struct bpf_verifier_env *env, s32 imm)
> +{
> +	const struct bpf_kfunc_desc *desc = find_kfunc_desc(env->prog, imm, 0);
> +
> +	if (!env->prog->jit_requested)
> +		return false;

This is a regression.
Why disable verifier inlining when jit is off?
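
I.e. keep it a pure "is this one of the kfuncs the verifier itself rewrites"
check. Untested sketch only, just to illustrate dropping the gate:

static bool verifier_inlines_kfunc_call(struct bpf_verifier_env *env, s32 imm)
{
	const struct bpf_kfunc_desc *desc = find_kfunc_desc(env->prog, imm, 0);

	/* sketch: NULL desc is not expected at these call sites, but be safe */
	if (!desc)
		return false;

	/* sketch: no jit_requested check; the r0 = r1 rewrite is done by
	 * the verifier in fixup_kfunc_call() regardless of the JIT
	 */
	return desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
	       desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast];
}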
> +
> +	if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
> +	    desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast])
> +		return true;
> +
> +	return false;
> +}
> +
>  struct call_summary {
>  	u8 num_params;
>  	bool is_void;
> @@ -17077,7 +17095,10 @@ static bool get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call
>  		/* error would be reported later */
>  		return false;
>  	cs->num_params = btf_type_vlen(meta.func_proto);
> -	cs->fastcall = meta.kfunc_flags & KF_FASTCALL;
> +	cs->fastcall = meta.kfunc_flags & KF_FASTCALL &&
> +		       (verifier_inlines_kfunc_call(env, call->imm) ||
> +		       (meta.btf == btf_vmlinux &&
> +			bpf_jit_inlines_kfunc_call(call->imm)));
>  	cs->is_void = btf_type_is_void(btf_type_by_id(meta.btf, meta.func_proto->type));
>  	return true;
>  }
> @@ -21223,6 +21244,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>  			    struct bpf_insn *insn_buf, int insn_idx, int *cnt)
>  {
>  	const struct bpf_kfunc_desc *desc;
> +	s32 imm = insn->imm;
>
>  	if (!insn->imm) {
>  		verbose(env, "invalid kernel function call not eliminated in verifier pass\n");
> @@ -21246,7 +21268,18 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>  	insn->imm = BPF_CALL_IMM(desc->addr);
>  	if (insn->off)
>  		return 0;
> -	if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
> +	if (verifier_inlines_kfunc_call(env, imm)) {
> +		if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
> +		    desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {

I really don't like this copy paste.
Next trivial function that inlines as r0=r1 would need to add itself
in two places for no good reason.

pw-bot: cr
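
PS: re the copy paste above, a rough and untested sketch of keeping the
list in one place, relying on verifier_inlines_kfunc_call() alone and
patching whatever it accepts as the trivial move:

	if (verifier_inlines_kfunc_call(env, imm)) {
		/* every kfunc inlined by the verifier today is plain r0 = r1 */
		insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
		*cnt = 1;
		return 0;
	}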