On Mon, Mar 17, 2025 at 3:50 PM Vadim Fedorenko <vadfed@xxxxxxxx> wrote:
>
> New kfunc to return ARCH-specific timecounter. The main reason to
> implement this kfunc is to avoid extra overhead of benchmark
> measurements, which are usually done by a pair of bpf_ktime_get_ns()
> at the beginning and at the end of the code block under benchmark.
> When fully JITed this function doesn't implement conversion to the
> monotonic clock and saves some CPU cycles by receiving timecounter
> values in single-digit amount of instructions. The delta values can be
> translated into nanoseconds using kfunc introduced in the next patch.
> For x86 BPF JIT converts this kfunc into rdtsc ordered call. Other
> architectures will get JIT implementation too if supported. The fallback
> is to get CLOCK_MONOTONIC_RAW value in ns.
>
> JIT version of the function uses "LFENCE; RDTSC" variant because it
> doesn't care about cookie value returned by "RDTSCP" and it doesn't want
> to trash RCX value. LFENCE option provides the same ordering guarantee as
> RDTSCP variant.
>
> The simplest use-case is added in 4th patch, where we calculate the time
> spent by bpf_get_ns_current_pid_tgid() kfunc. More complex example is to
> use session cookie to store timecounter value at kprobe/uprobe using
> kprobe.session/uprobe.session, and calculate the difference at
> kretprobe/uretprobe.
>
> Acked-by: Eduard Zingerman <eddyz87@xxxxxxxxx>
> Acked-by: Andrii Nakryiko <andrii@xxxxxxxxxx>
> Acked-by: Yonghong Song <yonghong.song@xxxxxxxxx>
> Signed-off-by: Vadim Fedorenko <vadfed@xxxxxxxx>
> ---
>  arch/x86/net/bpf_jit_comp.c   | 47 +++++++++++++++++++++++++++++++++++
>  arch/x86/net/bpf_jit_comp32.c | 33 ++++++++++++++++++++++++
>  include/linux/bpf.h           |  3 +++
>  include/linux/filter.h        |  1 +
>  kernel/bpf/core.c             | 11 ++++++++
>  kernel/bpf/helpers.c          |  6 +++++
>  kernel/bpf/verifier.c         | 41 +++++++++++++++++++++++++-----
>  7 files changed, 136 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index d3491cc0898b..92cd5945d630 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -15,6 +15,7 @@
>  #include <asm/ftrace.h>
>  #include <asm/set_memory.h>
>  #include <asm/nospec-branch.h>
> +#include <asm/timer.h>
>  #include <asm/text-patching.h>
>  #include <asm/unwind.h>
>  #include <asm/cfi.h>
> @@ -2254,6 +2255,40 @@ st:			if (is_imm8(insn->off))
>  		case BPF_JMP | BPF_CALL: {
>  			u8 *ip = image + addrs[i - 1];
>
> +			if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
> +			    IS_ENABLED(CONFIG_BPF_SYSCALL) &&

why?
It's true that JIT can be compiled in even when there is no sys_bpf,
but why gate this?

> +			    imm32 == BPF_CALL_IMM(bpf_get_cpu_time_counter) &&
> +			    cpu_feature_enabled(X86_FEATURE_TSC) &&
> +			    using_native_sched_clock() && sched_clock_stable()) {
> +				/* The default implementation of this kfunc uses
> +				 * ktime_get_raw_ns() which effectively is implemented as
> +				 * `(u64)rdtsc_ordered() & S64_MAX`. For JIT We skip
> +				 * masking part because we assume it's not needed in BPF
> +				 * use case (two measurements close in time).
> +				 * Original code for rdtsc_ordered() uses sequence:
> +				 * 'rdtsc; nop; nop; nop' to patch it into
> +				 * 'lfence; rdtsc' or 'rdtscp' depending on CPU features.
> +				 * JIT uses 'lfence; rdtsc' variant because BPF program
> +				 * doesn't care about cookie provided by rdtscp in RCX.
> +				 * Save RDX because RDTSC will use EDX:EAX to return u64
> +				 */
> +				emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
> +				if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
> +					EMIT_LFENCE();
> +				EMIT2(0x0F, 0x31);
> +
> +				/* shl RDX, 32 */
> +				maybe_emit_1mod(&prog, BPF_REG_3, true);
> +				EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
> +				/* or RAX, RDX */
> +				maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
> +				EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
> +				/* restore RDX from R11 */
> +				emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
> +
> +				break;
> +			}
> +
>  			func = (u8 *) __bpf_call_base + imm32;
>  			if (src_reg == BPF_PSEUDO_CALL && tail_call_reachable) {
>  				LOAD_TAIL_CALL_CNT_PTR(stack_depth);
> @@ -3865,3 +3900,15 @@ bool bpf_jit_supports_timed_may_goto(void)
>  {
>  	return true;
>  }
> +
> +/* x86-64 JIT can inline kfunc */
> +bool bpf_jit_inlines_kfunc_call(s32 imm)
> +{
> +	if (!IS_ENABLED(CONFIG_BPF_SYSCALL))
> +		return false;

This is certainly unnecessary.
Only the verifier calls this bpf_jit_inlines_kfunc_call() helper.

> +	if (imm == BPF_CALL_IMM(bpf_get_cpu_time_counter) &&
> +	    cpu_feature_enabled(X86_FEATURE_TSC) &&
> +	    using_native_sched_clock() && sched_clock_stable())

The duplication of the check is ugly.
Call this helper from an earlier bit ?

if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
    bpf_jit_inlines_kfunc_call(imm32))

?

> +		return true;
> +	return false;
> +}
> diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
> index de0f9e5f9f73..7f13509c66db 100644
> --- a/arch/x86/net/bpf_jit_comp32.c
> +++ b/arch/x86/net/bpf_jit_comp32.c
> @@ -16,6 +16,7 @@
>  #include <asm/set_memory.h>
>  #include <asm/nospec-branch.h>
>  #include <asm/asm-prototypes.h>
> +#include <asm/timer.h>
>  #include <linux/bpf.h>
>
>  /*
> @@ -2094,6 +2095,27 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
>  		if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
>  			int err;
>
> +			if (IS_ENABLED(CONFIG_BPF_SYSCALL) &&

same.

> +			    imm32 == BPF_CALL_IMM(bpf_get_cpu_time_counter) &&
> +			    cpu_feature_enabled(X86_FEATURE_TSC) &&
> +			    using_native_sched_clock() && sched_clock_stable()) {
> +				/* The default implementation of this kfunc uses
> +				 * ktime_get_raw_ns() which effectively is implemented as
> +				 * `(u64)rdtsc_ordered() & S64_MAX`. For JIT We skip
> +				 * masking part because we assume it's not needed in BPF
> +				 * use case (two measurements close in time).
> +				 * Original code for rdtsc_ordered() uses sequence:
> +				 * 'rdtsc; nop; nop; nop' to patch it into
> +				 * 'lfence; rdtsc' or 'rdtscp' depending on CPU features.
> +				 * JIT uses 'lfence; rdtsc' variant because BPF program
> +				 * doesn't care about cookie provided by rdtscp in ECX.
> +				 */
> +				if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
> +					EMIT3(0x0F, 0xAE, 0xE8);
> +				EMIT2(0x0F, 0x31);
> +				break;
> +			}
> +
>  			err = emit_kfunc_call(bpf_prog,
>  					      image + addrs[i],
>  					      insn, &prog);
> @@ -2621,3 +2643,14 @@ bool bpf_jit_supports_kfunc_call(void)
>  {
>  	return true;
>  }
> +
> +bool bpf_jit_inlines_kfunc_call(s32 imm)
> +{
> +	if (!IS_ENABLED(CONFIG_BPF_SYSCALL))
> +		return false;
> +	if (imm == BPF_CALL_IMM(bpf_get_cpu_time_counter) &&
> +	    cpu_feature_enabled(X86_FEATURE_TSC) &&
> +	    using_native_sched_clock() && sched_clock_stable())
> +		return true;

same issue.
> +	return false;
> +}
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 0d7b70124d81..a5e9b592d3e8 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -3387,6 +3387,9 @@ void bpf_user_rnd_init_once(void);
>  u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
>  u64 bpf_get_raw_cpu_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
>
> +/* Inlined kfuncs */
> +u64 bpf_get_cpu_time_counter(void);
> +
>  #if defined(CONFIG_NET)
>  bool bpf_sock_common_is_valid_access(int off, int size,
>  				     enum bpf_access_type type,
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 590476743f7a..2fbfa1bc3f49 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -1128,6 +1128,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
>  void bpf_jit_compile(struct bpf_prog *prog);
>  bool bpf_jit_needs_zext(void);
>  bool bpf_jit_inlines_helper_call(s32 imm);
> +bool bpf_jit_inlines_kfunc_call(s32 imm);
>  bool bpf_jit_supports_subprog_tailcalls(void);
>  bool bpf_jit_supports_percpu_insn(void);
>  bool bpf_jit_supports_kfunc_call(void);
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 62cb9557ad3b..1d811fc39eac 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -3035,6 +3035,17 @@ bool __weak bpf_jit_inlines_helper_call(s32 imm)
>  	return false;
>  }
>
> +/* Return true if the JIT inlines the call to the kfunc corresponding to
> + * the imm.
> + *
> + * The verifier will not patch the insn->imm for the call to the helper if
> + * this returns true.
> + */
> +bool __weak bpf_jit_inlines_kfunc_call(s32 imm)
> +{
> +	return false;
> +}
> +
>  /* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */
>  bool __weak bpf_jit_supports_subprog_tailcalls(void)
>  {
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index 5449756ba102..43bf35a15f78 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -3193,6 +3193,11 @@ __bpf_kfunc void bpf_local_irq_restore(unsigned long *flags__irq_flag)
>  	local_irq_restore(*flags__irq_flag);
>  }
>
> +__bpf_kfunc u64 bpf_get_cpu_time_counter(void)
> +{
> +	return ktime_get_raw_fast_ns();

Why 'raw' ? Is it faster than 'mono' ?
This needs a comment at least.

> +}
> +
>  __bpf_kfunc_end_defs();
>
>  BTF_KFUNCS_START(generic_btf_ids)
> @@ -3293,6 +3298,7 @@ BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLE
>  BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
>  BTF_ID_FLAGS(func, bpf_local_irq_save)
>  BTF_ID_FLAGS(func, bpf_local_irq_restore)
> +BTF_ID_FLAGS(func, bpf_get_cpu_time_counter, KF_FASTCALL)
>  BTF_KFUNCS_END(common_btf_ids)
>
>  static const struct btf_kfunc_id_set common_kfunc_set = {
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 3303a3605ee8..0c4ea977973c 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -17035,6 +17035,24 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm)
>  	}
>  }
>
> +/* True if fixup_kfunc_call() replaces calls to kfunc number 'imm',
> + * replacement patch is presumed to follow bpf_fastcall contract
> + * (see mark_fastcall_pattern_for_call() below).
> + */
> +static bool verifier_inlines_kfunc_call(struct bpf_verifier_env *env, s32 imm)
> +{
> +	const struct bpf_kfunc_desc *desc = find_kfunc_desc(env->prog, imm, 0);
> +
> +	if (!env->prog->jit_requested)
> +		return false;

This is a regression.
Why disable verifier inlining when jit is off?
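
I.e. keep it a pure "is this one of the kfuncs the verifier itself rewrites"
check. Untested sketch only, just to illustrate dropping the gate:

static bool verifier_inlines_kfunc_call(struct bpf_verifier_env *env, s32 imm)
{
	const struct bpf_kfunc_desc *desc = find_kfunc_desc(env->prog, imm, 0);

	/* sketch: NULL desc is not expected at these call sites, but be safe */
	if (!desc)
		return false;

	/* sketch: no jit_requested check; the r0 = r1 rewrite is done by
	 * the verifier in fixup_kfunc_call() regardless of the JIT
	 */
	return desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
	       desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast];
}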
> +
> +	if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
> +	    desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast])
> +		return true;
> +
> +	return false;
> +}
> +
>  struct call_summary {
>  	u8 num_params;
>  	bool is_void;
> @@ -17077,7 +17095,10 @@ static bool get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call
>  		/* error would be reported later */
>  		return false;
>  	cs->num_params = btf_type_vlen(meta.func_proto);
> -	cs->fastcall = meta.kfunc_flags & KF_FASTCALL;
> +	cs->fastcall = meta.kfunc_flags & KF_FASTCALL &&
> +		       (verifier_inlines_kfunc_call(env, call->imm) ||
> +		       (meta.btf == btf_vmlinux &&
> +			bpf_jit_inlines_kfunc_call(call->imm)));
>  	cs->is_void = btf_type_is_void(btf_type_by_id(meta.btf, meta.func_proto->type));
>  	return true;
>  }
> @@ -21223,6 +21244,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>  			    struct bpf_insn *insn_buf, int insn_idx, int *cnt)
>  {
>  	const struct bpf_kfunc_desc *desc;
> +	s32 imm = insn->imm;
>
>  	if (!insn->imm) {
>  		verbose(env, "invalid kernel function call not eliminated in verifier pass\n");
> @@ -21246,7 +21268,18 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>  	insn->imm = BPF_CALL_IMM(desc->addr);
>  	if (insn->off)
>  		return 0;
> -	if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
> +	if (verifier_inlines_kfunc_call(env, imm)) {
> +		if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
> +		    desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {

I really don't like this copy paste.
Next trivial function that inlines as r0=r1 would need to add itself
in two places for no good reason.

pw-bot: cr
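
PS: re the copy paste above, a rough and untested sketch of keeping the
list in one place, relying on verifier_inlines_kfunc_call() alone and
patching whatever it accepts as the trivial move:

	if (verifier_inlines_kfunc_call(env, imm)) {
		/* every kfunc inlined by the verifier today is plain r0 = r1 */
		insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
		*cnt = 1;
		return 0;
	}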