On Sun, Mar 2, 2025 at 12:13 PM Kumar Kartikeya Dwivedi <memxor@xxxxxxxxx> wrote:
>
> Implement support in the verifier for replacing the may_goto implementation
> from a counter-based approach to one which samples time on the local CPU,
> to allow a much bigger loop bound.
>
> We implement it by maintaining 16 bytes per stack frame: 8 bytes for the
> count used to amortize time sampling, and 8 bytes for the starting
> timestamp. To minimize overhead, we need to avoid spilling and filling of
> registers around this sequence, so we push this cost into the time
> sampling function 'arch_bpf_timed_may_goto'. This is a JIT-specific
> wrapper around bpf_check_timed_may_goto that returns the count to store
> into the stack through BPF_REG_AX. All caller-saved registers (r0-r5) are
> guaranteed to remain untouched.
>
> The loop can be broken by returning a count of 0; otherwise we dispatch
> into the function when the count becomes 1, and the runtime chooses either
> to refresh it (by returning BPF_MAX_TIMED_LOOPS) or to return 0 and abort
> the loop.
>
> Since the check for 0 is done right after loading the count from the
> stack, all subsequent cond_break sequences should immediately break as
> well.
>
> We pass the stack offset of the count (and thus of the timestamp, by
> adding 8 to it) to the arch_bpf_timed_may_goto call so that it can be
> passed to bpf_check_timed_may_goto as an argument after r1 is saved, by
> adding the offset to r10/fp. This adjustment is arch specific, and the
> next patch will introduce support for x86.
>
> Note that depending on loop complexity, the time spent in the loop can
> exceed the current limit (250 ms), but imposing an upper bound on program
> runtime is an orthogonal problem which will be addressed when program
> cancellations are supported.
>
> The current time afforded by cond_break may not be enough for cases where
> BPF programs want to implement locking algorithms inline, and use
> cond_break as a promise to the verifier that they will eventually
> terminate.
>
> Below are some benchmark numbers for the time taken per iteration in an
> empty loop that counts the number of iterations until cond_break fires.
> For comparison, we measure bpf_for/bpf_repeat, which is another way to
> achieve the same number of spins (BPF_MAX_LOOPS). The hardware used for
> benchmarking was a Sapphire Rapids Intel server with the performance
> governor enabled.
>
> +------------------------------+------------+-----------+----------------+
> | Loop type                    | Iterations | Time (ms) | Time/iter (ns) |
> +------------------------------+------------+-----------+----------------+
> | may_goto                     |    8388608 |         3 |           0.36 |
> | timed_may_goto (count=65535) |  589674932 |       250 |           0.42 |
> | bpf_for                      |    8388608 |        10 |           1.19 |
> +------------------------------+------------+-----------+----------------+
>
> This gives a good approximation at low overhead while staying close to
> the current implementation.
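For readers following along, the benchmark loop described above boils down
to roughly the pattern below. This is only an illustrative sketch, not code
from this series: it assumes the cond_break macro from the selftests'
bpf_experimental.h, and the program, section, and variable names are made
up here.

/* SPDX-License-Identifier: GPL-2.0 */
/* Sketch: spin in an empty loop, counting iterations, until the
 * cond_break budget (may_goto, or timed may_goto with this series)
 * is exhausted and the loop is broken.
 */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_experimental.h"

char _license[] SEC("license") = "GPL";

__u64 iters; /* read back from userspace after the run */

SEC("tc")
int spin_until_break(struct __sk_buff *skb)
{
        while (1) {
                /* Expands to a may_goto that exits the loop once the
                 * verifier-injected budget runs out.
                 */
                cond_break;
                iters++;
        }
        return 0;
}

Timing the run and reading iters back then gives the Iterations and
Time/iter columns in the table above.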
>
> Signed-off-by: Kumar Kartikeya Dwivedi <memxor@xxxxxxxxx>
> ---
>  include/linux/bpf.h    |  1 +
>  include/linux/filter.h |  8 +++++++
>  kernel/bpf/core.c      | 31 +++++++++++++++++++++++++
>  kernel/bpf/verifier.c  | 52 +++++++++++++++++++++++++++++++++++-------
>  4 files changed, 84 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index aec102868b93..788f6ca374e9 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1986,6 +1986,7 @@ struct bpf_array {
>   */
>  enum {
>          BPF_MAX_LOOPS = 8 * 1024 * 1024,
> +        BPF_MAX_TIMED_LOOPS = 0xffff,
>  };
>
>  #define BPF_F_ACCESS_MASK (BPF_F_RDONLY | \
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 3ed6eb9e7c73..02dda5c53d91 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -669,6 +669,11 @@ struct bpf_prog_stats {
>          struct u64_stats_sync syncp;
>  } __aligned(2 * sizeof(u64));
>
> +struct bpf_timed_may_goto {
> +        u64 count;
> +        u64 timestamp;
> +};
> +
>  struct sk_filter {
>          refcount_t refcnt;
>          struct rcu_head rcu;
> @@ -1130,8 +1135,11 @@ bool bpf_jit_supports_ptr_xchg(void);
>  bool bpf_jit_supports_arena(void);
>  bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena);
>  bool bpf_jit_supports_private_stack(void);
> +bool bpf_jit_supports_timed_may_goto(void);
>  u64 bpf_arch_uaddress_limit(void);
>  void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
> +u64 arch_bpf_timed_may_goto(void);
> +u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *);
>  bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id);
>
>  static inline bool bpf_dump_raw_ok(const struct cred *cred)
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index a0200fbbace9..b3f7c7bd08d3 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -3069,6 +3069,37 @@ void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp,
>  {
>  }
>
> +bool __weak bpf_jit_supports_timed_may_goto(void)
> +{
> +        return false;
> +}
> +
> +u64 __weak arch_bpf_timed_may_goto(void)
> +{
> +        return 0;
> +}
> +
> +u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p)
> +{
> +        u64 time = ktime_get_mono_fast_ns();

let's move the call after !p->count check to avoid unused work.

> +
> +        /* If the count is zero, we've already broken a prior loop in this stack
> +         * frame, let's just exit quickly.
> +         */

Let's use normal kernel comment style in all new code.
I think even netdev folks allow both styles now.

> +        if (!p->count)
> +                return 0;
> +        /* Populate the timestamp for this stack frame. */
> +        if (!p->timestamp) {
> +                p->timestamp = time;
> +                return BPF_MAX_TIMED_LOOPS;
> +        }
> +        /* Check if we've exhausted our time slice. */
> +        if (time - p->timestamp >= (NSEC_PER_SEC / 4))
> +                return 0;
> +        /* Refresh the count for the stack frame. */
> +        return BPF_MAX_TIMED_LOOPS;
> +}
> +
>  /* for configs without MMU or 32-bit */
>  __weak const struct bpf_map_ops arena_map_ops;
>  __weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
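With both of the above applied (time sampled only after the early return,
comments in the usual kernel style), the helper could end up looking
roughly like the sketch below. This is just the suggested shape, not a
replacement hunk:

u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p)
{
        u64 time;

        /*
         * If the count is zero, a prior loop in this stack frame has
         * already been broken, so exit quickly without sampling time.
         */
        if (!p->count)
                return 0;

        time = ktime_get_mono_fast_ns();

        /* Populate the timestamp on the first call for this stack frame. */
        if (!p->timestamp) {
                p->timestamp = time;
                return BPF_MAX_TIMED_LOOPS;
        }
        /* Break the loop once the 250 ms time slice is exhausted. */
        if (time - p->timestamp >= (NSEC_PER_SEC / 4))
                return 0;
        /* Otherwise refresh the count for this stack frame. */
        return BPF_MAX_TIMED_LOOPS;
}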
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index dcd0da4e62fc..79bfb1932f40 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -21503,7 +21503,34 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
>                          goto next_insn;
>                  }
>
> -                if (is_may_goto_insn(insn)) {
> +                if (is_may_goto_insn(insn) && bpf_jit_supports_timed_may_goto()) {
> +                        int stack_off_cnt = -stack_depth - 16;
> +
> +                        /* Two 8 byte slots, depth-16 stores the count, and
> +                         * depth-8 stores the start timestamp of the loop.
> +                         */
> +                        stack_depth_extra = 16;
> +                        insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off_cnt);
> +                        if (insn->off >= 0)
> +                                insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 5);
> +                        else
> +                                insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1);
> +                        insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
> +                        insn_buf[3] = BPF_JMP_IMM(BPF_JNE, BPF_REG_AX, 1, 2);

Maybe != 0 instead? Otherwise it's off by 1.

> +                        insn_buf[4] = BPF_MOV64_IMM(BPF_REG_AX, stack_off_cnt);

Please add a comment that BPF_REG_AX is used as an argument register and
contains the return value too.
I looked at a couple of other non-x86 JITs and I think this calling
convention should work for them too.

> +                        insn_buf[5] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_CALL_IMM(arch_bpf_timed_may_goto));

Use BPF_EMIT_CALL() instead?

> +                        insn_buf[6] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off_cnt);
> +                        cnt = 7;
> +
> +                        new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
> +                        if (!new_prog)
> +                                return -ENOMEM;
> +
> +                        delta += cnt - 1;
> +                        env->prog = prog = new_prog;
> +                        insn = new_prog->insnsi + i + delta;
> +                        goto next_insn;
> +                } else if (is_may_goto_insn(insn)) {
>                          int stack_off = -stack_depth - 8;
>
>                          stack_depth_extra = 8;
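Putting the three comments above together, the patched sequence might end
up looking something like the sketch below (illustrative only; whether to
keep the != 0 check and BPF_EMIT_CALL() is for the next revision to
decide):

                        stack_depth_extra = 16;
                        /* AX = count; if it is already 0, take the exit edge. */
                        insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off_cnt);
                        if (insn->off >= 0)
                                insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 5);
                        else
                                insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1);
                        /* Decrement, and only call into the runtime when it hits 0. */
                        insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
                        insn_buf[3] = BPF_JMP_IMM(BPF_JNE, BPF_REG_AX, 0, 2);
                        /*
                         * BPF_REG_AX doubles as the argument register (it
                         * carries stack_off_cnt, to be added to r10/fp by
                         * the arch wrapper) and as the return value holding
                         * the refreshed count.
                         */
                        insn_buf[4] = BPF_MOV64_IMM(BPF_REG_AX, stack_off_cnt);
                        insn_buf[5] = BPF_EMIT_CALL(arch_bpf_timed_may_goto);
                        /* Store the (possibly zero) refreshed count back. */
                        insn_buf[6] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off_cnt);
                        cnt = 7;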
> @@ -22044,23 +22071,32 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
>
>          env->prog->aux->stack_depth = subprogs[0].stack_depth;
>          for (i = 0; i < env->subprog_cnt; i++) {
> +                int delta = bpf_jit_supports_timed_may_goto() ? 2 : 1;
>                  int subprog_start = subprogs[i].start;
>                  int stack_slots = subprogs[i].stack_extra / 8;
> +                int slots = delta, cnt = 0;
>
>                  if (!stack_slots)
>                          continue;
> -                if (stack_slots > 1) {
> +                /* We need two in case timed may_goto is supported. */
> +                if (stack_slots > slots) {
>                          verbose(env, "verifier bug: stack_slots supports may_goto only\n");
>                          return -EFAULT;
>                  }
>
> -                /* Add ST insn to subprog prologue to init extra stack */
> -                insn_buf[0] = BPF_ST_MEM(BPF_DW, BPF_REG_FP,
> -                                         -subprogs[i].stack_depth, BPF_MAX_LOOPS);
> +                if (bpf_jit_supports_timed_may_goto()) {
> +                        insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -subprogs[i].stack_depth,
> +                                                     BPF_MAX_TIMED_LOOPS);
> +                        insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -subprogs[i].stack_depth + 8, 0);
> +                } else {
> +                        /* Add ST insn to subprog prologue to init extra stack */
> +                        insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -subprogs[i].stack_depth,
> +                                                     BPF_MAX_LOOPS);
> +                }
>                  /* Copy first actual insn to preserve it */
> -                insn_buf[1] = env->prog->insnsi[subprog_start];
> +                insn_buf[cnt++] = env->prog->insnsi[subprog_start];
>
> -                new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, 2);
> +                new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, cnt);
>                  if (!new_prog)
>                          return -ENOMEM;
>                  env->prog = prog = new_prog;
> @@ -22070,7 +22106,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
>                   * to insn after BPF_ST that inits may_goto count.
>                   * Adjustment will succeed because bpf_patch_insn_data() didn't fail.
>                   */
> -                WARN_ON(adjust_jmp_off(env->prog, subprog_start, 1));
> +                WARN_ON(adjust_jmp_off(env->prog, subprog_start, delta));
>          }
>
>          /* Since poke tab is now finalized, publish aux to tracker. */
> --
> 2.43.5
>