On Sun, Mar 2, 2025 at 12:13 PM Kumar Kartikeya Dwivedi <memxor@xxxxxxxxx> wrote:
>
> Implement support in the verifier for replacing the may_goto implementation
> from a counter-based approach to one which samples time on the local CPU,
> to allow a much bigger loop bound.
>
> We implement it by maintaining 16 bytes per stack frame: 8 bytes for the
> count used to amortize time sampling, and 8 bytes for the starting
> timestamp. To minimize overhead, we need to avoid spilling and filling of
> registers around this sequence, so we push this cost into the time
> sampling function 'arch_bpf_timed_may_goto'. This is a JIT-specific
> wrapper around bpf_check_timed_may_goto that returns the count to store
> into the stack through BPF_REG_AX. All caller-saved registers (r0-r5) are
> guaranteed to remain untouched.
>
> The loop can be broken by returning a count of 0; otherwise we dispatch
> into the function when the count becomes 1, and the runtime chooses either
> to refresh it (by returning BPF_MAX_TIMED_LOOPS) or to return 0 and abort
> the loop.
>
> Since the check for 0 is done right after loading the count from the
> stack, all subsequent cond_break sequences should immediately break as
> well.
>
> We pass the stack offset of the count (and thus of the timestamp, by
> adding 8 to it) to the arch_bpf_timed_may_goto call so that it can be
> passed to bpf_check_timed_may_goto as an argument after r1 is saved, by
> adding the offset to r10/fp. This adjustment is arch specific, and the
> next patch will introduce support for x86.
>
> Note that depending on loop complexity, the time spent in the loop can
> exceed the current limit (250 ms), but imposing an upper bound on program
> runtime is an orthogonal problem which will be addressed when program
> cancellations are supported.
>
> The current time afforded by cond_break may not be enough for cases where
> BPF programs want to implement locking algorithms inline, and use
> cond_break as a promise to the verifier that they will eventually
> terminate.
>
> Below are some benchmark numbers for the time taken per iteration in an
> empty loop that counts the number of iterations until cond_break fires.
> For comparison, we measure bpf_for/bpf_repeat, which is another way to
> achieve the same number of spins (BPF_MAX_LOOPS). The hardware used for
> benchmarking was a Sapphire Rapids Intel server with the performance
> governor enabled.
>
> +------------------------------+------------+-----------+----------------+
> | Loop type                    | Iterations | Time (ms) | Time/iter (ns) |
> +------------------------------+------------+-----------+----------------+
> | may_goto                     |    8388608 |         3 |           0.36 |
> | timed_may_goto (count=65535) |  589674932 |       250 |           0.42 |
> | bpf_for                      |    8388608 |        10 |           1.19 |
> +------------------------------+------------+-----------+----------------+
>
> This gives a good approximation at low overhead while staying close to
> the current implementation.
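For readers following along, the benchmark loop described above boils down
to roughly the pattern below. This is only an illustrative sketch, not code
from this series: it assumes the cond_break macro from the selftests'
bpf_experimental.h, and the program, section, and variable names are made
up here.

/* SPDX-License-Identifier: GPL-2.0 */
/* Sketch: spin in an empty loop, counting iterations, until the
 * cond_break budget (may_goto, or timed may_goto with this series)
 * is exhausted and the loop is broken.
 */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_experimental.h"

char _license[] SEC("license") = "GPL";

__u64 iters; /* read back from userspace after the run */

SEC("tc")
int spin_until_break(struct __sk_buff *skb)
{
        while (1) {
                /* Expands to a may_goto that exits the loop once the
                 * verifier-injected budget runs out.
                 */
                cond_break;
                iters++;
        }
        return 0;
}

Timing the run and reading iters back then gives the Iterations and
Time/iter columns in the table above.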
>
> Signed-off-by: Kumar Kartikeya Dwivedi <memxor@xxxxxxxxx>
> ---
>  include/linux/bpf.h    |  1 +
>  include/linux/filter.h |  8 +++++++
>  kernel/bpf/core.c      | 31 +++++++++++++++++++++++++
>  kernel/bpf/verifier.c  | 52 +++++++++++++++++++++++++++++++++++-------
>  4 files changed, 84 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index aec102868b93..788f6ca374e9 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1986,6 +1986,7 @@ struct bpf_array {
>   */
>  enum {
>          BPF_MAX_LOOPS = 8 * 1024 * 1024,
> +        BPF_MAX_TIMED_LOOPS = 0xffff,
>  };
>
>  #define BPF_F_ACCESS_MASK (BPF_F_RDONLY | \
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 3ed6eb9e7c73..02dda5c53d91 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -669,6 +669,11 @@ struct bpf_prog_stats {
>          struct u64_stats_sync syncp;
>  } __aligned(2 * sizeof(u64));
>
> +struct bpf_timed_may_goto {
> +        u64 count;
> +        u64 timestamp;
> +};
> +
>  struct sk_filter {
>          refcount_t refcnt;
>          struct rcu_head rcu;
> @@ -1130,8 +1135,11 @@ bool bpf_jit_supports_ptr_xchg(void);
>  bool bpf_jit_supports_arena(void);
>  bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena);
>  bool bpf_jit_supports_private_stack(void);
> +bool bpf_jit_supports_timed_may_goto(void);
>  u64 bpf_arch_uaddress_limit(void);
>  void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
> +u64 arch_bpf_timed_may_goto(void);
> +u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *);
>  bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id);
>
>  static inline bool bpf_dump_raw_ok(const struct cred *cred)
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index a0200fbbace9..b3f7c7bd08d3 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -3069,6 +3069,37 @@ void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp,
>  {
>  }
>
> +bool __weak bpf_jit_supports_timed_may_goto(void)
> +{
> +        return false;
> +}
> +
> +u64 __weak arch_bpf_timed_may_goto(void)
> +{
> +        return 0;
> +}
> +
> +u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p)
> +{
> +        u64 time = ktime_get_mono_fast_ns();

let's move the call after !p->count check to avoid unused work.

> +
> +        /* If the count is zero, we've already broken a prior loop in this stack
> +         * frame, let's just exit quickly.
> +         */

Let's use normal kernel comment style in all new code.
I think even netdev folks allow both styles now.

> +        if (!p->count)
> +                return 0;
> +        /* Populate the timestamp for this stack frame. */
> +        if (!p->timestamp) {
> +                p->timestamp = time;
> +                return BPF_MAX_TIMED_LOOPS;
> +        }
> +        /* Check if we've exhausted our time slice. */
> +        if (time - p->timestamp >= (NSEC_PER_SEC / 4))
> +                return 0;
> +        /* Refresh the count for the stack frame. */
> +        return BPF_MAX_TIMED_LOOPS;
> +}
> +
>  /* for configs without MMU or 32-bit */
>  __weak const struct bpf_map_ops arena_map_ops;
>  __weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
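With both of the above applied (time sampled only after the early return,
comments in the usual kernel style), the helper could end up looking
roughly like the sketch below. This is just the suggested shape, not a
replacement hunk:

u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p)
{
        u64 time;

        /*
         * If the count is zero, a prior loop in this stack frame has
         * already been broken, so exit quickly without sampling time.
         */
        if (!p->count)
                return 0;

        time = ktime_get_mono_fast_ns();

        /* Populate the timestamp on the first call for this stack frame. */
        if (!p->timestamp) {
                p->timestamp = time;
                return BPF_MAX_TIMED_LOOPS;
        }
        /* Break the loop once the 250 ms time slice is exhausted. */
        if (time - p->timestamp >= (NSEC_PER_SEC / 4))
                return 0;
        /* Otherwise refresh the count for this stack frame. */
        return BPF_MAX_TIMED_LOOPS;
}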
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index dcd0da4e62fc..79bfb1932f40 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -21503,7 +21503,34 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
>                          goto next_insn;
>                  }
>
> -                if (is_may_goto_insn(insn)) {
> +                if (is_may_goto_insn(insn) && bpf_jit_supports_timed_may_goto()) {
> +                        int stack_off_cnt = -stack_depth - 16;
> +
> +                        /* Two 8 byte slots, depth-16 stores the count, and
> +                         * depth-8 stores the start timestamp of the loop.
> +                         */
> +                        stack_depth_extra = 16;
> +                        insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off_cnt);
> +                        if (insn->off >= 0)
> +                                insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 5);
> +                        else
> +                                insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1);
> +                        insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
> +                        insn_buf[3] = BPF_JMP_IMM(BPF_JNE, BPF_REG_AX, 1, 2);

Maybe != 0 instead? Otherwise it's off by 1.

> +                        insn_buf[4] = BPF_MOV64_IMM(BPF_REG_AX, stack_off_cnt);

Please add a comment that BPF_REG_AX is used as an argument register and
contains the return value too.
I looked at a couple of other non-x86 JITs and I think this calling
convention should work for them too.

> +                        insn_buf[5] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_CALL_IMM(arch_bpf_timed_may_goto));

Use BPF_EMIT_CALL() instead?

> +                        insn_buf[6] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off_cnt);
> +                        cnt = 7;
> +
> +                        new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
> +                        if (!new_prog)
> +                                return -ENOMEM;
> +
> +                        delta += cnt - 1;
> +                        env->prog = prog = new_prog;
> +                        insn = new_prog->insnsi + i + delta;
> +                        goto next_insn;
> +                } else if (is_may_goto_insn(insn)) {
>                          int stack_off = -stack_depth - 8;
>
>                          stack_depth_extra = 8;
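Putting the three comments above together, the patched sequence might end
up looking something like the sketch below (illustrative only; whether to
keep the != 0 check and BPF_EMIT_CALL() is for the next revision to
decide):

                        stack_depth_extra = 16;
                        /* AX = count; if it is already 0, take the exit edge. */
                        insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off_cnt);
                        if (insn->off >= 0)
                                insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 5);
                        else
                                insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1);
                        /* Decrement, and only call into the runtime when it hits 0. */
                        insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
                        insn_buf[3] = BPF_JMP_IMM(BPF_JNE, BPF_REG_AX, 0, 2);
                        /*
                         * BPF_REG_AX doubles as the argument register (it
                         * carries stack_off_cnt, to be added to r10/fp by
                         * the arch wrapper) and as the return value holding
                         * the refreshed count.
                         */
                        insn_buf[4] = BPF_MOV64_IMM(BPF_REG_AX, stack_off_cnt);
                        insn_buf[5] = BPF_EMIT_CALL(arch_bpf_timed_may_goto);
                        /* Store the (possibly zero) refreshed count back. */
                        insn_buf[6] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off_cnt);
                        cnt = 7;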
> @@ -22044,23 +22071,32 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
>
>          env->prog->aux->stack_depth = subprogs[0].stack_depth;
>          for (i = 0; i < env->subprog_cnt; i++) {
> +                int delta = bpf_jit_supports_timed_may_goto() ? 2 : 1;
>                  int subprog_start = subprogs[i].start;
>                  int stack_slots = subprogs[i].stack_extra / 8;
> +                int slots = delta, cnt = 0;
>
>                  if (!stack_slots)
>                          continue;
> -                if (stack_slots > 1) {
> +                /* We need two in case timed may_goto is supported. */
> +                if (stack_slots > slots) {
>                          verbose(env, "verifier bug: stack_slots supports may_goto only\n");
>                          return -EFAULT;
>                  }
>
> -                /* Add ST insn to subprog prologue to init extra stack */
> -                insn_buf[0] = BPF_ST_MEM(BPF_DW, BPF_REG_FP,
> -                                         -subprogs[i].stack_depth, BPF_MAX_LOOPS);
> +                if (bpf_jit_supports_timed_may_goto()) {
> +                        insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -subprogs[i].stack_depth,
> +                                                     BPF_MAX_TIMED_LOOPS);
> +                        insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -subprogs[i].stack_depth + 8, 0);
> +                } else {
> +                        /* Add ST insn to subprog prologue to init extra stack */
> +                        insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -subprogs[i].stack_depth,
> +                                                     BPF_MAX_LOOPS);
> +                }
>                  /* Copy first actual insn to preserve it */
> -                insn_buf[1] = env->prog->insnsi[subprog_start];
> +                insn_buf[cnt++] = env->prog->insnsi[subprog_start];
>
> -                new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, 2);
> +                new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, cnt);
>                  if (!new_prog)
>                          return -ENOMEM;
>                  env->prog = prog = new_prog;
> @@ -22070,7 +22106,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
>                   * to insn after BPF_ST that inits may_goto count.
>                   * Adjustment will succeed because bpf_patch_insn_data() didn't fail.
>                   */
> -                WARN_ON(adjust_jmp_off(env->prog, subprog_start, 1));
> +                WARN_ON(adjust_jmp_off(env->prog, subprog_start, delta));
>          }
>
>          /* Since poke tab is now finalized, publish aux to tracker. */
> --
> 2.43.5
>