On Tue, Feb 13, 2024 at 9:47 PM Leon Hwang <hffilwlqm@xxxxxxxxx> wrote:
>
>
>
> On 2024/1/5 12:15, Alexei Starovoitov wrote:
> > On Thu, Jan 4, 2024 at 6:23 AM Leon Hwang <hffilwlqm@xxxxxxxxx> wrote:
> >>
> >>
> >
> > Other alternatives?
>
> I've finished the POC of an alternative, which passed all tailcall
> selftests, including these tailcall hierarchy ones.
>
> In this alternative, I use a new bpf_prog_run_ctx to wrap the original
> ctx and the tcc_ptr, then get the tcc_ptr and recover the original ctx
> in the JIT.
>
> Then, to avoid breaking the tailcall runtime on other arches, I add an
> arch-related check, bpf_jit_supports_tail_call_cnt_ptr(), to determine
> whether to use bpf_prog_run_ctx.
>
> Here's the diff:
>
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index 4065bdcc5b2a4..56cea2676863e 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -259,7 +259,7 @@ struct jit_context {
>  /* Number of bytes emit_patch() needs to generate instructions */
>  #define X86_PATCH_SIZE 5
>  /* Number of bytes that will be skipped on tailcall */
> -#define X86_TAIL_CALL_OFFSET (22 + ENDBR_INSN_SIZE)
> +#define X86_TAIL_CALL_OFFSET (16 + ENDBR_INSN_SIZE)
>
>  static void push_r12(u8 **pprog)
>  {
> @@ -407,21 +407,19 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
>  	emit_nops(&prog, X86_PATCH_SIZE);
>  	if (!ebpf_from_cbpf) {
>  		if (tail_call_reachable && !is_subprog) {
> -			/* When it's the entry of the whole tailcall context,
> -			 * zeroing rax means initialising tail_call_cnt.
> -			 */
> -			EMIT2(0x31, 0xC0); /* xor eax, eax */
> -			EMIT1(0x50); /* push rax */
> -			/* Make rax as ptr that points to tail_call_cnt. */
> -			EMIT3(0x48, 0x89, 0xE0); /* mov rax, rsp */
> -			EMIT1_off32(0xE8, 2); /* call main prog */
> -			EMIT1(0x59); /* pop rcx, get rid of tail_call_cnt */
> -			EMIT1(0xC3); /* ret */
> +			/* Make rax as tcc_ptr. */
> +			EMIT4(0x48, 0x8B, 0x47, 0x08); /* mov rax, qword ptr [rdi + 8] */
>  		} else {
> -			/* Keep the same instruction size. */
> -			emit_nops(&prog, 13);
> +			/* Keep the same instruction layout. */
> +			emit_nops(&prog, 4);
>  		}
>  	}
> +	if (!is_subprog)
> +		/* Recover the original ctx. */
> +		EMIT3(0x48, 0x8B, 0x3F); /* mov rdi, qword ptr [rdi] */
> +	else
> +		/* Keep the same instruction layout. */
> +		emit_nops(&prog, 3);
>  	/* Exception callback receives FP as third parameter */
>  	if (is_exception_cb) {
>  		EMIT3(0x48, 0x89, 0xF4); /* mov rsp, rsi */
> @@ -3152,6 +3150,12 @@ bool bpf_jit_supports_subprog_tailcalls(void)
>  	return true;
>  }
>
> +/* Indicate the JIT backend supports tail call count pointer in tailcall context. */
> +bool bpf_jit_supports_tail_call_cnt_ptr(void)
> +{
> +	return true;
> +}
> +
>  void bpf_jit_free(struct bpf_prog *prog)
>  {
>  	if (prog->jited) {
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 7671530d6e4e0..fea4326c27d31 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1919,6 +1919,11 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
>  			u64 bpf_cookie,
>  			struct bpf_prog_array **new_array);
>
> +struct bpf_prog_run_ctx {
> +	const void *ctx;
> +	u32 *tail_call_cnt;
> +};
> +
>  struct bpf_run_ctx {};
>
>  struct bpf_cg_run_ctx {
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 68fb6c8142fec..c1c035c44b4ab 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -629,6 +629,10 @@ typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx,
>  					  unsigned int (*bpf_func)(const void *,
>  								   const struct bpf_insn *));
>
> +static __always_inline u32 __bpf_prog_run_dfunc(const struct bpf_prog *prog,
> +						const void *ctx,
> +						bpf_dispatcher_fn dfunc);
> +
>  static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
>  					  const void *ctx,
>  					  bpf_dispatcher_fn dfunc)
> @@ -641,14 +645,14 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
>  		u64 start = sched_clock();
>  		unsigned long flags;
>
> -		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> +		ret = __bpf_prog_run_dfunc(prog, ctx, dfunc);
>  		stats = this_cpu_ptr(prog->stats);
>  		flags = u64_stats_update_begin_irqsave(&stats->syncp);
>  		u64_stats_inc(&stats->cnt);
>  		u64_stats_add(&stats->nsecs, sched_clock() - start);
>  		u64_stats_update_end_irqrestore(&stats->syncp, flags);
>  	} else {
> -		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> +		ret = __bpf_prog_run_dfunc(prog, ctx, dfunc);
>  	}
>  	return ret;
>  }
> @@ -952,12 +956,31 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
>  void bpf_jit_compile(struct bpf_prog *prog);
>  bool bpf_jit_needs_zext(void);
>  bool bpf_jit_supports_subprog_tailcalls(void);
> +bool bpf_jit_supports_tail_call_cnt_ptr(void);
>  bool bpf_jit_supports_kfunc_call(void);
>  bool bpf_jit_supports_far_kfunc_call(void);
>  bool bpf_jit_supports_exceptions(void);
>  void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
>  bool bpf_helper_changes_pkt_data(void *func);
>
> +static __always_inline u32 __bpf_prog_run_dfunc(const struct bpf_prog *prog,
> +						const void *ctx,
> +						bpf_dispatcher_fn dfunc)
> +{
> +	struct bpf_prog_run_ctx run_ctx = {};
> +	u32 ret, tcc = 0;
> +
> +	run_ctx.ctx = ctx;
> +	run_ctx.tail_call_cnt = &tcc;
> +
> +	if (bpf_jit_supports_tail_call_cnt_ptr() && prog->jited)
> +		ret = dfunc(&run_ctx, prog->insnsi, prog->bpf_func);
> +	else
> +		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);

This is no good either. We cannot introduce two extra run-time checks
before calling every bpf prog. The solution must be overhead-free for
common cases. Can we switch to a percpu tail_call_cnt instead of the
on-stack and %rax tricks? If that won't work, then we'd have to disable
tail_calls from subprogs in the verifier.
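
To make the percpu direction concrete, here's a rough sketch of what a
per-CPU tail_call_cnt could look like. The helper names
(bpf_reset_tail_call_cnt(), bpf_inc_tail_call_cnt()) are made up for
illustration, and the sketch deliberately ignores how nesting or
preemption of tail-call-reachable progs on the same CPU would be handled:

/* Sketch only: a per-CPU tail call counter that the JIT could reset and
 * bump instead of threading a counter pointer through the stack or %rax.
 * Helper names are hypothetical; nesting/preemption is not handled.
 */
#include <linux/bpf.h>		/* MAX_TAIL_CALL_CNT */
#include <linux/percpu.h>

static DEFINE_PER_CPU(u32, bpf_tail_call_cnt);

/* Would run at entry of a tail-call-reachable main prog, replacing the
 * on-stack tail_call_cnt setup in the prologue.
 */
static __always_inline void bpf_reset_tail_call_cnt(void)
{
	__this_cpu_write(bpf_tail_call_cnt, 0);
}

/* Would back the JIT's tail call limit check: returns true while another
 * tail call is still allowed on this CPU.
 */
static __always_inline bool bpf_inc_tail_call_cnt(void)
{
	return __this_cpu_inc_return(bpf_tail_call_cnt) <= MAX_TAIL_CALL_CNT;
}

The upside would be that the prologue and bpf_prog_run() stay exactly as
they are today for progs without tail calls; the open question is what
happens to the per-CPU counter when tail-call-reachable progs nest or are
preempted on the same CPU.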