On 2024/1/5 12:15, Alexei Starovoitov wrote:
> On Thu, Jan 4, 2024 at 6:23 AM Leon Hwang <hffilwlqm@xxxxxxxxx> wrote:
>>
>>
>
> Other alternatives?

I've finished the PoC of an alternative, which passes all tailcall selftests,
including these tailcall hierarchy ones.

In this alternative, I use a new struct bpf_prog_run_ctx to wrap the original
ctx and the tcc_ptr; the JIT-ed prologue then loads the tcc_ptr and recovers
the original ctx from the wrapper. Then, to avoid breaking the tailcall
runtime on other arches, I add an arch-specific check,
bpf_jit_supports_tail_call_cnt_ptr(), to determine whether to use
bpf_prog_run_ctx.

Here's the diff:

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 4065bdcc5b2a4..56cea2676863e 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -259,7 +259,7 @@ struct jit_context {
 /* Number of bytes emit_patch() needs to generate instructions */
 #define X86_PATCH_SIZE		5
 /* Number of bytes that will be skipped on tailcall */
-#define X86_TAIL_CALL_OFFSET	(22 + ENDBR_INSN_SIZE)
+#define X86_TAIL_CALL_OFFSET	(16 + ENDBR_INSN_SIZE)
 
 static void push_r12(u8 **pprog)
 {
@@ -407,21 +407,19 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
 	emit_nops(&prog, X86_PATCH_SIZE);
 	if (!ebpf_from_cbpf) {
 		if (tail_call_reachable && !is_subprog) {
-			/* When it's the entry of the whole tailcall context,
-			 * zeroing rax means initialising tail_call_cnt.
-			 */
-			EMIT2(0x31, 0xC0); /* xor eax, eax */
-			EMIT1(0x50);       /* push rax */
-			/* Make rax as ptr that points to tail_call_cnt. */
-			EMIT3(0x48, 0x89, 0xE0); /* mov rax, rsp */
-			EMIT1_off32(0xE8, 2);    /* call main prog */
-			EMIT1(0x59);             /* pop rcx, get rid of tail_call_cnt */
-			EMIT1(0xC3);             /* ret */
+			/* Make rax as tcc_ptr. */
+			EMIT4(0x48, 0x8B, 0x47, 0x08); /* mov rax, qword ptr [rdi + 8] */
 		} else {
-			/* Keep the same instruction size. */
-			emit_nops(&prog, 13);
+			/* Keep the same instruction layout. */
+			emit_nops(&prog, 4);
 		}
 	}
+	if (!is_subprog)
+		/* Recover the original ctx. */
+		EMIT3(0x48, 0x8B, 0x3F); /* mov rdi, qword ptr [rdi] */
+	else
+		/* Keep the same instruction layout. */
+		emit_nops(&prog, 3);
 	/* Exception callback receives FP as third parameter */
 	if (is_exception_cb) {
 		EMIT3(0x48, 0x89, 0xF4); /* mov rsp, rsi */
@@ -3152,6 +3150,12 @@ bool bpf_jit_supports_subprog_tailcalls(void)
 	return true;
 }
 
+/* Indicate the JIT backend supports tail call count pointer in tailcall context. */
+bool bpf_jit_supports_tail_call_cnt_ptr(void)
+{
+	return true;
+}
+
 void bpf_jit_free(struct bpf_prog *prog)
 {
 	if (prog->jited) {
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7671530d6e4e0..fea4326c27d31 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1919,6 +1919,11 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 			u64 bpf_cookie,
 			struct bpf_prog_array **new_array);
 
+struct bpf_prog_run_ctx {
+	const void *ctx;
+	u32 *tail_call_cnt;
+};
+
 struct bpf_run_ctx {};
 
 struct bpf_cg_run_ctx {
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 68fb6c8142fec..c1c035c44b4ab 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -629,6 +629,10 @@ typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx,
 					  unsigned int (*bpf_func)(const void *,
 								   const struct bpf_insn *));
 
+static __always_inline u32 __bpf_prog_run_dfunc(const struct bpf_prog *prog,
+						const void *ctx,
+						bpf_dispatcher_fn dfunc);
+
 static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
 					  const void *ctx,
 					  bpf_dispatcher_fn dfunc)
@@ -641,14 +645,14 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
 		u64 start = sched_clock();
 		unsigned long flags;
 
-		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
+		ret = __bpf_prog_run_dfunc(prog, ctx, dfunc);
 		stats = this_cpu_ptr(prog->stats);
 		flags = u64_stats_update_begin_irqsave(&stats->syncp);
 		u64_stats_inc(&stats->cnt);
 		u64_stats_add(&stats->nsecs, sched_clock() - start);
 		u64_stats_update_end_irqrestore(&stats->syncp, flags);
 	} else {
-		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
+		ret = __bpf_prog_run_dfunc(prog, ctx, dfunc);
 	}
 	return ret;
 }
@@ -952,12 +956,31 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
 void bpf_jit_compile(struct bpf_prog *prog);
 bool bpf_jit_needs_zext(void);
 bool bpf_jit_supports_subprog_tailcalls(void);
+bool bpf_jit_supports_tail_call_cnt_ptr(void);
 bool bpf_jit_supports_kfunc_call(void);
 bool bpf_jit_supports_far_kfunc_call(void);
 bool bpf_jit_supports_exceptions(void);
 void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
 bool bpf_helper_changes_pkt_data(void *func);
 
+static __always_inline u32 __bpf_prog_run_dfunc(const struct bpf_prog *prog,
+						const void *ctx,
+						bpf_dispatcher_fn dfunc)
+{
+	struct bpf_prog_run_ctx run_ctx = {};
+	u32 ret, tcc = 0;
+
+	run_ctx.ctx = ctx;
+	run_ctx.tail_call_cnt = &tcc;
+
+	if (bpf_jit_supports_tail_call_cnt_ptr() && prog->jited)
+		ret = dfunc(&run_ctx, prog->insnsi, prog->bpf_func);
+	else
+		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
+
+	return ret;
+}
+
 static inline bool bpf_dump_raw_ok(const struct cred *cred)
 {
 	/* Reconstruction of call-sites is dependent on kallsyms,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ea6843be2616c..80b20e99456f0 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2915,6 +2915,15 @@ bool __weak bpf_jit_supports_subprog_tailcalls(void)
 	return false;
 }
 
+/* Return TRUE if the JIT backend supports tail call count pointer in tailcall
+ * context.
+ */
+bool __weak bpf_jit_supports_tail_call_cnt_ptr(void)
+{
+	return false;
+}
+EXPORT_SYMBOL(bpf_jit_supports_tail_call_cnt_ptr);
+
 bool __weak bpf_jit_supports_kfunc_call(void)
 {
 	return false;
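For reference, the two new prologue instructions depend on the field layout of
struct bpf_prog_run_ctx on x86-64: the original ctx sits at offset 0 (recovered
by "mov rdi, qword ptr [rdi]") and the tcc_ptr at offset 8 (loaded by
"mov rax, qword ptr [rdi + 8]"). A minimal user-space sketch of that layout
assumption, illustrative only and not part of the patch:

/* Illustrative sketch only: mirrors struct bpf_prog_run_ctx from the diff
 * above and checks the offsets that the new prologue instructions rely on.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct bpf_prog_run_ctx {
	const void *ctx;		/* recovered via mov rdi, [rdi]     */
	uint32_t *tail_call_cnt;	/* loaded via mov rax, [rdi + 8]    */
};

int main(void)
{
	assert(offsetof(struct bpf_prog_run_ctx, ctx) == 0);
	assert(offsetof(struct bpf_prog_run_ctx, tail_call_cnt) == 8);
	return 0;
}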
ERROR: modpost: "bpf_jit_supports_tail_call_cnt_ptr" [net/netfilter/xt_bpf.ko] undefined! ERROR: modpost: "bpf_jit_supports_tail_call_cnt_ptr" [net/ipv6/ipv6.ko] undefined! I'm not familiar with this building error. Is it OK to use EXPORT_SYMBOL here? Thanks, Leon