On 2024/2/15 07:16, Alexei Starovoitov wrote:
> On Tue, Feb 13, 2024 at 9:47 PM Leon Hwang <hffilwlqm@xxxxxxxxx> wrote:
>>
>>
>>
>> On 2024/1/5 12:15, Alexei Starovoitov wrote:
>>> On Thu, Jan 4, 2024 at 6:23 AM Leon Hwang <hffilwlqm@xxxxxxxxx> wrote:
>>>>
>>>>
>>>
>>> Other alternatives?
>>
>> I've finish the POC of an alternative, which passed all tailcall
>> selftests including these tailcall hierarchy ones.
>>
>> In this alternative, I use a new bpf_prog_run_ctx to wrap the original
>> ctx and the tcc_ptr, then get the tcc_ptr and recover the original ctx
>> in JIT.
>>
>> Then, to avoid breaking runtime with tailcall on other arch, I add an
>> arch-related check bpf_jit_supports_tail_call_cnt_ptr() to determin
>> whether to use bpf_prog_run_ctx.
>>
>> [SNIP]
>>
>> +
>> +	if (bpf_jit_supports_tail_call_cnt_ptr() && prog->jited)
>> +		ret = dfunc(&run_ctx, prog->insnsi, prog->bpf_func);
>> +	else
>> +		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
>
> This is no good either.
> We cannot introduce two extra run-time checks before calling every bpf prog.
> The solution must be overhead free for common cases.
>
> Can we switch to percpu tail_call_cnt instead of on stack and %rax tricks ?
>

Good idea to use a percpu tail_call_cnt. I did another POC that uses a
percpu tail_call_cnt, and it passes all tailcall selftests too.

In this POC, to prepare tcc_ptr in the x86 JIT prologue, the prologue
calls bpf_tail_call_cnt_prepare() to get the pointer to the percpu
tail_call_cnt and stores that pointer in %rax.

Here's the diff:

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 4065bdcc5b2a4..fc1df6a7d87c9 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -241,6 +241,8 @@ int bpf_arch_text_invalidate(void *dst, size_t len)
 }
 
 struct jit_context {
+	int prologue_tail_call_offset;
+
 	int cleanup_addr; /* Epilogue code offset */
 
 	/*
@@ -250,6 +252,8 @@ struct jit_context {
 	 */
 	int tail_call_direct_label;
 	int tail_call_indirect_label;
+
+	bool tail_call_reachable;
 };
 
 /* Maximum number of bytes emitted while JITing one eBPF insn */
@@ -259,7 +263,7 @@ struct jit_context {
 /* Number of bytes emit_patch() needs to generate instructions */
 #define X86_PATCH_SIZE		5
 /* Number of bytes that will be skipped on tailcall */
-#define X86_TAIL_CALL_OFFSET	(22 + ENDBR_INSN_SIZE)
+#define X86_TAIL_CALL_OFFSET	(14 + ENDBR_INSN_SIZE)
 
 static void push_r12(u8 **pprog)
 {
@@ -389,6 +393,19 @@ static void emit_cfi(u8 **pprog, u32 hash)
 	*pprog = prog;
 }
 
+DEFINE_PER_CPU(u32, bpf_tail_call_cnt);
+
+__attribute__((used))
+static u32 *bpf_tail_call_cnt_prepare(void)
+{
+	u32 *tcc_ptr = this_cpu_ptr(&bpf_tail_call_cnt);
+
+	/* Initialise tail_call_cnt. */
+	*tcc_ptr = 0;
+
+	return tcc_ptr;
+}
+
 /*
  * Emit x86-64 prologue code for BPF program.
  * bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes
@@ -396,7 +413,7 @@ static void emit_cfi(u8 **pprog, u32 hash)
  */
 static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
 			  bool tail_call_reachable, bool is_subprog,
-			  bool is_exception_cb)
+			  bool is_exception_cb, struct jit_context *ctx)
 {
 	u8 *prog = *pprog;
 
@@ -406,21 +423,15 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
 	 */
 	emit_nops(&prog, X86_PATCH_SIZE);
 	if (!ebpf_from_cbpf) {
-		if (tail_call_reachable && !is_subprog) {
-			/* When it's the entry of the whole tailcall context,
-			 * zeroing rax means initialising tail_call_cnt.
-			 */
-			EMIT2(0x31, 0xC0); /* xor eax, eax */
-			EMIT1(0x50);	   /* push rax */
-			/* Make rax as ptr that points to tail_call_cnt. */
-			EMIT3(0x48, 0x89, 0xE0); /* mov rax, rsp */
-			EMIT1_off32(0xE8, 2);	 /* call main prog */
-			EMIT1(0x59);		 /* pop rcx, get rid of tail_call_cnt */
-			EMIT1(0xC3);		 /* ret */
-		} else {
-			/* Keep the same instruction size. */
-			emit_nops(&prog, 13);
-		}
+		/* These 5-byte nops are reserved for emit_call() to call
+		 * bpf_tail_call_cnt_prepare() later.
+		 *
+		 * After calling bpf_tail_call_cnt_prepare(), %rax will hold
+		 * the tail_call_cnt pointer that points to an initialised
+		 * PER-CPU tail_call_cnt.
+		 */
+		ctx->prologue_tail_call_offset = prog - *pprog;
+		emit_nops(&prog, X86_PATCH_SIZE);
 	}
 	/* Exception callback receives FP as third parameter */
 	if (is_exception_cb) {
@@ -583,6 +594,17 @@ static void emit_return(u8 **pprog, u8 *ip)
 	*pprog = prog;
 }
 
+static void bpf_tail_call_prologue_fixup(u8 *image, struct bpf_prog *prog,
+					 struct jit_context *ctx)
+{
+	bool ebpf_from_cbpf = bpf_prog_was_classic(prog);
+	u8 *ip = image + ctx->prologue_tail_call_offset;
+
+	if (!ebpf_from_cbpf && ctx->tail_call_reachable && !bpf_is_subprog(prog))
+		__bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL,
+				     bpf_tail_call_cnt_prepare);
+}
+
 /*
  * Generate the following code:
  *
@@ -1165,10 +1187,12 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 
 	/* tail call's presence in current prog implies it is reachable */
 	tail_call_reachable |= tail_call_seen;
+	ctx->tail_call_reachable = tail_call_reachable;
 
 	emit_prologue(&prog, bpf_prog->aux->stack_depth,
 		      bpf_prog_was_classic(bpf_prog), tail_call_reachable,
-		      bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb);
+		      bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb,
+		      ctx);
 	/* Exception callback will clobber callee regs for its own use, and
 	 * restore the original callee regs from main prog's stack frame.
 	 */
@@ -3097,6 +3121,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 		}
 
 		bpf_tail_call_direct_fixup(prog);
+		bpf_tail_call_prologue_fixup(image, prog, &ctx);
 	} else {
 		jit_data->addrs = addrs;
 		jit_data->ctx = ctx;

Thanks,
Leon
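P.S. To make the new X86_TAIL_CALL_OFFSET value easier to review, here is how
I read the prologue of a (non-exception-cb) entry prog once
bpf_tail_call_prologue_fixup() has patched the reserved nops. This is only my
own sketch derived from the emitted instructions above, not part of the diff:

	endbr64				/* ENDBR_INSN_SIZE bytes, only with IBT */
	nop5				/* X86_PATCH_SIZE nops kept for fentry patching */
	call bpf_tail_call_cnt_prepare	/* 5 bytes, poked in over the reserved nops;
					 * returns tcc_ptr in %rax
					 */
	push rbp			/* 1 byte */
	mov rbp, rsp			/* 3 bytes */
	...				/* a tail call enters here, skipping
					 * X86_TAIL_CALL_OFFSET
					 *   = 5 + 5 + 1 + 3 = 14 (+ ENDBR_INSN_SIZE)
					 * bytes, so only the real entry of the
					 * tailcall context resets the percpu
					 * tail_call_cnt
					 */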