On 2023/8/18 23:12, Leon Hwang wrote: > From commit ebf7d1f508a73871 ("bpf, x64: rework pro/epilogue and tailcall > handling in JIT"), the tailcall on x64 works better than before. > > From commit e411901c0b775a3a ("bpf: allow for tailcalls in BPF subprograms > for x64 JIT"), tailcall is able to run in BPF subprograms on x64. > > From commit 5b92a28aae4dd0f8 ("bpf: Support attaching tracing BPF program > to other BPF programs"), BPF program is able to trace other BPF programs. > > How about combining them all together? > > 1. FENTRY/FEXIT on a BPF subprogram. > 2. A tailcall runs in the BPF subprogram. > 3. The tailcall calls itself. > > As a result, a tailcall infinite loop comes up. And the loop would halt > the machine. > > As we know, in tail call context, the tail_call_cnt propagates by stack > and RAX register between BPF subprograms. So do it in trampolines. > > Signed-off-by: Leon Hwang <hffilwlqm@xxxxxxxxx> > --- > arch/x86/net/bpf_jit_comp.c | 40 +++++++++++++++++++++++++++++-------- > include/linux/bpf.h | 5 +++++ > kernel/bpf/trampoline.c | 4 ++-- > kernel/bpf/verifier.c | 31 +++++++++++++++++++++------- > 4 files changed, 63 insertions(+), 17 deletions(-) > > diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c > index a5930042139d3..1ad17d7de5eee 100644 > --- a/arch/x86/net/bpf_jit_comp.c > +++ b/arch/x86/net/bpf_jit_comp.c > @@ -303,8 +303,12 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, > prog += X86_PATCH_SIZE; > if (!ebpf_from_cbpf) { > if (tail_call_reachable && !is_subprog) > + /* When it's the entry of the whole tailcall context, > + * zeroing rax means initialising tail_call_cnt. > + */ > EMIT2(0x31, 0xC0); /* xor eax, eax */ > else > + // Keep the same instruction layout. 
> EMIT2(0x66, 0x90); /* nop2 */ > } > EMIT1(0x55); /* push rbp */ > @@ -1018,6 +1022,10 @@ static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op) > > #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp))) > > +/* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */ > +#define RESTORE_TAIL_CALL_CNT(stack) \ > + EMIT3_off32(0x48, 0x8B, 0x85, -round_up(stack, 8) - 8) > + > static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image, > int oldproglen, struct jit_context *ctx, bool jmp_padding) > { > @@ -1623,9 +1631,7 @@ st: if (is_imm8(insn->off)) > > func = (u8 *) __bpf_call_base + imm32; > if (tail_call_reachable) { > - /* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */ > - EMIT3_off32(0x48, 0x8B, 0x85, > - -round_up(bpf_prog->aux->stack_depth, 8) - 8); > + RESTORE_TAIL_CALL_CNT(bpf_prog->aux->stack_depth); > if (!imm32) > return -EINVAL; > offs = 7 + x86_call_depth_emit_accounting(&prog, func); > @@ -2298,7 +2304,9 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, > * push rbp > * mov rbp, rsp > * sub rsp, 16 // space for skb and dev > - * push rbx // temp regs to pass start time > + * mov qword ptr [rbp - 40], rbx // temp regs to pass start time > + * mov rax, 2 // cache number of argument to rax > + * mov qword ptr [rbp - 32], rax // save number of argument to stack > * mov qword ptr [rbp - 16], rdi // save skb pointer to stack > * mov qword ptr [rbp - 8], rsi // save dev pointer to stack > * call __bpf_prog_enter // rcu_read_lock and preempt_disable > @@ -2323,7 +2331,9 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, > * push rbp > * mov rbp, rsp > * sub rsp, 24 // space for skb, dev, return value > - * push rbx // temp regs to pass start time > + * mov qword ptr [rbp - 40], rbx // temp regs to pass start time > + * mov rax, 2 // cache number of argument to rax > + * mov qword ptr [rbp - 32], rax // save number of argument to stack > * mov 
qword ptr [rbp - 24], rdi // save skb pointer to stack > * mov qword ptr [rbp - 16], rsi // save dev pointer to stack > * call __bpf_prog_enter // rcu_read_lock and preempt_disable > @@ -2400,6 +2410,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i > * [ ... ] > * [ stack_arg2 ] > * RBP - arg_stack_off [ stack_arg1 ] > + * RSP [ tail_call_cnt ] BPF_TRAMP_F_TAIL_CALL_CTX > */ > > /* room for return value of orig_call or fentry prog */ > @@ -2464,6 +2475,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i > else > /* sub rsp, stack_size */ > EMIT4(0x48, 0x83, 0xEC, stack_size); > + if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) > + EMIT1(0x50); /* push rax */ > /* mov QWORD PTR [rbp - rbx_off], rbx */ > emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_6, -rbx_off); > > @@ -2516,9 +2529,15 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i > restore_regs(m, &prog, regs_off); > save_args(m, &prog, arg_stack_off, true); > > + if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) > + /* Before calling the original function, restore the > + * tail_call_cnt from stack to rax. > + */ > + RESTORE_TAIL_CALL_CNT(stack_size); > + > if (flags & BPF_TRAMP_F_ORIG_STACK) { > - emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8); > - EMIT2(0xff, 0xd0); /* call *rax */ > + emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, 8); > + EMIT2(0xff, 0xd3); /* call *rbx */ // FIXME: Confirm 0xd3? To avoid a conflict over rax with the tail call, I changed the calling register from rax to rbx. But I'm unable to confirm the opcode, so I asked ChatGPT to list `call` instructions and their corresponding opcodes: Certainly!
Here's a table that provides `call` instructions along with their corresponding opcodes in x86-64 assembly: | `call` Register | Opcode (Hex) | Opcode (Binary) | |-----------------|--------------|-----------------| | `rax` | `FF D0` | `11111111 11010000` | | `rcx` | `FF D1` | `11111111 11010001` | | `rdx` | `FF D2` | `11111111 11010010` | | `rbx` | `FF D3` | `11111111 11010011` | | `rsp` | `FF D4` | `11111111 11010100` | | `rbp` | `FF D5` | `11111111 11010101` | | `rsi` | `FF D6` | `11111111 11010110` | | `rdi` | `FF D7` | `11111111 11010111` | | `r8` | `41 FF D0` | `01000001 11111111 11010000` | | `r9` | `41 FF D1` | `01000001 11111111 11010001` | | `r10` | `41 FF D2` | `01000001 11111111 11010010` | | `r11` | `41 FF D3` | `01000001 11111111 11010011` | | `r12` | `41 FF D4` | `01000001 11111111 11010100` | | `r13` | `41 FF D5` | `01000001 11111111 11010101` | | `r14` | `41 FF D6` | `01000001 11111111 11010110` | | `r15` | `41 FF D7` | `01000001 11111111 11010111` | EMIT2(0xff, 0xd3); /* call *rbx */, is it right? Thanks, Leon [...]