Two functions are added to the kernel:
  - int notrace __bpf_prog_enter_recur_limited(struct bpf_prog *prog)
  - void notrace __bpf_prog_exit_recur_limited(struct bpf_prog *prog)
They are called from bpf progs through jit-emitted code.
__bpf_prog_enter_recur_limited() returns 0 if the maximum recursion
level has been reached, in which case the bpf prog returns to its
caller directly. Otherwise, it returns the current recursion level,
which the jit uses to calculate the proper private frame pointer for
that recursion level.

Signed-off-by: Yonghong Song <yonghong.song@xxxxxxxxx>
---
 arch/x86/net/bpf_jit_comp.c | 94 +++++++++++++++++++++++++++++++++----
 include/linux/bpf.h         |  2 +
 kernel/bpf/trampoline.c     | 16 +++++++
 3 files changed, 104 insertions(+), 8 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 297dd64f4b6a..a763e018e87f 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -501,7 +501,8 @@ static void emit_prologue_tail_call(u8 **pprog, bool is_subprog)
 }
 
 static void emit_priv_frame_ptr(u8 **pprog, struct bpf_prog *bpf_prog,
-				enum bpf_priv_stack_mode priv_stack_mode);
+				enum bpf_priv_stack_mode priv_stack_mode,
+				bool is_subprog, u8 *image, u8 *temp);
 
 /*
  * Emit x86-64 prologue code for BPF program.
@@ -510,7 +511,8 @@ static void emit_priv_frame_ptr(u8 **pprog, struct bpf_prog *bpf_prog,
  */
 static void emit_prologue(u8 **pprog, u32 stack_depth, struct bpf_prog *bpf_prog,
 			  bool tail_call_reachable,
-			  enum bpf_priv_stack_mode priv_stack_mode)
+			  enum bpf_priv_stack_mode priv_stack_mode, u8 *image,
+			  u8 *temp)
 {
 	bool ebpf_from_cbpf = bpf_prog_was_classic(bpf_prog);
 	bool is_exception_cb = bpf_prog->aux->exception_cb;
@@ -554,7 +556,7 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, struct bpf_prog *bpf_prog
 	/* X86_TAIL_CALL_OFFSET is here */
 	EMIT_ENDBR();
 
-	emit_priv_frame_ptr(&prog, bpf_prog, priv_stack_mode);
+	emit_priv_frame_ptr(&prog, bpf_prog, priv_stack_mode, is_subprog, image, temp);
 
 	/* sub rsp, rounded_stack_depth */
 	if (stack_depth)
@@ -696,6 +698,15 @@ static void emit_return(u8 **pprog, u8 *ip)
 	*pprog = prog;
 }
 
+static int num_bytes_of_emit_return(void)
+{
+	if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
+		return 5;
+	if (IS_ENABLED(CONFIG_MITIGATION_SLS))
+		return 2;
+	return 1;
+}
+
 #define BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack)	(-16 - round_up(stack, 8))
 
 /*
@@ -1527,17 +1538,67 @@ static void emit_root_priv_frame_ptr(u8 **pprog, struct bpf_prog *bpf_prog,
 }
 
 static void emit_priv_frame_ptr(u8 **pprog, struct bpf_prog *bpf_prog,
-				enum bpf_priv_stack_mode priv_stack_mode)
+				enum bpf_priv_stack_mode priv_stack_mode,
+				bool is_subprog, u8 *image, u8 *temp)
 {
 	u32 orig_stack_depth = round_up(bpf_prog->aux->stack_depth, 8);
 	u8 *prog = *pprog;
 
-	if (priv_stack_mode == PRIV_STACK_ROOT_PROG)
-		emit_root_priv_frame_ptr(&prog, bpf_prog, orig_stack_depth);
-	else if (priv_stack_mode == PRIV_STACK_SUB_PROG && orig_stack_depth)
+	if (priv_stack_mode == PRIV_STACK_ROOT_PROG) {
+		int offs;
+		u8 *func;
+
+		if (!bpf_prog->aux->has_prog_call) {
+			emit_root_priv_frame_ptr(&prog, bpf_prog, orig_stack_depth);
+		} else {
+			EMIT1(0x57);		/* push rdi */
+			if (is_subprog) {
+				/* subprog may have up to 5 arguments */
+				EMIT1(0x56);		/* push rsi */
+				EMIT1(0x52);		/* push rdx */
+				EMIT1(0x51);		/* push rcx */
+				EMIT2(0x41, 0x50);	/* push r8 */
+			}
+			emit_mov_imm64(&prog, BPF_REG_1, (long) bpf_prog >> 32,
+				       (u32) (long) bpf_prog);
+			func = (u8 *)__bpf_prog_enter_recur_limited;
+			offs = prog - temp;
+			offs += x86_call_depth_emit_accounting(&prog, func, image + offs);
+			emit_call(&prog, func, image + offs);
+			if (is_subprog) {
+				EMIT2(0x41, 0x58);	/* pop r8 */
+				EMIT1(0x59);		/* pop rcx */
+				EMIT1(0x5a);		/* pop rdx */
+				EMIT1(0x5e);		/* pop rsi */
+			}
+			EMIT1(0x5f);		/* pop rdi */
+
+			EMIT4(0x48, 0x83, 0xf8, 0x0);	/* cmp rax,0x0 */
+			EMIT2(X86_JNE, num_bytes_of_emit_return() + 1);
+
+			/* return if stack recursion has been reached */
+			EMIT1(0xC9);	/* leave */
+			emit_return(&prog, image + (prog - temp));
+
+			/* cnt -= 1 */
+			emit_alu_helper_1(&prog, BPF_ALU64 | BPF_SUB | BPF_K,
+					  BPF_REG_0, 1);
+
+			/* accum_stack_depth = cnt * subtree_stack_depth */
+			emit_alu_helper_3(&prog, BPF_ALU64 | BPF_MUL | BPF_K, BPF_REG_0,
+					  bpf_prog->aux->subtree_stack_depth);
+
+			emit_root_priv_frame_ptr(&prog, bpf_prog, orig_stack_depth);
+
+			/* r9 += accum_stack_depth */
+			emit_alu_helper_2(&prog, BPF_ALU64 | BPF_ADD | BPF_X, X86_REG_R9,
+					  BPF_REG_0);
+		}
+	} else if (priv_stack_mode == PRIV_STACK_SUB_PROG && orig_stack_depth) {
 		/* r9 += orig_stack_depth */
 		emit_alu_helper_1(&prog, BPF_ALU64 | BPF_ADD | BPF_K, X86_REG_R9,
 				  orig_stack_depth);
+	}
 
 	*pprog = prog;
 }
@@ -1578,7 +1639,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 	detect_reg_usage(insn, insn_cnt, callee_regs_used);
 
 	emit_prologue(&prog, stack_depth, bpf_prog, tail_call_reachable,
-		      priv_stack_mode);
+		      priv_stack_mode, image, temp);
 	/* Exception callback will clobber callee regs for its own use, and
 	 * restore the original callee regs from main prog's stack frame.
 	 */
@@ -2519,6 +2580,23 @@ st:			if (is_imm8(insn->off))
 				if (arena_vm_start)
 					pop_r12(&prog);
 			}
+
+			if (bpf_prog->aux->has_prog_call) {
+				u8 *func, *ip;
+				int offs;
+
+				ip = image + addrs[i - 1];
+				/* save and restore the return value */
+				EMIT1(0x50);	/* push rax */
+				emit_mov_imm64(&prog, BPF_REG_1, (long) bpf_prog >> 32,
+					       (u32) (long) bpf_prog);
+				func = (u8 *)__bpf_prog_exit_recur_limited;
+				offs = prog - temp;
+				offs += x86_call_depth_emit_accounting(&prog, func, ip + offs);
+				emit_call(&prog, func, ip + offs);
+				EMIT1(0x58);	/* pop rax */
+			}
+
 			EMIT1(0xC9);	/* leave */
 			emit_return(&prog, image + addrs[i - 1] + (prog - temp));
 			break;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 952cb398eb30..605004cba9f7 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1148,6 +1148,8 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
 					     struct bpf_tramp_run_ctx *run_ctx);
 void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start,
 					     struct bpf_tramp_run_ctx *run_ctx);
+int notrace __bpf_prog_enter_recur_limited(struct bpf_prog *prog);
+void notrace __bpf_prog_exit_recur_limited(struct bpf_prog *prog);
 void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr);
 void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr);
 typedef u64 (*bpf_trampoline_enter_t)(struct bpf_prog *prog,
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index f8302a5ca400..d9e7260e4b39 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -960,6 +960,22 @@ void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start,
 	rcu_read_unlock_trace();
 }
 
+int notrace __bpf_prog_enter_recur_limited(struct bpf_prog *prog)
+{
+	int cnt = this_cpu_inc_return(*(prog->active));
+
+	if (cnt > BPF_MAX_PRIV_STACK_NEST_LEVEL) {
+		bpf_prog_inc_misses_counter(prog);
+		return 0;
+	}
+	return cnt;
+}
+
+void notrace __bpf_prog_exit_recur_limited(struct bpf_prog *prog)
+{
+	this_cpu_dec(*(prog->active));
+}
+
 static u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog,
 					      struct bpf_tramp_run_ctx *run_ctx)
 {
-- 
2.43.5
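
For reviewers, a rough illustration-only C sketch of what the emitted prologue
logic amounts to for a root prog with aux->has_prog_call set; it is not part of
the patch. priv_stack_base() is a made-up placeholder for the per-cpu private
stack pointer that emit_root_priv_frame_ptr() materializes in r9; the real
logic lives only in the jit-emitted machine code above.

	/* Illustration only, not part of the patch. */
	static u8 *priv_frame_ptr_or_null(struct bpf_prog *prog, u32 orig_stack_depth)
	{
		int cnt = __bpf_prog_enter_recur_limited(prog);

		if (!cnt)	/* nesting limit reached: prog returns to its caller */
			return NULL;

		/* level 1 uses the root frame; each deeper level is offset by
		 * the whole subtree stack size computed by the verifier
		 */
		return priv_stack_base(prog) + orig_stack_depth +
		       (cnt - 1) * prog->aux->subtree_stack_depth;
	}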