On Thu, Sep 26, 2024 at 4:45 PM Yonghong Song <yonghong.song@xxxxxxxxx> wrote:
>
> Add jit support for private stack. For a particular subtree, e.g.,
>   subtree_root  <== stack depth 120
>   subprog1      <== stack depth 80
>   subprog2      <== stack depth 40
>   subprog3      <== stack depth 160
>
> Let us say that private_stack_ptr is the memory address allocated for
> the private stack. The frame pointer for each above is calculated like below:
>   subtree_root  <== subtree_root_fp = private_stack_ptr + 120
>   subprog1      <== subtree_subprog1_fp = subtree_root_fp + 80
>   subprog2      <== subtree_subprog2_fp = subtree_subprog1_fp + 40
>   subprog3      <== subtree_subprog3_fp = subtree_root_fp + 160
>
> For any function call to a helper/kfunc, a push/pop of the prog frame pointer
> is needed in order to preserve the frame pointer value.
>
> To deal with exception handling, a push/pop of the frame pointer is also used
> around the call to a subsequent subprog. For example,
>   subtree_root
>     subprog1
>       ...
>       insn: call bpf_throw
>       ...
>
> After jit, we will have
>   subtree_root
>     insn: push r9
>     subprog1
>       ...
>       insn: push r9
>       insn: call bpf_throw
>       insn: pop r9
>       ...
>     insn: pop r9
>
>   exception_handler
>     pop r9
>     ...
>
> where r9 represents the fp for each subprog.

Kumar, please review the interaction of priv_stack with exceptions.

> Signed-off-by: Yonghong Song <yonghong.song@xxxxxxxxx>
> ---
>  arch/x86/net/bpf_jit_comp.c | 87 ++++++++++++++++++++++++++++++++++---
>  1 file changed, 81 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index 06b080b61aa5..c264822c926b 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -325,6 +325,22 @@ struct jit_context {
>  /* Number of bytes that will be skipped on tailcall */
>  #define X86_TAIL_CALL_OFFSET (12 + ENDBR_INSN_SIZE)
>
> +static void push_r9(u8 **pprog)
> +{
> +	u8 *prog = *pprog;
> +
> +	EMIT2(0x41, 0x51);	/* push r9 */
> +	*pprog = prog;
> +}
> +
> +static void pop_r9(u8 **pprog)
> +{
> +	u8 *prog = *pprog;
> +
> +	EMIT2(0x41, 0x59);	/* pop r9 */
> +	*pprog = prog;
> +}
> +
>  static void push_r12(u8 **pprog)
>  {
>  	u8 *prog = *pprog;
> @@ -491,7 +507,7 @@ static void emit_prologue_tail_call(u8 **pprog, bool is_subprog)
>   */
>  static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
>  			  bool tail_call_reachable, bool is_subprog,
> -			  bool is_exception_cb)
> +			  bool is_exception_cb, enum bpf_pstack_state pstack)

enum bpf_priv_stack_mode priv_stack_mode

>  {
>  	u8 *prog = *pprog;
>
> @@ -518,6 +534,8 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
>  		 * first restore those callee-saved regs from stack, before
>  		 * reusing the stack frame.
>  		 */
> +		if (pstack)
> +			pop_r9(&prog);

This is an unnecessary cognitive load, since readers need to remember
absolute values of the enum. Just use

	if (priv_stack_mode != NO_PRIV_STACK)

(one possible shape for that enum is sketched below, after this hunk.)

>  		pop_callee_regs(&prog, all_callee_regs_used);
>  		pop_r12(&prog);
>  		/* Reset the stack frame. */
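To make that concrete, a minimal sketch of what such an enum could look like.
The enumerator names below are only the ones used in this reply
(NO_PRIV_STACK, PRIV_STACK_MAIN_PROG, PRIV_STACK_SUB_PROG); the actual
definition, members and values are up to the patch:

	/* sketch only, not the patch's definition */
	enum bpf_priv_stack_mode {
		NO_PRIV_STACK,		/* prog uses the normal bpf stack */
		PRIV_STACK_MAIN_PROG,	/* subtree root, sets up r9 from its private stack */
		PRIV_STACK_SUB_PROG,	/* called within the subtree, only adjusts r9 */
	};

The point is that every test then names the state it checks for instead of
relying on the zero value of the enum.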
> @@ -1404,6 +1422,22 @@ static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
>  	*pprog = prog;
>  }
>
> +static void emit_private_frame_ptr(u8 **pprog, void *private_frame_ptr)
> +{
> +	u8 *prog = *pprog;
> +
> +	/* movabs r9, private_frame_ptr */
> +	emit_mov_imm64(&prog, X86_REG_R9, (long) private_frame_ptr >> 32,
> +		       (u32) (long) private_frame_ptr);
> +
> +	/* add <r9>, gs:[<off>] */
> +	EMIT2(0x65, 0x4c);
> +	EMIT3(0x03, 0x0c, 0x25);
> +	EMIT((u32)(unsigned long)&this_cpu_off, 4);
> +
> +	*pprog = prog;
> +}
> +
>  #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
>
>  #define __LOAD_TCC_PTR(off) \
> @@ -1421,20 +1455,31 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
>  	int insn_cnt = bpf_prog->len;
>  	bool seen_exit = false;
>  	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
> +	void __percpu *private_frame_ptr = NULL;
>  	u64 arena_vm_start, user_vm_start;
> +	u32 orig_stack_depth, stack_depth;
>  	int i, excnt = 0;
>  	int ilen, proglen = 0;
>  	u8 *prog = temp;
>  	int err;
>
> +	stack_depth = bpf_prog->aux->stack_depth;
> +	orig_stack_depth = round_up(stack_depth, 8);
> +	if (bpf_prog->pstack) {
> +		stack_depth = 0;
> +		if (bpf_prog->pstack == PSTACK_TREE_ROOT)
> +			private_frame_ptr = bpf_prog->private_stack_ptr + orig_stack_depth;
> +	}

Same issue.

	switch (priv_stack_mode) {
	case PRIV_STACK_MAIN_PROG:
		priv_frame_ptr = bpf_prog->priv_stack_ptr + orig_stack_depth;
		fallthrough;
	case PRIV_STACK_SUB_PROG:
		stack_depth = 0;
		break;
	}

would be easier to read.

> +
>  	arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena);
>  	user_vm_start = bpf_arena_get_user_vm_start(bpf_prog->aux->arena);
>
>  	detect_reg_usage(insn, insn_cnt, callee_regs_used);
>
> -	emit_prologue(&prog, bpf_prog->aux->stack_depth,
> +	emit_prologue(&prog, stack_depth,
>  		      bpf_prog_was_classic(bpf_prog), tail_call_reachable,
> -		      bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb);
> +		      bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb,
> +		      bpf_prog->pstack);
>  	/* Exception callback will clobber callee regs for its own use, and
>  	 * restore the original callee regs from main prog's stack frame.
>  	 */
> @@ -1454,6 +1499,17 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
>  		emit_mov_imm64(&prog, X86_REG_R12,
>  			       arena_vm_start >> 32, (u32) arena_vm_start);
>
> +	if (bpf_prog->pstack == PSTACK_TREE_ROOT) {
> +		emit_private_frame_ptr(&prog, private_frame_ptr);
> +	} else if (bpf_prog->pstack == PSTACK_TREE_INTERNAL && orig_stack_depth) {
> +		/* r9 += orig_stack_depth */
> +		maybe_emit_1mod(&prog, X86_REG_R9, true);
> +		if (is_imm8(orig_stack_depth))
> +			EMIT3(0x83, add_1reg(0xC0, X86_REG_R9), orig_stack_depth);
> +		else
> +			EMIT2_off32(0x81, add_1reg(0xC0, X86_REG_R9), orig_stack_depth);
> +	}

We've been open coding the 'add' insn like this for way too long.
Let's address this technical debt now. Please move the handling of

	case BPF_ALU | BPF_ADD | BPF_K:
	case BPF_ALU | BPF_SUB | BPF_K:
	case BPF_ALU | BPF_AND | BPF_K:
	case BPF_ALU | BPF_OR | BPF_K:
	case BPF_ALU | BPF_XOR | BPF_K:
	case BPF_ALU64 | BPF_ADD | BPF_K:
	case BPF_ALU64 | BPF_SUB | BPF_K:
	case BPF_ALU64 | BPF_AND | BPF_K:
	case BPF_ALU64 | BPF_OR | BPF_K:
	case BPF_ALU64 | BPF_XOR | BPF_K:

into a helper and use it here.
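For illustration, a sketch of what such a helper could look like, lifted from
the opcode selection already open coded in do_jit()'s ALU-K case. It only uses
helpers and macros that already exist in bpf_jit_comp.c (maybe_emit_1mod,
is_imm8, is_axreg, add_1reg, EMIT1_off32/EMIT2_off32/EMIT3); the name
emit_alu_imm and the exact signature are made up here, not part of the patch:

	/* emit "alu_op dst_reg, imm32" for BPF_ALU/BPF_ALU64 | <op> | BPF_K */
	static void emit_alu_imm(u8 **pprog, u8 insn_code, u32 dst_reg, s32 imm32)
	{
		u8 b2 = 0, b3 = 0;
		u8 *prog = *pprog;

		maybe_emit_1mod(&prog, dst_reg, BPF_CLASS(insn_code) == BPF_ALU64);

		/* b3 selects the operation in the ModRM byte for the 0x83 (imm8)
		 * and 0x81 (imm32) forms; b2 is the short form, only valid when
		 * dst is eax/rax.
		 */
		switch (BPF_OP(insn_code)) {
		case BPF_ADD: b3 = 0xC0; b2 = 0x05; break;
		case BPF_SUB: b3 = 0xE8; b2 = 0x2D; break;
		case BPF_AND: b3 = 0xE0; b2 = 0x25; break;
		case BPF_OR:  b3 = 0xC8; b2 = 0x0D; break;
		case BPF_XOR: b3 = 0xF0; b2 = 0x35; break;
		}

		if (is_imm8(imm32))
			EMIT3(0x83, add_1reg(b3, dst_reg), imm32);
		else if (is_axreg(dst_reg))
			EMIT1_off32(b2, imm32);
		else
			EMIT2_off32(0x81, add_1reg(b3, dst_reg), imm32);

		*pprog = prog;
	}

With something along those lines the open coded block above shrinks to a
single call, e.g. emit_alu_imm(&prog, BPF_ALU64 | BPF_ADD | BPF_K, X86_REG_R9,
orig_stack_depth), and the big switch in do_jit() can use the same helper.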