On Tue, Mar 15, 2022 at 05:42:29PM -0700, Kui-Feng Lee wrote:
> BPF trampolines will create a bpf_trace_run_ctx on their stacks, and
> set/reset the current bpf_run_ctx whenever calling/returning from a
> bpf_prog.
> 
> Signed-off-by: Kui-Feng Lee <kuifeng@xxxxxx>
> ---
>  arch/x86/net/bpf_jit_comp.c | 32 ++++++++++++++++++++++++++++++++
>  include/linux/bpf.h         | 12 ++++++++----
>  kernel/bpf/syscall.c        |  4 ++--
>  kernel/bpf/trampoline.c     | 21 +++++++++++++++++----
>  4 files changed, 59 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index 1228e6e6a420..29775a475513 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -1748,10 +1748,33 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
>  {
>  	u8 *prog = *pprog;
>  	u8 *jmp_insn;
> +	int ctx_cookie_off = offsetof(struct bpf_trace_run_ctx, bpf_cookie);
>  	struct bpf_prog *p = l->prog;
> 
> +	EMIT1(0x52);		 /* push rdx */

Why save/restore rdx?

> +
> +	/* mov rdi, 0 */
> +	emit_mov_imm64(&prog, BPF_REG_1, 0, 0);
> +
> +	/* Prepare struct bpf_trace_run_ctx.
> +	 * sub rsp, sizeof(struct bpf_trace_run_ctx)
> +	 * mov rax, rsp
> +	 * mov QWORD PTR [rax + ctx_cookie_off], rdi
> +	 */

How about the following instead:
  sub rsp, sizeof(struct bpf_trace_run_ctx)
  mov qword ptr [rsp + ctx_cookie_off], 0
?

> +	EMIT4(0x48, 0x83, 0xEC, sizeof(struct bpf_trace_run_ctx));
> +	EMIT3(0x48, 0x89, 0xE0);
> +	EMIT4(0x48, 0x89, 0x78, ctx_cookie_off);
> +
> +	/* mov rdi, rsp */
> +	EMIT3(0x48, 0x89, 0xE7);
> +	/* mov QWORD PTR [rdi + sizeof(struct bpf_trace_run_ctx)], rax */
> +	emit_stx(&prog, BPF_DW, BPF_REG_1, BPF_REG_0, sizeof(struct bpf_trace_run_ctx));

Why not do:
  mov qword ptr [rsp + sizeof(struct bpf_trace_run_ctx)], rsp
?

> +
>  	/* arg1: mov rdi, progs[i] */
>  	emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
> +	/* arg2: mov rsi, rsp (struct bpf_run_ctx *) */
> +	EMIT3(0x48, 0x89, 0xE6);
> +
>  	if (emit_call(&prog,
>  		      p->aux->sleepable ? __bpf_prog_enter_sleepable :
>  		      __bpf_prog_enter, prog))
> @@ -1797,11 +1820,20 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
>  	emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
>  	/* arg2: mov rsi, rbx <- start time in nsec */
>  	emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
> +	/* arg3: mov rdx, rsp (struct bpf_run_ctx *) */
> +	EMIT3(0x48, 0x89, 0xE2);
>  	if (emit_call(&prog,
>  		      p->aux->sleepable ? __bpf_prog_exit_sleepable :
>  		      __bpf_prog_exit, prog))
>  		return -EINVAL;
> 
> +	/* pop struct bpf_trace_run_ctx
> +	 * add rsp, sizeof(struct bpf_trace_run_ctx)
> +	 */
> +	EMIT4(0x48, 0x83, 0xC4, sizeof(struct bpf_trace_run_ctx));

The sub rsp; add rsp pair for every prog call will add up.
Could you do the sub rsp once at the beginning of the trampoline?
And move
  mov qword ptr [rsp + sizeof(struct bpf_trace_run_ctx)], rsp
to the beginning as well? (Rough sketch at the end of this mail.)

> +
> +	EMIT1(0x5A);		 /* pop rdx */
> +
>  	*pprog = prog;
>  	return 0;
>  }
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 3dcae8550c21..d20a23953696 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -681,6 +681,8 @@ struct bpf_tramp_links {
>  	int nr_links;
>  };
> 
> +struct bpf_trace_run_ctx;
> +
>  /* Different use cases for BPF trampoline:
>   * 1. replace nop at the function entry (kprobe equivalent)
>   *    flags = BPF_TRAMP_F_RESTORE_REGS
> @@ -707,10 +709,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *i
>  				struct bpf_tramp_links *tlinks,
>  				void *orig_call);
>  /* these two functions are called from generated trampoline */
> -u64 notrace __bpf_prog_enter(struct bpf_prog *prog);
> -void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start);
> -u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog);
> -void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start);
> +u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_trace_run_ctx *run_ctx);
> +void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_trace_run_ctx *run_ctx);
> +u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_trace_run_ctx *run_ctx);
> +void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
> +				       struct bpf_trace_run_ctx *run_ctx);
>  void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr);
>  void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr);
> 
> @@ -1291,6 +1294,7 @@ struct bpf_cg_run_ctx {
>  struct bpf_trace_run_ctx {
>  	struct bpf_run_ctx run_ctx;
>  	u64 bpf_cookie;
> +	struct bpf_run_ctx *saved_run_ctx;
>  };
> 
>  static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx)
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index fecfc803785d..a289ef55ea17 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -4793,13 +4793,13 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
>  		return -EINVAL;
>  	}
> 
> -	if (!__bpf_prog_enter_sleepable(prog)) {
> +	if (!__bpf_prog_enter_sleepable(prog, NULL)) {
>  		/* recursion detected */
>  		bpf_prog_put(prog);
>  		return -EBUSY;
>  	}
>  	attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in);
> -	__bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */);
> +	__bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */, NULL);
>  	bpf_prog_put(prog);
>  	return 0;
>  #endif
> diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
> index 54c695d49ec9..0b050aa2f159 100644
> --- a/kernel/bpf/trampoline.c
> +++ b/kernel/bpf/trampoline.c
> @@ -580,9 +580,12 @@ static void notrace inc_misses_counter(struct bpf_prog *prog)
>   * [2..MAX_U64] - execute bpf prog and record execution time.
>   * This is start time.
>   */
> -u64 notrace __bpf_prog_enter(struct bpf_prog *prog)
> +u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_trace_run_ctx *run_ctx)
>  	__acquires(RCU)
>  {
> +	if (run_ctx)
> +		run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);

We can remove these branches from the critical path if we pass an actual
run_ctx instead of NULL in bpf_sys_bpf, right?
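I.e. something along these lines in bpf_sys_bpf() (untested, just to
illustrate the idea):

	struct bpf_trace_run_ctx run_ctx = {};

	if (!__bpf_prog_enter_sleepable(prog, &run_ctx)) {
		/* recursion detected */
		bpf_prog_put(prog);
		return -EBUSY;
	}
	attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in);
	__bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */, &run_ctx);
	bpf_prog_put(prog);

Then the if (run_ctx) checks in __bpf_prog_enter*() / __bpf_prog_exit*()
can go away entirely.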
> +
>  	rcu_read_lock();
>  	migrate_disable();
>  	if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
> @@ -614,17 +617,23 @@ static void notrace update_prog_stats(struct bpf_prog *prog,
>  	}
>  }
> 
> -void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
> +void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_trace_run_ctx *run_ctx)
>  	__releases(RCU)
>  {
> +	if (run_ctx)
> +		bpf_reset_run_ctx(run_ctx->saved_run_ctx);
> +
>  	update_prog_stats(prog, start);
>  	__this_cpu_dec(*(prog->active));
>  	migrate_enable();
>  	rcu_read_unlock();
>  }
> 
> -u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog)
> +u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_trace_run_ctx *run_ctx)
>  {
> +	if (run_ctx)
> +		run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
> +
>  	rcu_read_lock_trace();
>  	migrate_disable();
>  	might_fault();
> @@ -635,8 +644,12 @@ u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog)
>  	return bpf_prog_start_time();
>  }
> 
> -void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start)
> +void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
> +				       struct bpf_trace_run_ctx *run_ctx)
>  {
> +	if (run_ctx)
> +		bpf_reset_run_ctx(run_ctx->saved_run_ctx);
> +
>  	update_prog_stats(prog, start);
>  	__this_cpu_dec(*(prog->active));
>  	migrate_enable();
> -- 
> 2.30.2
> 
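To make the earlier "do the sub rsp once" point concrete, the generated
code could look roughly like this (untested sketch; assumes nothing else
is pushed between the trampoline prologue and the enter/exit calls, and
the other stack offsets would need adjusting accordingly):

	/* trampoline prologue, emitted once: */
	sub rsp, sizeof(struct bpf_trace_run_ctx)

	/* per prog, in invoke_bpf_prog(): */
	mov qword ptr [rsp + ctx_cookie_off], 0
	mov rdi, progs[i]
	mov rsi, rsp			/* arg2: struct bpf_run_ctx * */
	call __bpf_prog_enter...
	...
	mov rdx, rsp			/* arg3: struct bpf_run_ctx * */
	call __bpf_prog_exit...

	/* trampoline epilogue, emitted once: */
	add rsp, sizeof(struct bpf_trace_run_ctx)

That keeps the per-prog overhead to a single store plus the register moves.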