On Tue, Feb 13, 2024 at 9:47 PM Leon Hwang <hffilwlqm@xxxxxxxxx> wrote:
>
>
>
> On 2024/1/5 12:15, Alexei Starovoitov wrote:
> > On Thu, Jan 4, 2024 at 6:23 AM Leon Hwang <hffilwlqm@xxxxxxxxx> wrote:
> >>
> >>
> >
> > Other alternatives?
>
> I've finished the POC of an alternative, which passed all tailcall
> selftests, including these tailcall hierarchy ones.
>
> In this alternative, I use a new bpf_prog_run_ctx to wrap the original
> ctx and the tcc_ptr, then get the tcc_ptr and recover the original ctx
> in the JIT.
>
> Then, to avoid breaking the tailcall runtime on other arches, I add an
> arch-related check, bpf_jit_supports_tail_call_cnt_ptr(), to determine
> whether to use bpf_prog_run_ctx.
>
> Here's the diff:
>
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index 4065bdcc5b2a4..56cea2676863e 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -259,7 +259,7 @@ struct jit_context {
>  /* Number of bytes emit_patch() needs to generate instructions */
>  #define X86_PATCH_SIZE 5
>  /* Number of bytes that will be skipped on tailcall */
> -#define X86_TAIL_CALL_OFFSET (22 + ENDBR_INSN_SIZE)
> +#define X86_TAIL_CALL_OFFSET (16 + ENDBR_INSN_SIZE)
>
>  static void push_r12(u8 **pprog)
>  {
> @@ -407,21 +407,19 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
>  	emit_nops(&prog, X86_PATCH_SIZE);
>  	if (!ebpf_from_cbpf) {
>  		if (tail_call_reachable && !is_subprog) {
> -			/* When it's the entry of the whole tailcall context,
> -			 * zeroing rax means initialising tail_call_cnt.
> -			 */
> -			EMIT2(0x31, 0xC0); /* xor eax, eax */
> -			EMIT1(0x50); /* push rax */
> -			/* Make rax as ptr that points to tail_call_cnt. */
> -			EMIT3(0x48, 0x89, 0xE0); /* mov rax, rsp */
> -			EMIT1_off32(0xE8, 2); /* call main prog */
> -			EMIT1(0x59); /* pop rcx, get rid of tail_call_cnt */
> -			EMIT1(0xC3); /* ret */
> +			/* Make rax as tcc_ptr. */
> +			EMIT4(0x48, 0x8B, 0x47, 0x08); /* mov rax, qword ptr [rdi + 8] */
>  		} else {
> -			/* Keep the same instruction size. */
> -			emit_nops(&prog, 13);
> +			/* Keep the same instruction layout. */
> +			emit_nops(&prog, 4);
>  		}
>  	}
> +	if (!is_subprog)
> +		/* Recover the original ctx. */
> +		EMIT3(0x48, 0x8B, 0x3F); /* mov rdi, qword ptr [rdi] */
> +	else
> +		/* Keep the same instruction layout. */
> +		emit_nops(&prog, 3);
>  	/* Exception callback receives FP as third parameter */
>  	if (is_exception_cb) {
>  		EMIT3(0x48, 0x89, 0xF4); /* mov rsp, rsi */
> @@ -3152,6 +3150,12 @@ bool bpf_jit_supports_subprog_tailcalls(void)
>  	return true;
>  }
>
> +/* Indicate the JIT backend supports tail call count pointer in tailcall context. */
> +bool bpf_jit_supports_tail_call_cnt_ptr(void)
> +{
> +	return true;
> +}
> +
>  void bpf_jit_free(struct bpf_prog *prog)
>  {
>  	if (prog->jited) {
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 7671530d6e4e0..fea4326c27d31 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1919,6 +1919,11 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
>  			u64 bpf_cookie,
>  			struct bpf_prog_array **new_array);
>
> +struct bpf_prog_run_ctx {
> +	const void *ctx;
> +	u32 *tail_call_cnt;
> +};
> +
>  struct bpf_run_ctx {};
>
>  struct bpf_cg_run_ctx {
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 68fb6c8142fec..c1c035c44b4ab 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -629,6 +629,10 @@ typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx,
>  					  unsigned int (*bpf_func)(const void *,
>  								   const struct bpf_insn *));
>
> +static __always_inline u32 __bpf_prog_run_dfunc(const struct bpf_prog *prog,
> +						const void *ctx,
> +						bpf_dispatcher_fn dfunc);
> +
>  static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
>  					  const void *ctx,
>  					  bpf_dispatcher_fn dfunc)
> @@ -641,14 +645,14 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
>  		u64 start = sched_clock();
>  		unsigned long flags;
>
> -		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> +		ret = __bpf_prog_run_dfunc(prog, ctx, dfunc);
>  		stats = this_cpu_ptr(prog->stats);
>  		flags = u64_stats_update_begin_irqsave(&stats->syncp);
>  		u64_stats_inc(&stats->cnt);
>  		u64_stats_add(&stats->nsecs, sched_clock() - start);
>  		u64_stats_update_end_irqrestore(&stats->syncp, flags);
>  	} else {
> -		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> +		ret = __bpf_prog_run_dfunc(prog, ctx, dfunc);
>  	}
>  	return ret;
>  }
> @@ -952,12 +956,31 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
>  void bpf_jit_compile(struct bpf_prog *prog);
>  bool bpf_jit_needs_zext(void);
>  bool bpf_jit_supports_subprog_tailcalls(void);
> +bool bpf_jit_supports_tail_call_cnt_ptr(void);
>  bool bpf_jit_supports_kfunc_call(void);
>  bool bpf_jit_supports_far_kfunc_call(void);
>  bool bpf_jit_supports_exceptions(void);
>  void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
>  bool bpf_helper_changes_pkt_data(void *func);
>
> +static __always_inline u32 __bpf_prog_run_dfunc(const struct bpf_prog *prog,
> +						const void *ctx,
> +						bpf_dispatcher_fn dfunc)
> +{
> +	struct bpf_prog_run_ctx run_ctx = {};
> +	u32 ret, tcc = 0;
> +
> +	run_ctx.ctx = ctx;
> +	run_ctx.tail_call_cnt = &tcc;
> +
> +	if (bpf_jit_supports_tail_call_cnt_ptr() && prog->jited)
> +		ret = dfunc(&run_ctx, prog->insnsi, prog->bpf_func);
> +	else
> +		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);

This is no good either. We cannot introduce two extra run-time checks
before calling every bpf prog. The solution must be overhead-free for
common cases. Can we switch to a percpu tail_call_cnt instead of the
on-stack and %rax tricks? If that won't work, then we'd have to disable
tail_calls from subprogs in the verifier.
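
To make the percpu direction concrete, here's a rough sketch of what a
per-CPU tail_call_cnt could look like. The helper names
(bpf_reset_tail_call_cnt(), bpf_inc_tail_call_cnt()) are made up for
illustration, and the sketch deliberately ignores how nesting or
preemption of tail-call-reachable progs on the same CPU would be handled:

/* Sketch only: a per-CPU tail call counter that the JIT could reset and
 * bump instead of threading a counter pointer through the stack or %rax.
 * Helper names are hypothetical; nesting/preemption is not handled.
 */
#include <linux/bpf.h>		/* MAX_TAIL_CALL_CNT */
#include <linux/percpu.h>

static DEFINE_PER_CPU(u32, bpf_tail_call_cnt);

/* Would run at entry of a tail-call-reachable main prog, replacing the
 * on-stack tail_call_cnt setup in the prologue.
 */
static __always_inline void bpf_reset_tail_call_cnt(void)
{
	__this_cpu_write(bpf_tail_call_cnt, 0);
}

/* Would back the JIT's tail call limit check: returns true while another
 * tail call is still allowed on this CPU.
 */
static __always_inline bool bpf_inc_tail_call_cnt(void)
{
	return __this_cpu_inc_return(bpf_tail_call_cnt) <= MAX_TAIL_CALL_CNT;
}

The upside would be that the prologue and bpf_prog_run() stay exactly as
they are today for progs without tail calls; the open question is what
happens to the per-CPU counter when tail-call-reachable progs nest or are
preempted on the same CPU.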