Re: [PATCH bpf-next v3 2/5] bpf: Collect stack depth information

Alexei Starovoitov <alexei.starovoitov@xxxxxxxxx> · Mon, 30 Sep 2024 07:42:22 -0700

On Thu, Sep 26, 2024 at 4:45 PM Yonghong Song <yonghong.song@xxxxxxxxx> wrote:
>
> Private stack memory allocation is based on call subtrees. For example,
>
>   main_prog     // stack size 50
>     subprog1    // stack size 50
>       subprog2  // stack size 50
>     subprog3    // stack size 50
>
> Overall allocation size should be 150 bytes (stacks from main_prog,
> subprog1 and subprog2).
>
> To simplify jit, the root of subtrees is either the main prog
> or any callback func. For example,
>
>   main_prog
>     subprog1    // callback subprog10
>       ...
>         subprog10
>           subprog11

This is an odd example.
We have MAX_CALL_FRAMES = 8
So there cannot be more than 512 * 8 = 4k of stack.

>
> In this case, two subtrees exist. One root is main_prog and the other
> root is subprog10.
>
> The private stack is used only if
>  - the subtree stack size is greater than 128 bytes and
>    smaller than or equal to U16_MAX, and

U16 limit looks odd too.
Since we're not bumping MAX_CALL_FRAMES at the moment
let's limit to 4k.

>  - the prog type is kprobe, tracepoint, perf_event, raw_tracepoint
>    and tracing, and
>  - jit supports private stack, and
>  - no tail call in the main prog and all subprogs
>
> The restriction of no tail call is due to the following two reasons:
>  - to avoid potential large memory consumption. Currently maximum tail
>    call count is MAX_TAIL_CALL_CNT=33. Considering private stack memory
>    allocation is per-cpu based. It will be a very large memory consumption
>    to support current MAX_TAIL_CALL_CNT.
>  - if the tailcall in the callback function, it is not easy to pass
>    the tail call cnt to the callback function and the tail call cnt
>    is needed to find proper offset for private stack.
> So to avoid complexity, private stack does not support tail call
> for now.
>
> Signed-off-by: Yonghong Song <yonghong.song@xxxxxxxxx>
> ---
>  include/linux/bpf.h          |  3 +-
>  include/linux/bpf_verifier.h |  3 ++
>  kernel/bpf/verifier.c        | 81 ++++++++++++++++++++++++++++++++++++
>  3 files changed, 86 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 62909fbe9e48..156b9516d9f6 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1566,7 +1566,8 @@ struct bpf_prog {
>                                 call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
>                                 call_get_func_ip:1, /* Do we call get_func_ip() */
>                                 tstamp_type_access:1, /* Accessed __sk_buff->tstamp_type */
> -                               sleepable:1;    /* BPF program is sleepable */
> +                               sleepable:1,    /* BPF program is sleepable */
> +                               pstack_eligible:1; /* Candidate for private stacks */

I'm struggling with this abbreviation.
pstack is just too ambiguous.
It means 'pointer stack' in perf.
'man pstack' means 'print stack of a process'.
Let's use something more concrete.

How about priv_stack ?
And use it this way in all other names.
Instead of:
calc_private_stack_alloc_subprog
do:
calc_priv_stack_alloc_subprog

>         enum bpf_prog_type      type;           /* Type of BPF program */
>         enum bpf_attach_type    expected_attach_type; /* For some prog types */
>         u32                     len;            /* Number of filter blocks */
> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> index 4513372c5bc8..63df10f4129e 100644
> --- a/include/linux/bpf_verifier.h
> +++ b/include/linux/bpf_verifier.h
> @@ -659,6 +659,8 @@ struct bpf_subprog_info {
>          * are used for bpf_fastcall spills and fills.
>          */
>         s16 fastcall_stack_off;
> +       u16 subtree_stack_depth;
> +       u16 subtree_top_idx;
>         bool has_tail_call: 1;
>         bool tail_call_reachable: 1;
>         bool has_ld_abs: 1;
> @@ -668,6 +670,7 @@ struct bpf_subprog_info {
>         bool args_cached: 1;
>         /* true if bpf_fastcall stack region is used by functions that can't be inlined */
>         bool keep_fastcall_stack: 1;
> +       bool pstack_eligible:1;
>
>         u8 arg_cnt;
>         struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 97700e32e085..69e17cb22037 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -194,6 +194,8 @@ struct bpf_verifier_stack_elem {
>
>  #define BPF_GLOBAL_PERCPU_MA_MAX_SIZE  512
>
> +#define BPF_PSTACK_MIN_SUBTREE_SIZE    128
> +
>  static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
>  static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
>  static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
> @@ -6192,6 +6194,82 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
>         return 0;
>  }
>
> +static int calc_private_stack_alloc_subprog(struct bpf_verifier_env *env, int idx)
> +{
> +       struct bpf_subprog_info *subprog = env->subprog_info;
> +       struct bpf_insn *insn = env->prog->insnsi;
> +       int depth = 0, frame = 0, i, subprog_end;
> +       int ret_insn[MAX_CALL_FRAMES];
> +       int ret_prog[MAX_CALL_FRAMES];
> +       int ps_eligible = 0;
> +       int orig_idx = idx;
> +
> +       subprog[idx].subtree_top_idx = idx;
> +       i = subprog[idx].start;
> +
> +process_func:
> +       depth += round_up_stack_depth(env, subprog[idx].stack_depth);
> +       if (depth > U16_MAX)
> +               return -EACCES;
> +
> +       if (!ps_eligible && depth >= BPF_PSTACK_MIN_SUBTREE_SIZE) {
> +               subprog[orig_idx].pstack_eligible = true;
> +               ps_eligible = true;
> +       }
> +       subprog[orig_idx].subtree_stack_depth =
> +               max_t(u16, subprog[orig_idx].subtree_stack_depth, depth);
> +
> +continue_func:
> +       subprog_end = subprog[idx + 1].start;
> +       for (; i < subprog_end; i++) {
> +               int next_insn, sidx;
> +
> +               if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
> +                       continue;
> +               /* remember insn and function to return to */
> +               ret_insn[frame] = i + 1;
> +               ret_prog[frame] = idx;
> +
> +               /* find the callee */
> +               next_insn = i + insn[i].imm + 1;
> +               sidx = find_subprog(env, next_insn);
> +               if (subprog[sidx].is_cb) {
> +                       if (!bpf_pseudo_call(insn + i))
> +                               continue;
> +               }
> +               i = next_insn;
> +               idx = sidx;
> +               subprog[idx].subtree_top_idx = orig_idx;
> +
> +               frame++;
> +               goto process_func;
> +       }
> +       if (frame == 0)
> +               return ps_eligible;
> +       depth -= round_up_stack_depth(env, subprog[idx].stack_depth);
> +       frame--;
> +       i = ret_insn[frame];
> +       idx = ret_prog[frame];
> +       goto continue_func;
> +}

This looks very similar to check_max_stack_depth_subprog()
Can you share the code?