On Sun, Oct 20, 2024 at 12:14 PM Yonghong Song <yonghong.song@xxxxxxxxx> wrote:
>
> With private stack support, each subprog can have a stack of up to 512
> bytes. The 512-byte per-subprog limit is kept to avoid increasing
> verifier complexity, since going beyond 512 bytes would require big
> verifier changes and increase memory consumption and verification time.
>
> If private stack is supported, for a bpf prog, esp. when it has
> subprogs, private stack will be allocated for the main prog
> and for each callback subprog. For example,
>     main_prog
>       subprog1
>         calling helper
>           subprog10 (callback func)
>             subprog11
>       subprog2
>         calling helper
>           subprog10 (callback func)
>             subprog11
>
> Separate private allocations for main_prog and callback_fn subprog10
> will make things easier since the helper function uses the kernel stack.
>
> In this patch, some tracing programs are allowed to use private
> stack since a tracing prog may be triggered in the middle of some other
> prog's run. Additional subprog info is also collected so that private
> stack can later be allocated for the main prog and each callback function.
>
> Note that if any tail_call is used in the prog (including any of its
> subprogs), then private stack is not used.
>
> Signed-off-by: Yonghong Song <yonghong.song@xxxxxxxxx>
> ---
>  include/linux/bpf.h          |   1 +
>  include/linux/bpf_verifier.h |   3 ++
>  include/linux/filter.h       |   1 +
>  kernel/bpf/core.c            |   5 ++
>  kernel/bpf/verifier.c        | 100 ++++++++++++++++++++++++++++++-----
>  5 files changed, 97 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 0c216e71cec7..6ad8ace7075a 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1490,6 +1490,7 @@ struct bpf_prog_aux {
>          bool exception_cb;
>          bool exception_boundary;
>          bool is_extended; /* true if extended by freplace program */
> +        bool priv_stack_eligible;
>          u64 prog_array_member_cnt; /* counts how many times as member of prog_array */
>          struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */
>          struct bpf_arena *arena;
> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> index 4513372c5bc8..bcfe868e3801 100644
> --- a/include/linux/bpf_verifier.h
> +++ b/include/linux/bpf_verifier.h
> @@ -659,6 +659,8 @@ struct bpf_subprog_info {
>           * are used for bpf_fastcall spills and fills.
>           */
>          s16 fastcall_stack_off;
> +        u16 subtree_stack_depth;
> +        u16 subtree_top_idx;
>          bool has_tail_call: 1;
>          bool tail_call_reachable: 1;
>          bool has_ld_abs: 1;
> @@ -668,6 +670,7 @@ struct bpf_subprog_info {
>          bool args_cached: 1;
>          /* true if bpf_fastcall stack region is used by functions that can't be inlined */
>          bool keep_fastcall_stack: 1;
> +        bool priv_stack_eligible: 1;
>
>          u8 arg_cnt;
>          struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 7d7578a8eac1..3a21947f2fd4 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -1119,6 +1119,7 @@ bool bpf_jit_supports_exceptions(void);
>  bool bpf_jit_supports_ptr_xchg(void);
>  bool bpf_jit_supports_arena(void);
>  bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena);
> +bool bpf_jit_supports_private_stack(void);
>  u64 bpf_arch_uaddress_limit(void);
>  void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
>  bool bpf_helper_changes_pkt_data(void *func);
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 233ea78f8f1b..14d9288441f2 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -3045,6 +3045,11 @@ bool __weak bpf_jit_supports_exceptions(void)
>          return false;
>  }
>
> +bool __weak bpf_jit_supports_private_stack(void)
> +{
> +        return false;
> +}
> +
>  void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
>  {
>  }
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index f514247ba8ba..45bea4066272 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -194,6 +194,8 @@ struct bpf_verifier_stack_elem {
>
>  #define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512
>
> +#define BPF_PRIV_STACK_MIN_SUBTREE_SIZE 128
> +
>  static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
>  static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
>  static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
> @@ -5982,6 +5984,41 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
>                                 strict);
>  }
>
> +static bool bpf_enable_private_stack(struct bpf_verifier_env *env)
> +{
> +        if (!bpf_jit_supports_private_stack())
> +                return false;
> +
> +        switch (env->prog->type) {
> +        case BPF_PROG_TYPE_KPROBE:
> +        case BPF_PROG_TYPE_TRACEPOINT:
> +        case BPF_PROG_TYPE_PERF_EVENT:
> +        case BPF_PROG_TYPE_RAW_TRACEPOINT:
> +                return true;
> +        case BPF_PROG_TYPE_TRACING:
> +                if (env->prog->expected_attach_type != BPF_TRACE_ITER)
> +                        return true;
> +                fallthrough;
> +        default:
> +                return false;
> +        }
> +}
> +
> +static bool is_priv_stack_supported(struct bpf_verifier_env *env)
> +{
> +        struct bpf_subprog_info *si = env->subprog_info;
> +        bool has_tail_call = false;
> +
> +        for (int i = 0; i < env->subprog_cnt; i++) {
> +                if (si[i].has_tail_call) {
> +                        has_tail_call = true;
> +                        break;
> +                }
> +        }
> +
> +        return !has_tail_call && bpf_enable_private_stack(env);
> +}
> +
>  static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
>  {
>          if (env->prog->jit_requested)
> @@ -5999,16 +6036,21 @@ static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
>   * Since recursion is prevented by check_cfg() this algorithm
>   * only needs a local stack of MAX_CALL_FRAMES to remember callsites
>   */
> -static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
> +static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
> +                                         bool check_priv_stack, bool priv_stack_supported)
>  {
>          struct bpf_subprog_info *subprog = env->subprog_info;
>          struct bpf_insn *insn = env->prog->insnsi;
>          int depth = 0, frame = 0, i, subprog_end;
>          bool tail_call_reachable = false;
> +        bool priv_stack_eligible = false;
>          int ret_insn[MAX_CALL_FRAMES];
>          int ret_prog[MAX_CALL_FRAMES];
> -        int j;
> +        int j, subprog_stack_depth;
> +        int orig_idx = idx;
>
> +        if (check_priv_stack)
> +                subprog[idx].subtree_top_idx = idx;
>          i = subprog[idx].start;
>  process_func:
>          /* protect against potential stack overflow that might happen when
> @@ -6030,18 +6072,33 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
>           * tailcall will unwind the current stack frame but it will not get rid
>           * of caller's stack as shown on the example above.
>           */
> -        if (idx && subprog[idx].has_tail_call && depth >= 256) {
> +        if (!check_priv_stack && idx && subprog[idx].has_tail_call && depth >= 256) {
>                  verbose(env,
>                          "tail_calls are not allowed when call stack of previous frames is %d bytes. Too large\n",
>                          depth);
>                  return -EACCES;
>          }
> -        depth += round_up_stack_depth(env, subprog[idx].stack_depth);
> -        if (depth > MAX_BPF_STACK) {
> +        subprog_stack_depth = round_up_stack_depth(env, subprog[idx].stack_depth);
> +        depth += subprog_stack_depth;
> +        if (!check_priv_stack && !priv_stack_supported && depth > MAX_BPF_STACK) {
>                  verbose(env, "combined stack size of %d calls is %d. Too large\n",
>                          frame + 1, depth);
>                  return -EACCES;
>          }
> +        if (check_priv_stack) {
> +                if (subprog_stack_depth > MAX_BPF_STACK) {
> +                        verbose(env, "stack size of subprog %d is %d. Too large\n",
> +                                idx, subprog_stack_depth);
> +                        return -EACCES;
> +                }
> +
> +                if (!priv_stack_eligible && depth >= BPF_PRIV_STACK_MIN_SUBTREE_SIZE) {
> +                        subprog[orig_idx].priv_stack_eligible = true;
> +                        env->prog->aux->priv_stack_eligible = priv_stack_eligible = true;
> +                }
> +                subprog[orig_idx].subtree_stack_depth =
> +                        max_t(u16, subprog[orig_idx].subtree_stack_depth, depth);
> +        }
>  continue_func:
>          subprog_end = subprog[idx + 1].start;
>          for (; i < subprog_end; i++) {
> @@ -6078,6 +6135,12 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
>                  next_insn = i + insn[i].imm + 1;
>                  sidx = find_subprog(env, next_insn);
>                  if (sidx < 0) {
> +                        /* It is possible that callback func has been removed as dead code after
> +                         * instruction rewrites, e.g. bpf_loop with cnt 0.
> +                         */
> +                        if (check_priv_stack)
> +                                continue;
> +

and this extra hack only because check_max_stack_depth() will be called
the 2nd time ?
Why call it twice at all ?
Record everything in the first pass.

>                          WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
>                                    next_insn);
>                          return -EFAULT;
> @@ -6097,8 +6160,10 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
>                  }
>                  i = next_insn;
>                  idx = sidx;
> +                if (check_priv_stack)
> +                        subprog[idx].subtree_top_idx = orig_idx;
>
> -                if (subprog[idx].has_tail_call)
> +                if (!check_priv_stack && subprog[idx].has_tail_call)
>                          tail_call_reachable = true;
>
>                  frame++;
> @@ -6122,7 +6187,7 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
>                  }
>                  subprog[ret_prog[j]].tail_call_reachable = true;
>          }
> -        if (subprog[0].tail_call_reachable)
> +        if (!check_priv_stack && subprog[0].tail_call_reachable)
>                  env->prog->aux->tail_call_reachable = true;
>
>          /* end of for() loop means the last insn of the 'subprog'
> @@ -6137,14 +6202,18 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
>          goto continue_func;
>  }
>
> -static int check_max_stack_depth(struct bpf_verifier_env *env)
> +static int check_max_stack_depth(struct bpf_verifier_env *env, bool check_priv_stack,
> +                                 bool priv_stack_supported)
>  {
>          struct bpf_subprog_info *si = env->subprog_info;
> +        bool check_subprog;
>          int ret;
>
>          for (int i = 0; i < env->subprog_cnt; i++) {
> -                if (!i || si[i].is_async_cb) {
> -                        ret = check_max_stack_depth_subprog(env, i);
> +                check_subprog = !i || (check_priv_stack ? si[i].is_cb : si[i].is_async_cb);

why? This looks very suspicious.

> +                if (check_subprog) {
> +                        ret = check_max_stack_depth_subprog(env, i, check_priv_stack,
> +                                                            priv_stack_supported);
>                          if (ret < 0)
>                                  return ret;
>                  }
> @@ -22303,7 +22372,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
>          struct bpf_verifier_env *env;
>          int i, len, ret = -EINVAL, err;
>          u32 log_true_size;
> -        bool is_priv;
> +        bool is_priv, priv_stack_supported = false;
>
>          /* no program is valid */
>          if (ARRAY_SIZE(bpf_verifier_ops) == 0)
> @@ -22430,8 +22499,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
>          if (ret == 0)
>                  ret = remove_fastcall_spills_fills(env);
>
> -        if (ret == 0)
> -                ret = check_max_stack_depth(env);
> +        if (ret == 0) {
> +                priv_stack_supported = is_priv_stack_supported(env);
> +                ret = check_max_stack_depth(env, false, priv_stack_supported);
> +        }
>
>          /* instruction rewrites happen after this point */
>          if (ret == 0)
> @@ -22465,6 +22536,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
>                          : false;
>          }
>
> +        if (ret == 0 && priv_stack_supported)
> +                ret = check_max_stack_depth(env, true, true);
> +
>          if (ret == 0)
>                  ret = fixup_call_args(env);
>
> --
> 2.43.5
>
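
To make the "record everything in the first pass" point concrete, here is a
rough, untested sketch that reuses the fields this patch adds. The helper name
record_priv_stack_info() and its exact call site are made up for illustration
only; the idea is to call it from process_func right after 'depth' is updated,
with priv_stack_supported computed once in bpf_check() before the walk, so the
second check_max_stack_depth() invocation is not needed:

static void record_priv_stack_info(struct bpf_verifier_env *env, int orig_idx,
                                   int idx, int depth, bool priv_stack_supported)
{
        struct bpf_subprog_info *subprog = env->subprog_info;

        if (!priv_stack_supported)
                return;

        /* remember which subtree root this subprog was reached from */
        subprog[idx].subtree_top_idx = orig_idx;

        /* track the max stack usage of the subtree rooted at orig_idx */
        subprog[orig_idx].subtree_stack_depth =
                max_t(u16, subprog[orig_idx].subtree_stack_depth, depth);

        /* deep enough subtrees become eligible for a private stack */
        if (depth >= BPF_PRIV_STACK_MIN_SUBTREE_SIZE) {
                subprog[orig_idx].priv_stack_eligible = true;
                env->prog->aux->priv_stack_eligible = true;
        }
}

How the subtree roots are picked (async callbacks only vs. all callbacks) is a
separate question, see the is_cb comment above.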