On Thu, Oct 31, 2024 at 8:12 PM Yonghong Song <yonghong.song@xxxxxxxxx> wrote:
>
> With private stack support, each subprog can have a stack of up to 512
> bytes. The limit of 512 bytes per subprog is kept to avoid increasing
> verifier complexity, since allowing more than 512 bytes would require big
> verifier changes and would increase memory consumption and verification
> time.
>
> If private stack is supported and a certain stack size threshold is
> reached, that subprog will have its own private stack allocated.
>
> In this patch, some tracing programs are allowed to use private stack,
> since a tracing prog may be triggered in the middle of another prog's
> run. The supported tracing programs already have a recursion check such
> that if the same prog runs on the same cpu again, the nested prog run
> is skipped. This ensures the bpf prog private stack is not overwritten.
>
> Note that if any tail_call is used in the prog (including any of its
> subprogs), then private stack is not used.
>
> The return values of bpf_enable_priv_stack() include NO_PRIV_STACK,
> PRIV_STACK_ADAPTIVE, PRIV_STACK_ALWAYS and negative errors.
> NO_PRIV_STACK means priv stack is not enabled, PRIV_STACK_ADAPTIVE means
> priv stack is enabled when certain conditions are met (e.g. the stack
> size threshold), and PRIV_STACK_ALWAYS means priv stack is always
> enabled. A negative error represents a verification failure.
> PRIV_STACK_ALWAYS and the negative errors will be used by struct_ops
> progs in later patches.
>
> Signed-off-by: Yonghong Song <yonghong.song@xxxxxxxxx>
> ---
>  include/linux/bpf.h          |  1 +
>  include/linux/bpf_verifier.h |  1 +
>  include/linux/filter.h       |  1 +
>  kernel/bpf/core.c            |  5 +++
>  kernel/bpf/verifier.c        | 75 ++++++++++++++++++++++++++++++++----
>  5 files changed, 75 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index c3ba4d475174..8db3c5d7404b 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1523,6 +1523,7 @@ struct bpf_prog_aux {
>          bool exception_cb;
>          bool exception_boundary;
>          bool is_extended; /* true if extended by freplace program */
> +        bool use_priv_stack;
>          u64 prog_array_member_cnt; /* counts how many times as member of prog_array */
>          struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */
>          struct bpf_arena *arena;
> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> index 4513372c5bc8..bc28ce7996ac 100644
> --- a/include/linux/bpf_verifier.h
> +++ b/include/linux/bpf_verifier.h
> @@ -668,6 +668,7 @@ struct bpf_subprog_info {
>          bool args_cached: 1;
>          /* true if bpf_fastcall stack region is used by functions that can't be inlined */
>          bool keep_fastcall_stack: 1;
> +        bool use_priv_stack: 1;
>
>          u8 arg_cnt;
>          struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 7d7578a8eac1..3a21947f2fd4 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -1119,6 +1119,7 @@ bool bpf_jit_supports_exceptions(void);
>  bool bpf_jit_supports_ptr_xchg(void);
>  bool bpf_jit_supports_arena(void);
>  bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena);
> +bool bpf_jit_supports_private_stack(void);
>  u64 bpf_arch_uaddress_limit(void);
>  void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
>  bool bpf_helper_changes_pkt_data(void *func);
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 233ea78f8f1b..14d9288441f2 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -3045,6 +3045,11 @@ bool __weak bpf_jit_supports_exceptions(void)
>          return false;
>  }
>
> +bool __weak bpf_jit_supports_private_stack(void)
> +{
> +        return false;
> +}
> +
>  void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
>  {
>  }
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 89b0a980d0f9..d3f4cbab97bc 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -194,6 +194,8 @@ struct bpf_verifier_stack_elem {
>
>  #define BPF_GLOBAL_PERCPU_MA_MAX_SIZE  512
>
> +#define BPF_PRIV_STACK_MIN_SIZE        64
> +
>  static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
>  static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
>  static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
> @@ -6015,6 +6017,40 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
>                                 strict);
>  }
>
> +#define NO_PRIV_STACK 0
> +#define PRIV_STACK_ADAPTIVE 1
> +#define PRIV_STACK_ALWAYS 2

Please use enum.
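
Something like this, for example (the type name is just a suggestion,
and since the commit log reserves negative errors for the later
struct_ops patches, those would still need a separate int return or an
out parameter if the function returns the enum directly):

        /* possible enum replacing the three #defines above */
        enum priv_stack_mode {
                NO_PRIV_STACK,
                PRIV_STACK_ADAPTIVE,
                PRIV_STACK_ALWAYS,
        };

        static enum priv_stack_mode bpf_enable_priv_stack(struct bpf_verifier_env *env)
        {
                /* ... same body as below, returning the enum values ... */
        }

That also makes checks like "priv_stack_supported < 0" stand out
immediately (see the comment further down).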

> +
> +static int bpf_enable_priv_stack(struct bpf_verifier_env *env)
> +{
> +        struct bpf_subprog_info *si;
> +
> +        if (!bpf_jit_supports_private_stack())
> +                return NO_PRIV_STACK;
> +
> +        switch (env->prog->type) {
> +        case BPF_PROG_TYPE_KPROBE:
> +        case BPF_PROG_TYPE_TRACEPOINT:
> +        case BPF_PROG_TYPE_PERF_EVENT:
> +        case BPF_PROG_TYPE_RAW_TRACEPOINT:
> +                break;
> +        case BPF_PROG_TYPE_TRACING:
> +                if (env->prog->expected_attach_type != BPF_TRACE_ITER)
> +                        break;
> +                fallthrough;
> +        default:
> +                return NO_PRIV_STACK;
> +        }

Probably worth adding:

        if (!bpf_prog_check_recur(env->prog))
                return NO_PRIV_STACK;

and removing the BPF_PROG_TYPE_TRACING case, with a comment that
bpf_prog_check_recur() checks all prog types that use a bpf trampoline,
while kprobe/tp/raw_tp don't use a trampoline and hence are checked
explicitly.
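
Spelled out, that could look roughly like this (untested; note that, if
I read bpf_prog_check_recur() right, it also returns true for LSM and
struct_ops progs, so this shape would admit more prog types than the
current patch unless those are filtered elsewhere):

        switch (env->prog->type) {
        case BPF_PROG_TYPE_KPROBE:
        case BPF_PROG_TYPE_TRACEPOINT:
        case BPF_PROG_TYPE_PERF_EVENT:
        case BPF_PROG_TYPE_RAW_TRACEPOINT:
                /* no bpf trampoline, but these have their own
                 * recursion protection, so allow them explicitly
                 */
                break;
        default:
                /* bpf_prog_check_recur() covers the trampoline based
                 * prog types that have recursion protection
                 */
                if (!bpf_prog_check_recur(env->prog))
                        return NO_PRIV_STACK;
                break;
        }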

> +
> +        si = env->subprog_info;
> +        for (int i = 0; i < env->subprog_cnt; i++) {
> +                if (si[i].has_tail_call)
> +                        return NO_PRIV_STACK;
> +        }
> +
> +        return PRIV_STACK_ADAPTIVE;
> +}
> +
>  static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
>  {
>          if (env->prog->jit_requested)
> @@ -6033,11 +6069,12 @@ static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
>   * only needs a local stack of MAX_CALL_FRAMES to remember callsites
>   */
>  static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
> -                                         int *subtree_depth, int *depth_frame)
> +                                         int *subtree_depth, int *depth_frame,
> +                                         int priv_stack_supported)
>  {
>          struct bpf_subprog_info *subprog = env->subprog_info;
>          struct bpf_insn *insn = env->prog->insnsi;
> -        int depth = 0, frame = 0, i, subprog_end;
> +        int depth = 0, frame = 0, i, subprog_end, subprog_depth;
>          bool tail_call_reachable = false;
>          int ret_insn[MAX_CALL_FRAMES];
>          int ret_prog[MAX_CALL_FRAMES];
> @@ -6070,11 +6107,23 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
>                                  depth);
>                          return -EACCES;
>                  }
> -                depth += round_up_stack_depth(env, subprog[idx].stack_depth);
> +                subprog_depth = round_up_stack_depth(env, subprog[idx].stack_depth);
> +                depth += subprog_depth;
>                  if (depth > MAX_BPF_STACK && !*subtree_depth) {
>                          *subtree_depth = depth;
>                          *depth_frame = frame + 1;
>                  }
> +                if (priv_stack_supported != NO_PRIV_STACK) {
> +                        if (!subprog[idx].use_priv_stack) {
> +                                if (subprog_depth > MAX_BPF_STACK) {
> +                                        verbose(env, "stack size of subprog %d is %d. Too large\n",
> +                                                idx, subprog_depth);
> +                                        return -EACCES;
> +                                }
> +                                if (subprog_depth >= BPF_PRIV_STACK_MIN_SIZE)
> +                                        subprog[idx].use_priv_stack = true;
> +                        }
> +                }
>  continue_func:
>                  subprog_end = subprog[idx + 1].start;
>                  for (; i < subprog_end; i++) {
> @@ -6174,19 +6223,29 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
>  {
>          struct bpf_subprog_info *si = env->subprog_info;
>          int ret, subtree_depth = 0, depth_frame;
> +        int priv_stack_supported;
> +
> +        priv_stack_supported = bpf_enable_priv_stack(env);
> +        if (priv_stack_supported < 0)
> +                return priv_stack_supported;

If it was an enum, the compiler would have warned that the above check
is meaningless.

>
>          for (int i = 0; i < env->subprog_cnt; i++) {
>                  if (!i || si[i].is_async_cb) {
> -                        ret = check_max_stack_depth_subprog(env, i, &subtree_depth, &depth_frame);
> +                        ret = check_max_stack_depth_subprog(env, i, &subtree_depth, &depth_frame,
> +                                                            priv_stack_supported);
>                          if (ret < 0)
>                                  return ret;
>                  }
>          }
> -        if (subtree_depth > MAX_BPF_STACK) {
> -                verbose(env, "combined stack size of %d calls is %d. Too large\n",
> -                        depth_frame, subtree_depth);
> -                return -EACCES;
> +        if (priv_stack_supported == NO_PRIV_STACK) {
> +                if (subtree_depth > MAX_BPF_STACK) {

No need for the extra indent. Use:

        if (priv_stack_supported == NO_PRIV_STACK && subtree_depth > MAX_BPF_STACK) {

> +                        verbose(env, "combined stack size of %d calls is %d. Too large\n",
> +                                depth_frame, subtree_depth);
> +                        return -EACCES;
> +                }
>          }
> +        if (si[0].use_priv_stack)
> +                env->prog->aux->use_priv_stack = true;
>          return 0;
>  }

pw-bot: cr