On Thu, Oct 31, 2024 at 8:10 PM Yonghong Song <yonghong.song@xxxxxxxxx> wrote: > > For struct_ops progs, whether a particular prog will use private stack > or not (prog->aux->use_priv_stack) will be set before actual insn-level > verification for that prog. One particular implementation is to > piggyback on struct_ops->check_member(). The next patch will have an > example for this. The struct_ops->check_member() will set > prog->aux->use_priv_stack to be true which enables private stack > usage with ignoring BPF_PRIV_STACK_MIN_SIZE limit. > > If use_priv_stack is true for a particular struct_ops prog, bpf > trampoline will need to do recursion checks (one level at this point) > to avoid stack overwrite. A field (recursion_skipped()) is added to > bpf_prog_aux structure such that if bpf_prog->aux->recursion_skipped > is set by the struct_ops subsystem, the function will be called > to terminate the prog run, collect related info, etc. > > Acked-by: Tejun Heo <tj@xxxxxxxxxx> > Signed-off-by: Yonghong Song <yonghong.song@xxxxxxxxx> > --- > include/linux/bpf.h | 1 + > include/linux/bpf_verifier.h | 1 + > kernel/bpf/trampoline.c | 4 ++++ > kernel/bpf/verifier.c | 36 ++++++++++++++++++++++++++++++++---- > 4 files changed, 38 insertions(+), 4 deletions(-) > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index 8a3ea7440a4a..7a34108c6974 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -1528,6 +1528,7 @@ struct bpf_prog_aux { > u64 prog_array_member_cnt; /* counts how many times as member of prog_array */ > struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */ > struct bpf_arena *arena; > + void (*recursion_skipped)(struct bpf_prog *prog); /* callback if recursion is skipped */ The name doesn't fit. The recursion wasn't skipped. It's the execution of the program that was skipped. 'recursion_detected' or 'recursion_disallowed' would be a better name. 
> /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ > const struct btf_type *attach_func_proto; > /* function name for valid attach_btf_id */ > diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h > index bc28ce7996ac..ff0fba935f89 100644 > --- a/include/linux/bpf_verifier.h > +++ b/include/linux/bpf_verifier.h > @@ -889,6 +889,7 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) > case BPF_PROG_TYPE_TRACING: > return prog->expected_attach_type != BPF_TRACE_ITER; > case BPF_PROG_TYPE_STRUCT_OPS: > + return prog->aux->use_priv_stack; > case BPF_PROG_TYPE_LSM: > return false; > default: > diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c > index 9f36c049f4c2..a84e60efbf89 100644 > --- a/kernel/bpf/trampoline.c > +++ b/kernel/bpf/trampoline.c > @@ -899,6 +899,8 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram > > if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { > bpf_prog_inc_misses_counter(prog); > + if (prog->aux->recursion_skipped) > + prog->aux->recursion_skipped(prog); > return 0; > } > return bpf_prog_start_time(); > @@ -975,6 +977,8 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, > > if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { > bpf_prog_inc_misses_counter(prog); > + if (prog->aux->recursion_skipped) > + prog->aux->recursion_skipped(prog); > return 0; > } > return bpf_prog_start_time(); > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c > index 30e74db6a85f..865191c5d21b 100644 > --- a/kernel/bpf/verifier.c > +++ b/kernel/bpf/verifier.c > @@ -6023,17 +6023,31 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, > > static int bpf_enable_priv_stack(struct bpf_verifier_env *env) > { > + bool force_priv_stack = env->prog->aux->use_priv_stack; > struct bpf_subprog_info *si; > + int ret; > + > + if (!bpf_jit_supports_private_stack()) { > + if (force_priv_stack) { > + verbose(env, "Private stack not supported by 
jit\n"); > + return -EACCES; > + } This logic would fit better in patch 2. Less code churn and the whole approach is easier to understand. I don't like this in-band signaling. Now I see why you had that weird <0 check in patch 2 :( This is ugly. Maybe it should be a separate bool request_priv_stack:1 that the struct_ops callback will set and it will clean up this logic. > > - if (!bpf_jit_supports_private_stack()) > return NO_PRIV_STACK; > + } > > + ret = PRIV_STACK_ADAPTIVE; > switch (env->prog->type) { > case BPF_PROG_TYPE_KPROBE: > case BPF_PROG_TYPE_TRACEPOINT: > case BPF_PROG_TYPE_PERF_EVENT: > case BPF_PROG_TYPE_RAW_TRACEPOINT: > break; > + case BPF_PROG_TYPE_STRUCT_OPS: > + if (!force_priv_stack) > + return NO_PRIV_STACK; > + ret = PRIV_STACK_ALWAYS; > + break; > case BPF_PROG_TYPE_TRACING: > if (env->prog->expected_attach_type != BPF_TRACE_ITER) > break; > @@ -6044,11 +6058,18 @@ static int bpf_enable_priv_stack(struct bpf_verifier_env *env) > > si = env->subprog_info; > for (int i = 0; i < env->subprog_cnt; i++) { > - if (si[i].has_tail_call) > + if (si[i].has_tail_call) { > + if (ret == PRIV_STACK_ALWAYS) { > + verbose(env, > + "Private stack not supported due to tail call presence\n"); > + return -EACCES; > + } > + > return NO_PRIV_STACK; > + } > } > > - return PRIV_STACK_ADAPTIVE; > + return ret; > } > > static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth) > @@ -6121,7 +6142,8 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx, > idx, subprog_depth); > return -EACCES; > } > - if (subprog_depth >= BPF_PRIV_STACK_MIN_SIZE) { > + if (priv_stack_supported == PRIV_STACK_ALWAYS || > + subprog_depth >= BPF_PRIV_STACK_MIN_SIZE) { > subprog[idx].use_priv_stack = true; > subprog_visited[idx] = 1; > } > @@ -6271,6 +6293,12 @@ static int check_max_stack_depth(struct bpf_verifier_env *env) > depth_frame, subtree_depth); > return -EACCES; > } > + if (orig_priv_stack_supported == PRIV_STACK_ALWAYS) { > + 
verbose(env, > + "Private stack not supported due to possible nested subprog run\n"); > + ret = -EACCES; > + goto out; > + } > if (orig_priv_stack_supported == PRIV_STACK_ADAPTIVE) { > for (int i = 0; i < env->subprog_cnt; i++) > si[i].use_priv_stack = false; > -- > 2.43.5 >