A new kfunc, bpf_prog_call(), is introduced so that a bpf prog can call
another bpf prog. It takes the same parameters as bpf_tail_call(), but
behaves like a normal function call: control returns to the caller after
the callee finishes.

Because bpf_prog_call() may recurse back into the caller prog itself, a
prog that calls bpf_prog_call() uses private stacks with a maximum
recursion level of 4; the per-cpu private stack is therefore sized as
stack_depth * BPF_MAX_PRIV_STACK_NEST_LEVEL. Four levels of recursion
should be sufficient for most cases.

bpf_prog_call() cannot be used if a tail_call exists in the same prog,
since tail_call does not use the private stack. If both a prog_call and
a tail_call appear in the same prog, verification will fail.
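
From the bpf prog side, usage is expected to look roughly like the
sketch below (illustrative only, not part of this patch: the prog and
map names are made up, and the extern declaration simply mirrors the
kfunc signature added to helpers.c):

  #include <vmlinux.h>
  #include <bpf/bpf_helpers.h>

  /* Hypothetical declaration of the new kfunc. */
  extern int bpf_prog_call(void *ctx, struct bpf_map *map, __u32 index) __ksym;

  struct {
          __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
          __uint(max_entries, 1);
          __uint(key_size, sizeof(__u32));
          __uint(value_size, sizeof(__u32));
  } prog_array SEC(".maps");

  SEC("tc")
  int caller(struct __sk_buff *skb)
  {
          int ret;

          /* Unlike bpf_tail_call(), execution resumes here after the
           * callee returns, and the callee's return value is visible.
           */
          ret = bpf_prog_call(skb, (struct bpf_map *)&prog_array, 0);
          if (ret < 0)    /* empty slot, bad index, or wrong map type */
                  return 0;
          return ret;
  }

  char _license[] SEC("license") = "GPL";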

Signed-off-by: Yonghong Song <yonghong.song@xxxxxxxxx>
---
 include/linux/bpf.h   |  2 ++
 kernel/bpf/core.c     |  7 +++++--
 kernel/bpf/helpers.c  | 20 ++++++++++++++++++++
 kernel/bpf/verifier.c | 30 ++++++++++++++++++++++++++----
 4 files changed, 53 insertions(+), 6 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f22ddb423fd0..952cb398eb30 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1493,6 +1493,7 @@ struct bpf_prog_aux {
         bool exception_cb;
         bool exception_boundary;
         bool priv_stack_eligible;
+        bool has_prog_call;
         struct bpf_arena *arena;
         /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
         const struct btf_type *attach_func_proto;
@@ -1929,6 +1930,7 @@ struct bpf_array {

 #define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */
 #define MAX_TAIL_CALL_CNT 33
+#define BPF_MAX_PRIV_STACK_NEST_LEVEL 4

 /* Maximum number of loops for bpf_loop and bpf_iter_num.
  * It's enum to expose it (and thus make it discoverable) through BTF.
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f79d951a061f..0d2c97f63ecf 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2426,10 +2426,13 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
                         fp->aux->priv_stack_mode = NO_PRIV_STACK;
                 } else {
                         void __percpu *priv_stack_ptr;
+                        int nest_level = 1;

+                        if (fp->aux->has_prog_call)
+                                nest_level = BPF_MAX_PRIV_STACK_NEST_LEVEL;
                         fp->aux->priv_stack_mode = PRIV_STACK_ROOT_PROG;
-                        priv_stack_ptr =
-                                __alloc_percpu_gfp(fp->aux->stack_depth, 8, GFP_KERNEL);
+                        priv_stack_ptr = __alloc_percpu_gfp(
+                                fp->aux->stack_depth * nest_level, 8, GFP_KERNEL);
                         if (!priv_stack_ptr) {
                                 *err = -ENOMEM;
                                 return fp;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 4053f279ed4c..9cc880dc213e 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2749,6 +2749,25 @@ __bpf_kfunc void bpf_rcu_read_unlock(void)
         rcu_read_unlock();
 }

+__bpf_kfunc int bpf_prog_call(void *ctx, struct bpf_map *p__map, u32 index)
+{
+        struct bpf_array *array;
+        struct bpf_prog *prog;
+
+        if (p__map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+                return -EINVAL;
+
+        array = container_of(p__map, struct bpf_array, map);
+        if (unlikely(index >= array->map.max_entries))
+                return -E2BIG;
+
+        prog = READ_ONCE(array->ptrs[index]);
+        if (!prog)
+                return -ENOENT;
+
+        return bpf_prog_run(prog, ctx);
+}
+
 struct bpf_throw_ctx {
         struct bpf_prog_aux *aux;
         u64 sp;
@@ -3035,6 +3054,7 @@ BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
 #endif
 BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_throw)
+BTF_ID_FLAGS(func, bpf_prog_call)
 BTF_KFUNCS_END(generic_btf_ids)

 static const struct btf_kfunc_id_set generic_kfunc_set = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 46b0c277c6a8..e3d9820618a1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5986,6 +5986,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,

 static bool bpf_enable_private_stack(struct bpf_prog *prog)
 {
+        if (prog->aux->has_prog_call)
+                return true;
+
         if (!bpf_jit_supports_private_stack())
                 return false;

@@ -6092,7 +6095,9 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
                 return -EACCES;
         }

-        if (!priv_stack_eligible && depth >= BPF_PRIV_STACK_MIN_SUBTREE_SIZE) {
+        if (!priv_stack_eligible &&
+            (depth >= BPF_PRIV_STACK_MIN_SUBTREE_SIZE ||
+             env->prog->aux->has_prog_call)) {
                 subprog[orig_idx].priv_stack_eligible = true;
                 env->prog->aux->priv_stack_eligible = priv_stack_eligible = true;
         }
@@ -6181,8 +6186,13 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
                 }
                 subprog[ret_prog[j]].tail_call_reachable = true;
         }
-        if (!check_priv_stack && subprog[0].tail_call_reachable)
+        if (!check_priv_stack && subprog[0].tail_call_reachable) {
+                if (env->prog->aux->has_prog_call) {
+                        verbose(env, "cannot do prog call and tail call in the same prog\n");
+                        return -EINVAL;
+                }
                 env->prog->aux->tail_call_reachable = true;
+        }

         /* end of for() loop means the last insn of the 'subprog'
          * was reached. Doesn't matter whether it was JA or EXIT
@@ -11322,6 +11332,7 @@ enum special_kfunc_type {
         KF_bpf_preempt_enable,
         KF_bpf_iter_css_task_new,
         KF_bpf_session_cookie,
+        KF_bpf_prog_call,
 };

 BTF_SET_START(special_kfunc_set)
@@ -11387,6 +11398,7 @@ BTF_ID(func, bpf_session_cookie)
 #else
 BTF_ID_UNUSED
 #endif
+BTF_ID(func, bpf_prog_call)

 static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
 {
@@ -11433,6 +11445,11 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
         if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx])
                 return KF_ARG_PTR_TO_CTX;

+        if (meta->func_id == special_kfunc_list[KF_bpf_prog_call] && argno == 0) {
+                env->prog->aux->has_prog_call = true;
+                return KF_ARG_PTR_TO_CTX;
+        }
+
         /* In this function, we verify the kfunc's BTF as per the argument type,
          * leaving the rest of the verification with respect to the register
          * type to our caller. When a set of conditions hold in the BTF type of
@@ -20009,6 +20026,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
         struct bpf_insn *insn;
         void *old_bpf_func;
         int err, num_exentries;
+        int nest_level = 1;

         if (env->subprog_cnt <= 1)
                 return 0;
@@ -20099,9 +20117,13 @@ static int jit_subprogs(struct bpf_verifier_env *env)
                 } else if (!subtree_stack_depth) {
                         func[i]->aux->priv_stack_mode = PRIV_STACK_ROOT_PROG;
                 } else {
+                        if (env->prog->aux->has_prog_call) {
+                                func[i]->aux->has_prog_call = true;
+                                nest_level = BPF_MAX_PRIV_STACK_NEST_LEVEL;
+                        }
                         func[i]->aux->priv_stack_mode = PRIV_STACK_ROOT_PROG;
-                        priv_stack_ptr =
-                                __alloc_percpu_gfp(subtree_stack_depth, 8, GFP_KERNEL);
+                        priv_stack_ptr = __alloc_percpu_gfp(
+                                subtree_stack_depth * nest_level, 8, GFP_KERNEL);
                         if (!priv_stack_ptr) {
                                 err = -ENOMEM;
                                 goto out_free;
--
2.43.5