Enable handling of callbacks that throw within helpers and kfuncs that
take them. The problem with helpers is that the exception state can be
clobbered by a tracing program invoked between the exception check done
after the callback returns and the return back to the BPF program.
Hence, helpers always return -EJUKEBOX whenever they detect an
exception. Only one kfunc takes a callback (bpf_rbtree_add), so it is
marked notrace to avoid this pitfall. This allows us to use
bpf_get_exception to detect the thrown case, and if the exception event
is missed, the return code is used to unwind.

TODO: It might be possible to simply check the return code for helpers
and kfuncs taking callbacks (i.e., with check_helper_ret_code = true)
and not bother with exception state at all. This should lead to less
code being generated per call site. For all other cases, we can rely on
bpf_get_exception, and ensure that the helper/kfunc uses notrace to
avoid invocation of tracing programs that clobber exception state on
the return path. But make this change in v2 after ensuring the
current->bpf_exception_thrown approach is acceptable.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@xxxxxxxxx>
---
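Note for reviewers (not part of the commit log): every map/iterator
helper touched below follows the same pattern after invoking the
callback. The exact placement differs per helper (some must unlock
before returning), but the gist is sketched here with a made-up
bpf_for_each_example() helper and a fixed element count standing in for
the real ones. A pseudo-C rendering of the instruction sequence the
verifier emits around such calls is appended after the diff.

  static long bpf_for_each_example(bpf_callback_t callback_fn, void *callback_ctx)
  {
  	long num_elems = 0, ret = 0;

  	while (num_elems < 4) {
  		num_elems++;
  		ret = callback_fn((u64)(long)callback_ctx, 0, 0, 0, 0);
  		/* Detect a throw from the callback here, before any tracing
  		 * program can run on the helper's return path, and convert
  		 * it into -EJUKEBOX so the caller unwinds regardless.
  		 */
  		if (bpf_get_exception())
  			return -EJUKEBOX;
  		/* return value: 0 - continue, 1 - stop and return */
  		if (ret)
  			break;
  	}
  	return num_elems;
  }
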
 include/linux/bpf_verifier.h |   1 +
 kernel/bpf/arraymap.c        |   4 +-
 kernel/bpf/bpf_iter.c        |   2 +
 kernel/bpf/hashtab.c         |   4 +-
 kernel/bpf/helpers.c         |  18 +++--
 kernel/bpf/ringbuf.c         |   4 ++
 kernel/bpf/task_iter.c       |   2 +
 kernel/bpf/verifier.c        | 129 +++++++++++++++++++++++++++++++++++
 8 files changed, 156 insertions(+), 8 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index bc067223d3ee..a5346a2b7e68 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -485,6 +485,7 @@ struct bpf_insn_aux_data {
 	bool zext_dst; /* this insn zero extends dst reg */
 	bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */
 	bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */
+	bool skip_patch_call_imm; /* Skip patch_call_imm phase in do_misc_fixups */
 	u8 alu_state; /* used in combination with alu_limit */
 
 	/* below fields are initialized once */
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index de0eadf8706f..6c0c5e726ebf 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -711,6 +711,8 @@ static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn,
 		key = i;
 		ret = callback_fn((u64)(long)map, (u64)(long)&key,
 				  (u64)(long)val, (u64)(long)callback_ctx, 0);
+		if (bpf_get_exception())
+			ret = -EJUKEBOX;
 		/* return value: 0 - continue, 1 - stop and return */
 		if (ret)
 			break;
@@ -718,7 +720,7 @@ static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn,
 
 	if (is_percpu)
 		migrate_enable();
-	return num_elems;
+	return ret == -EJUKEBOX ? ret : num_elems;
 }
 
 static u64 array_map_mem_usage(const struct bpf_map *map)
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 96856f130cbf..6e4e4b6213f8 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -759,6 +759,8 @@ BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
 
 	for (i = 0; i < nr_loops; i++) {
 		ret = callback((u64)i, (u64)(long)callback_ctx, 0, 0, 0);
+		if (bpf_get_exception())
+			return -EJUKEBOX;
 		/* return value: 0 - continue, 1 - stop and return */
 		if (ret)
 			return i + 1;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 00c253b84bf5..5e70151e0414 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -2178,6 +2178,8 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
 			num_elems++;
 			ret = callback_fn((u64)(long)map, (u64)(long)key,
 					  (u64)(long)val, (u64)(long)callback_ctx, 0);
+			if (bpf_get_exception())
+				ret = -EJUKEBOX;
 			/* return value: 0 - continue, 1 - stop and return */
 			if (ret) {
 				rcu_read_unlock();
@@ -2189,7 +2191,7 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
 out:
 	if (is_percpu)
 		migrate_enable();
-	return num_elems;
+	return ret == -EJUKEBOX ? ret : num_elems;
 }
 
 static u64 htab_map_mem_usage(const struct bpf_map *map)
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 89e70907257c..82db3a64fa3f 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1982,10 +1982,11 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
 }
 
 /* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF
- * program
+ * program.
+ * Marked notrace to avoid clobbering of exception state in current by BPF
+ * programs.
  */
-static void __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
-			     void *less)
+static notrace void __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, void *less)
 {
 	struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node;
 	bpf_callback_t cb = (bpf_callback_t)less;
@@ -1993,8 +1994,13 @@ static void __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
 	bool leftmost = true;
 
 	while (*link) {
+		u64 cb_res;
+
 		parent = *link;
-		if (cb((uintptr_t)node, (uintptr_t)parent, 0, 0, 0)) {
+		cb_res = cb((uintptr_t)node, (uintptr_t)parent, 0, 0, 0);
+		if (bpf_get_exception())
+			return;
+		if (cb_res) {
 			link = &parent->rb_left;
 		} else {
 			link = &parent->rb_right;
@@ -2007,8 +2013,8 @@ static void __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
 				    (struct rb_root_cached *)root, leftmost);
 }
 
-__bpf_kfunc void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
-				bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b))
+__bpf_kfunc notrace void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
+					bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b))
 {
 	__bpf_rbtree_add(root, node, (void *)less);
 }
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 875ac9b698d9..7f6764ae4fff 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -766,6 +766,10 @@ BPF_CALL_4(bpf_user_ringbuf_drain, struct bpf_map *, map,
 
 		bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);
 		ret = callback((uintptr_t)&dynptr, (uintptr_t)callback_ctx, 0, 0, 0);
+		if (bpf_get_exception()) {
+			ret = -EJUKEBOX;
+			goto schedule_work_return;
+		}
 		__bpf_user_ringbuf_sample_release(rb, size, flags);
 	}
 	ret = samples - discarded_samples;
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index c4ab9d6cdbe9..6e8667f03784 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -807,6 +807,8 @@ BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
 		callback_fn((u64)(long)task, (u64)(long)vma,
 			    (u64)(long)callback_ctx, 0, 0);
 		ret = 0;
+		if (bpf_get_exception())
+			ret = -EJUKEBOX;
 	}
 	bpf_mmap_unlock_mm(work, mm);
 	return ret;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6981d8817c71..07d808b05044 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9053,6 +9053,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		}
 	}
 
+	/* For each helper call which invokes a callback that may throw, the
+	 * callback will propagate the thrown exception to us. For helpers, we
+	 * check the return code in addition to the exception state, as it may
+	 * be reset between detection and return within the kernel. Note that
+	 * async callbacks (passed to bpf_timer_set_callback) are not included
+	 * because exceptions won't be propagated from them.
+	 */
+	if (is_callback_calling_function(meta.func_id) &&
+	    meta.func_id != BPF_FUNC_timer_set_callback) {
+		struct bpf_throw_state *ts = &env->insn_aux_data[insn_idx].throw_state;
+
+		/* Check for -EJUKEBOX in case exception state is clobbered by
+		 * some other program executing between bpf_get_exception and
+		 * the return from the helper.
+		 */
+		if (base_type(fn->ret_type) == RET_INTEGER)
+			ts->check_helper_ret_code = true;
+	}
+
 	switch (func_id) {
 	case BPF_FUNC_tail_call:
 		err = check_reference_leak(env, false);
@@ -17691,6 +17709,9 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			continue;
 		}
 
+		if (env->insn_aux_data[i + delta].skip_patch_call_imm)
+			continue;
+
 		if (insn->imm == BPF_FUNC_get_route_realm)
 			prog->dst_needed = 1;
 		if (insn->imm == BPF_FUNC_get_prandom_u32)
@@ -18177,6 +18198,94 @@ static bool is_bpf_throw_call(struct bpf_insn *insn)
 	       insn->off == 0 && insn->imm == special_kfunc_list[KF_bpf_throw];
 }
 
+static struct bpf_prog *rewrite_bpf_call(struct bpf_verifier_env *env,
+					 int position,
+					 s32 stack_base,
+					 struct bpf_throw_state *tstate,
+					 u32 *cnt)
+{
+	s32 r0_offset = stack_base + 0 * BPF_REG_SIZE;
+	struct bpf_insn_aux_data *aux_data;
+	struct bpf_insn insn_buf[] = {
+		env->prog->insnsi[position],
+		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, r0_offset),
+		BPF_EMIT_CALL(bpf_get_exception),
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, r0_offset),
+		BPF_JMP32_IMM(BPF_JNE, BPF_REG_0, -EJUKEBOX, 3),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	};
+	struct bpf_prog *new_prog;
+	int type, tsubprog = -1;
+	u32 callback_start;
+	u32 call_insn_offset;
+	s32 callback_offset;
+	bool ret_code;
+
+	type = tstate->type;
+	ret_code = tstate->check_helper_ret_code;
+	if (type == BPF_THROW_OUTER)
+		insn_buf[4] = insn_buf[9] = BPF_EMIT_CALL(bpf_reset_exception);
+	if (type == BPF_THROW_INNER)
+		insn_buf[9] = BPF_EMIT_CALL(bpf_throw);
+
+	/* We need to fix the offset of the pseudo call after patching.
+	 * Note: The actual call instruction is at insn_buf[0].
+	 */
+	if (bpf_pseudo_call(&insn_buf[0])) {
+		tsubprog = find_subprog(env, position + insn_buf[0].imm + 1);
+		if (WARN_ON_ONCE(tsubprog < 0))
+			return NULL;
+	}
+	/* For helpers, the code path between checking bpf_get_exception and
+	 * returning may involve invocation of tracing progs which reset
+	 * exception state, so also use the return value to invoke the
+	 * exception path. Otherwise, the exception event from the callback
+	 * would be lost.
+	 */
+	if (ret_code)
+		*cnt = ARRAY_SIZE(insn_buf);
+	else
+		*cnt = ARRAY_SIZE(insn_buf) - 4;
+	new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt);
+	if (!new_prog)
+		return new_prog;
+
+	/* Note: The actual call instruction is at insn_buf[0]. */
+	if (bpf_pseudo_call(&insn_buf[0])) {
+		callback_start = env->subprog_info[tsubprog].start;
+		call_insn_offset = position + 0;
+		callback_offset = callback_start - call_insn_offset - 1;
+		new_prog->insnsi[call_insn_offset].imm = callback_offset;
+	}
+
+	aux_data = env->insn_aux_data;
+	/* Note: We already patched in the call at insn_buf[2], insn_buf[9]. */
+	aux_data[position + 2].skip_patch_call_imm = true;
+	if (ret_code)
+		aux_data[position + 9].skip_patch_call_imm = true;
+	/* Note: For BPF_THROW_OUTER, we already patched in the call at insn_buf[4]. */
+	if (type == BPF_THROW_OUTER)
+		aux_data[position + 4].skip_patch_call_imm = true;
+	return new_prog;
+}
+
+static bool is_throwing_bpf_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+				 struct bpf_insn_aux_data *insn_aux)
+{
+	if (insn->code != (BPF_JMP | BPF_CALL))
+		return false;
+	if (insn->src_reg == BPF_PSEUDO_CALL ||
+	    insn->src_reg == BPF_PSEUDO_KFUNC_CALL ||
+	    insn->src_reg == 0)
+		return insn_aux->throw_state.type != BPF_THROW_NONE;
+	return false;
+}
+
 /* For all sub-programs in the program (including main) check
  * insn_aux_data to see if there are any instructions that need to be
  * transformed into an instruction sequence. E.g. bpf_loop calls that
@@ -18228,6 +18337,26 @@ static int do_misc_rewrites(struct bpf_verifier_env *env)
 			new_prog = rewrite_bpf_throw_call(env, i + delta, throw_state, &cnt);
 			if (!new_prog)
 				return -ENOMEM;
+		} else if (is_throwing_bpf_call(env, insn, insn_aux)) {
+			struct bpf_throw_state *throw_state = &insn_aux->throw_state;
+
+			stack_depth_extra = max_t(u16, stack_depth_extra,
+						  BPF_REG_SIZE * 1 + stack_depth_roundup);
+
+			/* The verifier was able to prove that the throwing call
+			 * was unreachable, hence it must not have been seen and
+			 * will be removed by opt_remove_dead_code.
+			 */
+			if (throw_state->type == BPF_THROW_NONE) {
+				WARN_ON_ONCE(insn_aux->seen);
+				goto skip;
+			}
+
+			new_prog = rewrite_bpf_call(env, i + delta,
+						    -(stack_depth + stack_depth_extra),
+						    throw_state, &cnt);
+			if (!new_prog)
+				return -ENOMEM;
 		}
 
 skip:
-- 
2.40.0
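
Appendix (review aid only, not generated code): a rough pseudo-C
rendering of the insn_buf sequence emitted by rewrite_bpf_call() around
a throwing helper/kfunc call, assuming check_helper_ret_code is set.
The labels (check_ret, cont) and the (u32) casts are purely
illustrative; bracketed numbers refer to insn_buf indices above.

  R0 = call(...);                      /* [0]  original helper/kfunc/pseudo call */
  *(u64 *)(R10 + r0_offset) = R0;      /* [1]  spill the call's return value */
  R0 = bpf_get_exception();            /* [2]  read exception state */
  if (R0 == 0)                         /* [3]  no exception observed */
          goto check_ret;
  R0 = 0;                              /* [4][5]  for BPF_THROW_OUTER, [4] is bpf_reset_exception() */
  return R0;                           /* [6]  unwind this frame */
check_ret:
  R0 = *(u64 *)(R10 + r0_offset);      /* [7]  restore the call's return value */
  if ((u32)R0 != (u32)-EJUKEBOX)       /* [8]  recheck via return code in case state was clobbered */
          goto cont;
  R0 = 0;                              /* [9][10]  bpf_reset_exception() for OUTER, bpf_throw() for INNER */
  return R0;                           /* [11] */
cont:
  /* normal continuation of the program */

When check_helper_ret_code is not set, only the first eight
instructions ([0]-[7]) are patched in, so the -EJUKEBOX recheck is
dropped.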