A new bpf_type_flag MEM_RCU is added to indicate a PTR_TO_BTF_ID object access needing rcu_read_lock protection. The rcu protection is not needed for non-sleepable programs. So the various verification checks are only done for sleepable programs. In particular, only the following insns can be inside bpf_rcu_read_lock() region: - any non-call insns except BPF_ABS/BPF_IND - non-sleepable helpers and kfuncs. Also, bpf_*_storage_get() helper's 5th hidden argument (for memory allocation flag) should be GFP_ATOMIC. If a pointer (PTR_TO_BTF_ID) is marked as rcu, then any use of this pointer and the load which gets this pointer needs to be protected by bpf_rcu_read_lock(). The following shows a couple of examples: struct task_struct { ... struct task_struct __rcu *real_parent; struct css_set __rcu *cgroups; ... }; struct css_set { ... struct cgroup *dfl_cgrp; ... } ... task = bpf_get_current_task_btf(); cgroups = task->cgroups; dfl_cgroup = cgroups->dfl_cgrp; ... using dfl_cgroup ... The bpf_rcu_read_lock/unlock() should be added like below to avoid verification failures. task = bpf_get_current_task_btf(); bpf_rcu_read_lock(); cgroups = task->cgroups; dfl_cgroup = cgroups->dfl_cgrp; bpf_rcu_read_unlock(); ... using dfl_cgroup ... The following is another example for task->real_parent. task = bpf_get_current_task_btf(); bpf_rcu_read_lock(); real_parent = task->real_parent; ... bpf_task_storage_get(&map, real_parent, 0, 0); bpf_rcu_read_unlock(); There is another case observed in selftest bpf_iter_ipv6_route.c: struct fib6_info *rt = ctx->rt; ... fib6_nh = &rt->fib6_nh[0]; // Not rcu protected ... if (rt->nh) fib6_nh = &nh->nh_info->fib6_nh; // rcu protected ... ... using fib6_nh ... Currently verification will fail with "same insn cannot be used with different pointers" since the use of fib6_nh is tagged with rcu in one path but not in the other path. The above use case is a valid one, so the verifier is changed to ignore the MEM_RCU type tag in such cases. 
Signed-off-by: Yonghong Song <yhs@xxxxxx> --- include/linux/bpf.h | 3 + include/linux/bpf_verifier.h | 1 + kernel/bpf/btf.c | 11 +++ kernel/bpf/verifier.c | 126 ++++++++++++++++++++++++++++++++--- 4 files changed, 133 insertions(+), 8 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a9bda4c91fc7..f0d973c8d227 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -458,6 +458,9 @@ enum bpf_type_flag { /* Size is known at compile time. */ MEM_FIXED_SIZE = BIT(10 + BPF_BASE_TYPE_BITS), + /* MEM is tagged with rcu and memory access needs rcu_read_lock protection. */ + MEM_RCU = BIT(11 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1a32baa78ce2..d4e56f5a4b20 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -324,6 +324,7 @@ struct bpf_verifier_state { u32 insn_idx; u32 curframe; u32 active_spin_lock; + u32 active_rcu_lock; bool speculative; /* first and last insn idx of this verifier state */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 35c07afac924..293d224a7f53 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5527,6 +5527,8 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, info->reg_type |= MEM_USER; if (strcmp(tag_value, "percpu") == 0) info->reg_type |= MEM_PERCPU; + if (strcmp(tag_value, "rcu") == 0) + info->reg_type |= MEM_RCU; } /* skip modifiers */ @@ -5765,6 +5767,9 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, /* check __percpu tag */ if (strcmp(tag_value, "percpu") == 0) tmp_flag = MEM_PERCPU; + /* check __rcu tag */ + if (strcmp(tag_value, "rcu") == 0) + tmp_flag = MEM_RCU; } stype = btf_type_skip_modifiers(btf, mtype->type, &id); @@ -6560,6 +6565,12 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } + if (sleepable && env->cur_state->active_rcu_lock) { + 
bpf_log(log, "kernel function %s is sleepable within rcu_read_lock region\n", + func_name); + return -EINVAL; + } + if (kfunc_meta && ref_obj_id) kfunc_meta->ref_obj_id = ref_obj_id; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 82c07fe0bfb1..3c5afd3bc216 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -188,6 +188,9 @@ struct bpf_verifier_stack_elem { POISON_POINTER_DELTA)) #define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV)) +/* Using insn->off = BPF_STORAGE_GET_CALL to mark bpf_*_storage_get() helper calls. */ +#define BPF_STORAGE_GET_CALL 1 + static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx); static int release_reference(struct bpf_verifier_env *env, int ref_obj_id); @@ -511,6 +514,22 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_dynptr_data; } +static bool is_storage_get_function(enum bpf_func_id func_id) +{ + return func_id == BPF_FUNC_sk_storage_get || + func_id == BPF_FUNC_inode_storage_get || + func_id == BPF_FUNC_task_storage_get || + func_id == BPF_FUNC_cgrp_storage_get; +} + +static bool is_sleepable_function(enum bpf_func_id func_id) +{ + return func_id == BPF_FUNC_copy_from_user || + func_id == BPF_FUNC_copy_from_user_task || + func_id == BPF_FUNC_ima_inode_hash || + func_id == BPF_FUNC_ima_file_hash; +} + static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id, const struct bpf_map *map) { @@ -583,6 +602,8 @@ static const char *reg_type_str(struct bpf_verifier_env *env, strncpy(prefix, "percpu_", 32); if (type & PTR_UNTRUSTED) strncpy(prefix, "untrusted_", 32); + if (type & MEM_RCU) + strncpy(prefix, "rcu_", 32); snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", prefix, str[base_type(type)], postfix); @@ -1201,6 +1222,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->speculative = src->speculative; dst_state->curframe = src->curframe; dst_state->active_spin_lock = 
src->active_spin_lock; + dst_state->active_rcu_lock = src->active_rcu_lock; dst_state->branches = src->branches; dst_state->parent = src->parent; dst_state->first_insn_idx = src->first_insn_idx; @@ -4536,6 +4558,14 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, return -EACCES; } + if ((reg->type & MEM_RCU) && env->prog->aux->sleepable && + !env->cur_state->active_rcu_lock) { + verbose(env, + "R%d is ptr_%s access rcu-protected memory with off=%d, not in rcu_read_lock region\n", + regno, tname, off); + return -EACCES; + } + if (env->ops->btf_struct_access) { ret = env->ops->btf_struct_access(&env->log, reg->btf, t, off, size, atype, &btf_id, &flag); @@ -4552,6 +4582,14 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (ret < 0) return ret; + if ((flag & MEM_RCU) && env->prog->aux->sleepable && + !env->cur_state->active_rcu_lock) { + verbose(env, + "R%d is rcu dereference ptr_%s with off=%d, not in rcu_read_lock region\n", + regno, tname, off); + return -EACCES; + } + /* If this is an untrusted pointer, all pointers formed by walking it * also inherit the untrusted flag. 
*/ @@ -5684,7 +5722,12 @@ static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } }; static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } }; static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM | MEM_ALLOC } }; static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; -static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; +static const struct bpf_reg_types btf_ptr_types = { + .types = { + PTR_TO_BTF_ID, + PTR_TO_BTF_ID | MEM_RCU, + } +}; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_BTF_ID | MEM_PERCPU } }; static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; @@ -5758,6 +5801,20 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, if (arg_type & PTR_MAYBE_NULL) type &= ~PTR_MAYBE_NULL; + /* If the reg type is marked as MEM_RCU, ensure the usage is in the rcu_read_lock + * region, and remove MEM_RCU from the type since the arg_type won't encode + * MEM_RCU. + */ + if (type & MEM_RCU) { + if (env->prog->aux->sleepable && !env->cur_state->active_rcu_lock) { + verbose(env, + "R%d is arg type %s needs rcu protection\n", + regno, reg_type_str(env, reg->type)); + return -EACCES; + } + type &= ~MEM_RCU; + } + for (i = 0; i < ARRAY_SIZE(compatible->types); i++) { expected = compatible->types[i]; if (expected == NOT_INIT) @@ -5774,7 +5831,8 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, return -EACCES; found: - if (reg->type == PTR_TO_BTF_ID) { + /* reg is already protected by rcu_read_lock(). Peel off MEM_RCU from reg->type. */ + if ((reg->type & ~MEM_RCU) == PTR_TO_BTF_ID) { /* For bpf_sk_release, it needs to match against first member * 'struct sock_common', hence make an exception for it. This * allows bpf_sk_release to work for multiple socket types. 
@@ -5850,6 +5908,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, * fixed offset. */ case PTR_TO_BTF_ID: + case PTR_TO_BTF_ID | MEM_RCU: /* When referenced PTR_TO_BTF_ID is passed to release function, * it's fixed offset must be 0. In the other cases, fixed offset * can be non-zero. @@ -7289,6 +7348,26 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } meta.func_id = func_id; + + if (func_id == BPF_FUNC_rcu_read_lock) + env->cur_state->active_rcu_lock++; + if (func_id == BPF_FUNC_rcu_read_unlock) { + if (env->cur_state->active_rcu_lock == 0) { + verbose(env, "missing bpf_rcu_read_lock\n"); + return -EINVAL; + } + + env->cur_state->active_rcu_lock--; + } + if (env->cur_state->active_rcu_lock) { + if (is_sleepable_function(func_id)) + verbose(env, "sleepable helper %s#%din rcu_read_lock region\n", + func_id_name(func_id), func_id); + + if (env->prog->aux->sleepable && is_storage_get_function(func_id)) + insn->off = BPF_STORAGE_GET_CALL; + } + /* check args */ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { err = check_func_arg(env, i, &meta, fn); @@ -10470,6 +10549,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if (env->prog->aux->sleepable && env->cur_state->active_rcu_lock) { + verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_rcu_read_lock-ed region\n"); + return -EINVAL; + } + if (regs[ctx_reg].type != PTR_TO_CTX) { verbose(env, "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); @@ -11734,6 +11818,9 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->active_spin_lock != cur->active_spin_lock) return false; + if (old->active_rcu_lock != cur->active_rcu_lock) + return false; + /* for states to be equal callsites have to be the same * and all frame states need to be equivalent */ @@ -12141,6 +12228,11 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) !reg_type_mismatch_ok(prev)); } +static bool 
reg_type_mismatch_ignore_rcu(enum bpf_reg_type src, enum bpf_reg_type prev) +{ + return reg_type_mismatch(src & ~MEM_RCU, prev & ~MEM_RCU); +} + static int do_check(struct bpf_verifier_env *env) { bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); @@ -12266,6 +12358,18 @@ static int do_check(struct bpf_verifier_env *env) prev_src_type = &env->insn_aux_data[env->insn_idx].ptr_type; + /* For NOT_INIT *prev_src_type, ignore rcu type tag. + * For code like below, + * struct foo *f; + * if (...) + * f = ...; // f with MEM_RCU type tag. + * else + * f = ...; // f without MEM_RCU type tag. + * ... f ... // Here f could be with/without MEM_RCU + * + * It is safe to ignore MEM_RCU type tag here since + * base types are the same. + */ if (*prev_src_type == NOT_INIT) { /* saw a valid insn * dst_reg = *(u32 *)(src_reg + off) @@ -12273,7 +12377,7 @@ static int do_check(struct bpf_verifier_env *env) */ *prev_src_type = src_reg_type; - } else if (reg_type_mismatch(src_reg_type, *prev_src_type)) { + } else if (reg_type_mismatch_ignore_rcu(src_reg_type, *prev_src_type)) { /* ABuser program is trying to use the same insn * dst_reg = *(u32*) (src_reg + off) * with different pointer types: @@ -12412,6 +12516,11 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } + if (env->cur_state->active_rcu_lock) { + verbose(env, "bpf_rcu_read_unlock is missing\n"); + return -EINVAL; + } + /* We must do check_reference_leak here before * prepare_func_exit to handle the case when * state->curframe > 0, it may be a callback @@ -13499,6 +13608,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) convert_ctx_access = bpf_xdp_sock_convert_ctx_access; break; case PTR_TO_BTF_ID: + case PTR_TO_BTF_ID | MEM_RCU: case PTR_TO_BTF_ID | PTR_UNTRUSTED: if (type == BPF_READ) { insn->code = BPF_LDX | BPF_PROBE_MEM | @@ -14148,11 +14258,11 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto patch_call_imm; } - if (insn->imm == BPF_FUNC_task_storage_get || - insn->imm == 
BPF_FUNC_sk_storage_get || - insn->imm == BPF_FUNC_inode_storage_get || - insn->imm == BPF_FUNC_cgrp_storage_get) { - if (env->prog->aux->sleepable) + if (is_storage_get_function(insn->imm)) { + if (env->prog->aux->sleepable && insn->off) { + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC); + insn->off = 0; + } else if (env->prog->aux->sleepable) insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL); else insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC); -- 2.30.2