From: Alexei Starovoitov <ast@xxxxxxxxxx> The life time of certain kernel structures like 'struct cgroup' is protected by RCU. Hence it's safe to dereference them directly from kptr-s in bpf maps. The resulting pointer is PTR_TRUSTED and can be passed to kfuncs that expect KF_TRUSTED_ARGS. Derefrence of other kptr-s returns PTR_UNTRUSTED. For example: struct map_value { struct cgroup __kptr_rcu *cgrp; }; SEC("tp_btf/cgroup_mkdir") int BPF_PROG(test_cgrp_get_ancestors, struct cgroup *cgrp_arg, const char *path) { struct cgroup *cg, *cg2; cg = bpf_cgroup_acquire(cgrp_arg); // cg is PTR_TRUSTED and ref_obj_id > 0 bpf_kptr_xchg(&v->cgrp, cg); cg2 = v->cgrp; // cg2 is PTR_TRUSTED | MEM_RCU. This is new feature introduced by this patch. bpf_cgroup_ancestor(cg2, level); // safe to do. cg2 will not disappear } Signed-off-by: Alexei Starovoitov <ast@xxxxxxxxxx> --- include/linux/bpf.h | 15 ++++++++++----- include/linux/bpf_verifier.h | 2 +- kernel/bpf/btf.c | 22 +++++++++++++++++++++- kernel/bpf/syscall.c | 4 ++++ kernel/bpf/verifier.c | 33 ++++++++++++++++++++++----------- tools/lib/bpf/bpf_helpers.h | 1 + 6 files changed, 59 insertions(+), 18 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index be34f7deb6c3..c9cc11653b83 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -178,11 +178,12 @@ enum btf_field_type { BPF_TIMER = (1 << 1), BPF_KPTR_UNREF = (1 << 2), BPF_KPTR_REF = (1 << 3), - BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF, - BPF_LIST_HEAD = (1 << 4), - BPF_LIST_NODE = (1 << 5), - BPF_RB_ROOT = (1 << 6), - BPF_RB_NODE = (1 << 7), + BPF_KPTR_RCU = (1 << 4), + BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF | BPF_KPTR_RCU, + BPF_LIST_HEAD = (1 << 5), + BPF_LIST_NODE = (1 << 6), + BPF_RB_ROOT = (1 << 7), + BPF_RB_NODE = (1 << 8), BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD | BPF_RB_NODE | BPF_RB_ROOT, }; @@ -284,6 +285,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) case BPF_KPTR_UNREF: case BPF_KPTR_REF: return "kptr"; + case BPF_KPTR_RCU: + return "kptr_rcu"; case BPF_LIST_HEAD: return "bpf_list_head"; case BPF_LIST_NODE: @@ -307,6 +310,7 @@ static inline u32 btf_field_type_size(enum btf_field_type type) return sizeof(struct bpf_timer); case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_RCU: return sizeof(u64); case BPF_LIST_HEAD: return sizeof(struct bpf_list_head); @@ -331,6 +335,7 @@ static inline u32 btf_field_type_align(enum btf_field_type type) return __alignof__(struct bpf_timer); case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_RCU: return __alignof__(u64); case BPF_LIST_HEAD: return __alignof__(struct bpf_list_head); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index cf1bb1cf4a7b..e7c01294b3d3 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -682,7 +682,7 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) } } -#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | PTR_TRUSTED) +#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | PTR_TRUSTED | MEM_RCU) static inline bool bpf_type_has_unsafe_modifiers(u32 type) { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 1bba71c6176a..9b0342220a2d 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3287,6 +3287,8 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, type = BPF_KPTR_UNREF; else if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off))) type = BPF_KPTR_REF; + else if (!strcmp("kptr_rcu", __btf_name_by_offset(btf, t->name_off))) + type = BPF_KPTR_RCU; else return -EINVAL; @@ -3449,6 +3451,7 @@ static int btf_find_struct_field(const struct btf *btf, break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_RCU: ret = btf_find_kptr(btf, member_type, off, sz, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) @@ -3514,6 +3517,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_RCU: ret = btf_find_kptr(btf, var_type, off, sz, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) @@ -3552,6 +3556,18 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t, return -EINVAL; } +BTF_SET_START(rcu_protected_types) +BTF_ID(struct, prog_test_ref_kfunc) +BTF_ID(struct, cgroup) +BTF_SET_END(rcu_protected_types) + +static bool rcu_protected_object(const struct btf *btf, u32 btf_id) +{ + if (!btf_is_kernel(btf)) + return false; + return btf_id_set_contains(&rcu_protected_types, btf_id); +} + static int btf_parse_kptr(const struct btf *btf, struct btf_field *field, struct btf_field_info *info) { @@ -3570,10 +3586,13 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field, if (id < 0) return id; + if (info->type == BPF_KPTR_RCU && !rcu_protected_object(kernel_btf, id)) + return -EINVAL; + /* Find and stash the function pointer for the destruction function that * needs to be eventually invoked from the map free path. */ - if (info->type == BPF_KPTR_REF) { + if (info->type == BPF_KPTR_REF || info->type == BPF_KPTR_RCU) { const struct btf_type *dtor_func; const char *dtor_func_name; unsigned long addr; @@ -3737,6 +3756,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_RCU: ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]); if (ret < 0) goto end; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e3fcdc9836a6..2e730918911c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -539,6 +539,7 @@ void btf_record_free(struct btf_record *rec) switch (rec->fields[i].type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_RCU: if (rec->fields[i].kptr.module) module_put(rec->fields[i].kptr.module); btf_put(rec->fields[i].kptr.btf); @@ -584,6 +585,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) switch (fields[i].type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_RCU: btf_get(fields[i].kptr.btf); if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) { ret = -ENXIO; @@ -669,6 +671,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) WRITE_ONCE(*(u64 *)field_ptr, 0); break; case BPF_KPTR_REF: + case BPF_KPTR_RCU: field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0)); break; case BPF_LIST_HEAD: @@ -1058,6 +1061,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_RCU: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 21e08c111702..b23d6e3a9699 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4159,7 +4159,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno) { const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id); - int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED; + int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU; const char *reg_name = ""; /* Only unreferenced case accepts untrusted pointers */ @@ -4206,12 +4206,12 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, * In the kptr_ref case, check_func_arg_reg_off already ensures reg->off * is zero. We must also ensure that btf_struct_ids_match does not walk * the struct to match type against first member of struct, i.e. reject - * second case from above. Hence, when type is BPF_KPTR_REF, we set + * second case from above. Hence, when type is BPF_KPTR_REF | BPF_KPTR_RCU, we set * strict mode to true for type match. */ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, kptr_field->kptr.btf, kptr_field->kptr.btf_id, - kptr_field->type == BPF_KPTR_REF)) + kptr_field->type == BPF_KPTR_REF || kptr_field->type == BPF_KPTR_RCU)) goto bad_type; return 0; bad_type: @@ -4226,6 +4226,14 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, return -EINVAL; } +/* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock() + * can dereference RCU protected pointers and result is PTR_TRUSTED. + */ +static bool in_rcu_cs(struct bpf_verifier_env *env) +{ + return env->cur_state->active_rcu_lock || !env->prog->aux->sleepable; +} + static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, int value_regno, int insn_idx, struct btf_field *kptr_field) @@ -4249,7 +4257,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, /* We only allow loading referenced kptr, since it will be marked as * untrusted, similar to unreferenced kptr. */ - if (class != BPF_LDX && kptr_field->type == BPF_KPTR_REF) { + if (class != BPF_LDX && kptr_field->type != BPF_KPTR_UNREF) { verbose(env, "store to referenced kptr disallowed\n"); return -EACCES; } @@ -4260,7 +4268,10 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, * value from map as PTR_TO_BTF_ID, with the correct type. */ mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf, - kptr_field->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED); + kptr_field->kptr.btf_id, + kptr_field->type == BPF_KPTR_RCU && in_rcu_cs(env) ? + PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU : + PTR_MAYBE_NULL | PTR_UNTRUSTED); /* For mark_ptr_or_null_reg */ val_reg->id = ++env->id_gen; } else if (class == BPF_STX) { @@ -4314,6 +4325,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, switch (field->type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_RCU: if (src != ACCESS_DIRECT) { verbose(env, "kptr cannot be accessed indirectly by helper\n"); return -EACCES; @@ -5110,11 +5122,10 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, * read lock region. Also mark rcu pointer as PTR_MAYBE_NULL since * it could be null in some cases. */ - if (!env->cur_state->active_rcu_lock || - !(is_trusted_reg(reg) || is_rcu_reg(reg))) - flag &= ~MEM_RCU; - else + if (in_rcu_cs(env) && (is_trusted_reg(reg) || is_rcu_reg(reg))) flag |= PTR_MAYBE_NULL; + else + flag &= ~MEM_RCU; } else if (reg->type & MEM_RCU) { /* ptr (reg) is marked as MEM_RCU, but the struct field is not tagged * with __rcu. Mark the flag as PTR_UNTRUSTED conservatively. @@ -6157,7 +6168,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, verbose(env, "off=%d doesn't point to kptr\n", kptr_off); return -EACCES; } - if (kptr_field->type != BPF_KPTR_REF) { + if (kptr_field->type != BPF_KPTR_REF && kptr_field->type != BPF_KPTR_RCU) { verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off); return -EACCES; } @@ -9069,7 +9080,7 @@ static int process_kf_arg_ptr_to_kptr(struct bpf_verifier_env *env, } kptr_field = btf_record_find(reg->map_ptr->record, reg->off + reg->var_off.value, BPF_KPTR); - if (!kptr_field || kptr_field->type != BPF_KPTR_REF) { + if (!kptr_field || (kptr_field->type != BPF_KPTR_REF && kptr_field->type != BPF_KPTR_RCU)) { verbose(env, "arg#0 no referenced kptr at map value offset=%llu\n", reg->off + reg->var_off.value); return -EINVAL; diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index 7d12d3e620cc..affc0997f937 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -176,6 +176,7 @@ enum libbpf_tristate { #define __ksym __attribute__((section(".ksyms"))) #define __kptr_untrusted __attribute__((btf_type_tag("kptr_untrusted"))) #define __kptr __attribute__((btf_type_tag("kptr"))) +#define __kptr_rcu __attribute__((btf_type_tag("kptr_rcu"))) #ifndef ___bpf_concat #define ___bpf_concat(a, b) a ## b -- 2.30.2