On Tue, Nov 15, 2022 at 12:45:38AM +0530, Kumar Kartikeya Dwivedi wrote:
> Introduce type safe memory allocator bpf_obj_new for BPF programs. The
> kernel side kfunc is named bpf_obj_new_impl, as passing hidden arguments
> to kfuncs still requires having them in prototype, unlike BPF helpers
> which always take 5 arguments and have them checked using bpf_func_proto
> in verifier, ignoring unset argument types.
>
> Introduce __ign suffix to ignore a specific kfunc argument during type
> checks, then use this to introduce support for passing type metadata to
> the bpf_obj_new_impl kfunc.
>
> The user passes BTF ID of the type it wants to allocate in program BTF,
> the verifier then rewrites the first argument as the size of this type,
> after performing some sanity checks (to ensure it exists and it is a
> struct type).
>
> The second argument is also fixed up and passed by the verifier. This is
> the btf_struct_meta for the type being allocated. It would be needed
> mostly for the offset array which is required for zero initializing
> special fields while leaving the rest of storage in uninitialized state.
>
> It would also be needed in the next patch to perform proper destruction
> of the object's special fields.
>
> Under the hood, bpf_obj_new will call bpf_mem_alloc and bpf_mem_free,
> using the any context BPF memory allocator introduced recently. To this
> end, a global instance of the BPF memory allocator is initialized on
> boot to be used for this purpose. This 'bpf_global_ma' serves all
> allocations for bpf_obj_new. In the future, bpf_obj_new variants will
> allow specifying a custom allocator.
>
> Note that now that bpf_obj_new can be used to allocate objects that can
> be linked to BPF linked list (when future linked list helpers are
> available), we need to also free the elements using bpf_mem_free.
> However, since the draining of elements is done outside the
> bpf_spin_lock, we need to do migrate_disable around the call since
> bpf_list_head_free can be called from map free path where migration is
> enabled. Otherwise, when called from BPF programs migration is already
> disabled.
>
> A convenience macro is included in the bpf_experimental.h header to hide
> the ugly details of the implementation, leading to user code looking
> similar to a language level extension which allocates and constructs
> fields of a user type.
>
>   struct bar {
>       struct bpf_list_node node;
>   };
>
>   struct foo {
>       struct bpf_spin_lock lock;
>       struct bpf_list_head head __contains(bar, node);
>   };
>
>   void prog(void) {
>       struct foo *f;
>
>       f = bpf_obj_new(typeof(*f));
>       if (!f)
>           return;
>       ...
>   }
>
> A key piece of this story is still missing, i.e. the free function,
> which will come in the next patch.
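
To make the hidden-argument flow concrete (my reading of the patch, not part
of it): the bpf_experimental.h convenience macro added at the end of this
patch turns

    f = bpf_obj_new(typeof(*f));

into roughly

    f = (typeof(*f) *)bpf_obj_new_impl(bpf_core_type_id_local(typeof(*f)), NULL);

where the first argument is a known constant (existing __k suffix) and the
second is skipped by argument checking (the new __ign suffix). The verifier
records the struct's size and btf_struct_meta in insn_aux_data, and
fixup_kfunc_call() later patches the call site along the lines of

    r1 = sizeof(struct foo)          /* obj_new_size from insn_aux_data */
    r2 = &<btf_struct_meta of foo>   /* BPF_LD_IMM64 of kptr_struct_meta */
    call bpf_obj_new_impl

so the kernel-side kfunc receives the real size and metadata even though the
program never passed them itself.
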
>
> Signed-off-by: Kumar Kartikeya Dwivedi <memxor@xxxxxxxxx>
> ---
>  include/linux/bpf.h                           |  21 ++--
>  include/linux/bpf_verifier.h                  |   2 +
>  kernel/bpf/core.c                             |  16 +++
>  kernel/bpf/helpers.c                          |  47 ++++++--
>  kernel/bpf/verifier.c                         | 107 ++++++++++++++++--
>  .../testing/selftests/bpf/bpf_experimental.h  |  25 ++++
>  6 files changed, 195 insertions(+), 23 deletions(-)
>  create mode 100644 tools/testing/selftests/bpf/bpf_experimental.h
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 62a16b699e71..4635e31bd6fc 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -54,6 +54,8 @@ struct cgroup;
>  extern struct idr btf_idr;
>  extern spinlock_t btf_idr_lock;
>  extern struct kobject *btf_kobj;
> +extern struct bpf_mem_alloc bpf_global_ma;
> +extern bool bpf_global_ma_set;
>
>  typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64);
>  typedef int (*bpf_iter_init_seq_priv_t)(void *private_data,
> @@ -333,16 +335,19 @@ static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_f
>  	return rec->field_mask & type;
>  }
>
> -static inline void check_and_init_map_value(struct bpf_map *map, void *dst)
> +static inline void bpf_obj_init(const struct btf_field_offs *foffs, void *obj)
>  {
> -	if (!IS_ERR_OR_NULL(map->record)) {
> -		struct btf_field *fields = map->record->fields;
> -		u32 cnt = map->record->cnt;
> -		int i;
> +	int i;
>
> -		for (i = 0; i < cnt; i++)
> -			memset(dst + fields[i].offset, 0, btf_field_type_size(fields[i].type));
> -	}
> +	if (!foffs)
> +		return;
> +	for (i = 0; i < foffs->cnt; i++)
> +		memset(obj + foffs->field_off[i], 0, foffs->field_sz[i]);
> +}
> +
> +static inline void check_and_init_map_value(struct bpf_map *map, void *dst)
> +{
> +	bpf_obj_init(map->field_offs, dst);
>  }
>
>  /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and
> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> index 887fa4d922f6..306fc1d6cc4a 100644
> --- a/include/linux/bpf_verifier.h
> +++ b/include/linux/bpf_verifier.h
> @@ -427,6 +427,8 @@ struct bpf_insn_aux_data {
>  		 */
>  		struct bpf_loop_inline_state loop_inline_state;
>  	};
> +	u64 obj_new_size; /* remember the size of type passed to bpf_obj_new to rewrite R1 */
> +	struct btf_struct_meta *kptr_struct_meta;
>  	u64 map_key_state; /* constant (32 bit) key tracking for maps */
>  	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
>  	u32 seen; /* this insn was processed by the verifier at env->pass_cnt */
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 9c16338bcbe8..2e57fc839a5c 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -34,6 +34,7 @@
>  #include <linux/log2.h>
>  #include <linux/bpf_verifier.h>
>  #include <linux/nodemask.h>
> +#include <linux/bpf_mem_alloc.h>
>
>  #include <asm/barrier.h>
>  #include <asm/unaligned.h>
> @@ -60,6 +61,9 @@
>  #define CTX	regs[BPF_REG_CTX]
>  #define IMM	insn->imm
>
> +struct bpf_mem_alloc bpf_global_ma;
> +bool bpf_global_ma_set;
> +
>  /* No hurry in this branch
>   *
>   * Exported for the bpf jit load helper.
> @@ -2746,6 +2750,18 @@ int __weak bpf_arch_text_invalidate(void *dst, size_t len)
>  	return -ENOTSUPP;
>  }
>
> +#ifdef CONFIG_BPF_SYSCALL
> +static int __init bpf_global_ma_init(void)
> +{
> +	int ret;
> +
> +	ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
> +	bpf_global_ma_set = !ret;
> +	return ret;
> +}
> +late_initcall(bpf_global_ma_init);
> +#endif
> +
>  DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
>  EXPORT_SYMBOL(bpf_stats_enabled_key);
>
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index 5bc0b9f0f306..c4f1c22cc44c 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -19,6 +19,7 @@
>  #include <linux/proc_ns.h>
>  #include <linux/security.h>
>  #include <linux/btf_ids.h>
> +#include <linux/bpf_mem_alloc.h>
>
>  #include "../../lib/kstrtox.h"
>
> @@ -1735,25 +1736,57 @@ void bpf_list_head_free(const struct btf_field *field, void *list_head,
>
>  		obj -= field->list_head.node_offset;
>  		head = head->next;
> -		/* TODO: Rework later */
> -		kfree(obj);
> +		/* The contained type can also have resources, including a
> +		 * bpf_list_head which needs to be freed.
> +		 */
> +		bpf_obj_free_fields(field->list_head.value_rec, obj);
> +		/* bpf_mem_free requires migrate_disable(), since we can be
> +		 * called from map free path as well apart from BPF program (as
> +		 * part of map ops doing bpf_obj_free_fields).
> +		 */
> +		migrate_disable();
> +		bpf_mem_free(&bpf_global_ma, obj);
> +		migrate_enable();
>  	}
>  }
>
> -BTF_SET8_START(tracing_btf_ids)
> +__diag_push();
> +__diag_ignore_all("-Wmissing-prototypes",
> +		  "Global functions as their definitions will be in vmlinux BTF");
> +
> +void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
> +{
> +	struct btf_struct_meta *meta = meta__ign;
> +	u64 size = local_type_id__k;
> +	void *p;
> +
> +	if (unlikely(!bpf_global_ma_set))
> +		return NULL;
> +	p = bpf_mem_alloc(&bpf_global_ma, size);
> +	if (!p)
> +		return NULL;
> +	if (meta)
> +		bpf_obj_init(meta->field_offs, p);
> +	return p;
> +}
> +
> +__diag_pop();
> +
> +BTF_SET8_START(generic_btf_ids)
>  #ifdef CONFIG_KEXEC_CORE
>  BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
>  #endif
> -BTF_SET8_END(tracing_btf_ids)
> +BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
> +BTF_SET8_END(generic_btf_ids)
>
> -static const struct btf_kfunc_id_set tracing_kfunc_set = {
> +static const struct btf_kfunc_id_set generic_kfunc_set = {
>  	.owner = THIS_MODULE,
> -	.set   = &tracing_btf_ids,
> +	.set   = &generic_btf_ids,
>  };
>
>  static int __init kfunc_init(void)
>  {
> -	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &tracing_kfunc_set);
> +	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set);
>  }
>
>  late_initcall(kfunc_init);
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index a4a1424b19a5..c7f5d83783db 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -7948,6 +7948,11 @@ static bool is_kfunc_arg_sfx_constant(const struct btf *btf, const struct btf_pa
>  	return __kfunc_param_match_suffix(btf, arg, "__k");
>  }
>
> +static bool is_kfunc_arg_sfx_ignore(const struct btf *btf, const struct btf_param *arg)
> +{
> +	return __kfunc_param_match_suffix(btf, arg, "__ign");
> +}
> +
>  static bool is_kfunc_arg_ret_buf_size(const struct btf *btf,
>  				      const struct btf_param *arg,
>  				      const struct bpf_reg_state *reg,
> @@ -8216,6 +8221,10 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
>  		int kf_arg_type;
>
>  		t = btf_type_skip_modifiers(btf, args[i].type, NULL);
> +
> +		if (is_kfunc_arg_sfx_ignore(btf, &args[i]))
> +			continue;
> +
>  		if (btf_type_is_scalar(t)) {
>  			if (reg->type != SCALAR_VALUE) {
>  				verbose(env, "R%d is not a scalar\n", regno);
> @@ -8395,6 +8404,17 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
>  	return 0;
>  }
>
> +enum special_kfunc_type {
> +	KF_bpf_obj_new_impl,
> +};
> +
> +BTF_SET_START(special_kfunc_set)
> +BTF_ID(func, bpf_obj_new_impl)
> +BTF_SET_END(special_kfunc_set)
> +
> +BTF_ID_LIST(special_kfunc_list)
> +BTF_ID(func, bpf_obj_new_impl)
> +
>  static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>  			    int *insn_idx_p)
>  {
> @@ -8469,17 +8489,64 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>  	t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL);
>
>  	if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) {
> -		verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
> -		return -EINVAL;
> +		/* Only exception is bpf_obj_new_impl */
> +		if (meta.btf != btf_vmlinux || meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl]) {
> +			verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
> +			return -EINVAL;
> +		}
>  	}
>
>  	if (btf_type_is_scalar(t)) {
>  		mark_reg_unknown(env, regs, BPF_REG_0);
>  		mark_btf_func_reg_size(env, BPF_REG_0, t->size);
>  	} else if (btf_type_is_ptr(t)) {
> -		ptr_type = btf_type_skip_modifiers(desc_btf, t->type,
> -						   &ptr_type_id);
> -		if (!btf_type_is_struct(ptr_type)) {
> +		ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id);
> +
> +		if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
> +			if (!btf_type_is_void(ptr_type)) {
> +				verbose(env, "kernel function %s must have void * return type\n",
> +					meta.func_name);
> +				return -EINVAL;
> +			}

Here you're checking that bpf_obj_new_impl (and later obj_drop) were actually
declared with a 'void *' return type, and spending verifier cycles to check
that? Please don't. Even if we miss it during code review, there is no safety
issue.
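In other words (untested sketch, reusing the names from this patch), the
branch could simply dispatch on the kfunc ID and drop the return-type check:

	if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
		if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl]) {
			/* existing bpf_obj_new handling below */
		} else {
			verbose(env, "kernel function %s unhandled dynamic return type\n",
				meta.func_name);
			return -EFAULT;
		}
	} else if (!__btf_type_is_struct(ptr_type)) {
		...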
> +			if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl]) {
> +				const struct btf_type *ret_t;
> +				struct btf *ret_btf;
> +				u32 ret_btf_id;
> +
> +				if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) {
> +					verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");
> +					return -EINVAL;
> +				}
> +
> +				ret_btf = env->prog->aux->btf;
> +				ret_btf_id = meta.arg_constant.value;
> +
> +				/* This may be NULL due to user not supplying a BTF */
> +				if (!ret_btf) {
> +					verbose(env, "bpf_obj_new requires prog BTF\n");
> +					return -EINVAL;
> +				}
> +
> +				ret_t = btf_type_by_id(ret_btf, ret_btf_id);
> +				if (!ret_t || !__btf_type_is_struct(ret_t)) {
> +					verbose(env, "bpf_obj_new type ID argument must be of a struct\n");
> +					return -EINVAL;
> +				}
> +
> +				mark_reg_known_zero(env, regs, BPF_REG_0);
> +				regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
> +				regs[BPF_REG_0].btf = ret_btf;
> +				regs[BPF_REG_0].btf_id = ret_btf_id;
> +
> +				env->insn_aux_data[insn_idx].obj_new_size = ret_t->size;
> +				env->insn_aux_data[insn_idx].kptr_struct_meta =
> +					btf_find_struct_meta(ret_btf, ret_btf_id);
> +			} else {
> +				verbose(env, "kernel function %s unhandled dynamic return type\n",
> +					meta.func_name);
> +				return -EFAULT;
> +			}
> +		} else if (!__btf_type_is_struct(ptr_type)) {
>  			if (!meta.r0_size) {
>  				ptr_type_name = btf_name_by_offset(desc_btf,
>  								   ptr_type->name_off);
> @@ -8507,6 +8574,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>  			regs[BPF_REG_0].type = PTR_TO_BTF_ID;
>  			regs[BPF_REG_0].btf_id = ptr_type_id;
>  		}
> +
>  		if (is_kfunc_ret_null(&meta)) {
>  			regs[BPF_REG_0].type |= PTR_MAYBE_NULL;
>  			/* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */
> @@ -14644,8 +14712,8 @@ static int fixup_call_args(struct bpf_verifier_env *env)
>  	return err;
>  }
>
> -static int fixup_kfunc_call(struct bpf_verifier_env *env,
> -			    struct bpf_insn *insn)
> +static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
> +			    struct bpf_insn *insn_buf, int insn_idx, int *cnt)
>  {
>  	const struct bpf_kfunc_desc *desc;
>
> @@ -14664,8 +14732,21 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env,
>  		return -EFAULT;
>  	}
>
> +	*cnt = 0;
>  	insn->imm = desc->imm;
> +	if (insn->off)
> +		return 0;
> +	if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl]) {
> +		struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
> +		struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
> +		u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size;
>
> +		insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size);
> +		insn_buf[1] = addr[0];
> +		insn_buf[2] = addr[1];
> +		insn_buf[3] = *insn;
> +		*cnt = 4;
> +	}
>  	return 0;
>  }
>
> @@ -14807,9 +14888,19 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
>  			if (insn->src_reg == BPF_PSEUDO_CALL)
>  				continue;
>  			if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
> -				ret = fixup_kfunc_call(env, insn);
> +				ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt);
>  				if (ret)
>  					return ret;
> +				if (cnt == 0)
> +					continue;
> +
> +				new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
> +				if (!new_prog)
> +					return -ENOMEM;
> +
> +				delta += cnt - 1;
> +				env->prog = prog = new_prog;
> +				insn = new_prog->insnsi + i + delta;
>  				continue;
>  			}
>
> diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
> new file mode 100644
> index 000000000000..aeb6a7fcb7c4
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/bpf_experimental.h
> @@ -0,0 +1,25 @@
> +#ifndef __BPF_EXPERIMENTAL__
> +#define __BPF_EXPERIMENTAL__
> +
> +#include <vmlinux.h>
> +#include <bpf/bpf_tracing.h>
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_core_read.h>
> +
> +/* Description
> + *	Allocates an object of the type represented by 'local_type_id' in
> + *	program BTF. User may use the bpf_core_type_id_local macro to pass the
> + *	type ID of a struct in program BTF.
> + *
> + *	The 'local_type_id' parameter must be a known constant.
> + *	The 'meta' parameter is a hidden argument that is ignored.
> + * Returns
> + *	A pointer to an object of the type corresponding to the passed in
> + *	'local_type_id', or NULL on failure.
> + */
> +extern void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym;
> +
> +/* Convenience macro to wrap over bpf_obj_new_impl */
> +#define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL))
> +
> +#endif
> --
> 2.38.1
>
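
For completeness, a sketch of how the header is meant to be consumed from a
selftest. This is illustrative only: 'struct elem' and the attach point are
made up, the kfunc set is registered only for BPF_PROG_TYPE_TRACING in this
patch (hence a tracing program type), and with only this patch applied the
object cannot be freed yet, so loading would fail with an unreleased
reference until the free kfunc from the next patch lands.

    #include "bpf_experimental.h"

    struct elem {
            int val;
    };

    SEC("fentry/bpf_fentry_test1")
    int BPF_PROG(obj_new_example, int a)
    {
            struct elem *e;

            e = bpf_obj_new(typeof(*e));
            if (!e)
                    return 0;
            e->val = a;
            /* bpf_obj_new_impl is KF_ACQUIRE | KF_RET_NULL, so 'e' holds a
             * reference that must eventually be released by the free kfunc
             * introduced in the next patch of the series.
             */
            return 0;
    }

    char _license[] SEC("license") = "GPL";
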