On Thu, Apr 21, 2022 at 09:45:28AM IST, Alexei Starovoitov wrote:
> On Fri, Apr 15, 2022 at 09:33:44PM +0530, Kumar Kartikeya Dwivedi wrote:
> > +struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf,
> > +					    const struct btf_type *t)
> > +{
> > +	struct btf_field_info info_arr[BPF_MAP_VALUE_OFF_MAX];
> > +	struct bpf_map_value_off *tab;
> > +	int ret, i, nr_off;
> > +
> > +	/* Revisit stack usage when bumping BPF_MAP_VALUE_OFF_MAX */
> > +	BUILD_BUG_ON(BPF_MAP_VALUE_OFF_MAX != 8);
>
> Pls drop this line and comment. It's establishing a false sense of safety
> that this is the place to worry about when the enum is changing.
> Any time an enum or #define is changed, all code that uses it has to be audited.
> This stack increase is a minor concern compared to all the other side effects
> an increase of BPF_MAP_VALUE_OFF_MAX would have.
>

Ok.

> > +
> > +	ret = btf_find_field(btf, t, BTF_FIELD_KPTR, info_arr, ARRAY_SIZE(info_arr));
> > +	if (ret < 0)
> > +		return ERR_PTR(ret);
> > +	if (!ret)
> > +		return NULL;
> > +
> > +	nr_off = ret;
> > +	tab = kzalloc(offsetof(struct bpf_map_value_off, off[nr_off]), GFP_KERNEL | __GFP_NOWARN);
> > +	if (!tab)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	for (i = 0; i < nr_off; i++) {
> > +		const struct btf_type *t;
> > +		struct btf *off_btf;
>
> off_btf is an odd name here. Call it kernel_btf?
>

Ok, will rename.

> > +		s32 id;
> > +
> > +		t = btf_type_by_id(btf, info_arr[i].type_id);
>
> pls add a comment here to make it clear that the 'btf' above is the prog's btf
> and the search below is trying to find it in kernel or module btf-s.
>

Ok.

> > +		id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info),
> > +				     &off_btf);
> > +		if (id < 0) {
> > +			ret = id;
> > +			goto end;
> > +		}
> > +
> > +		tab->off[i].offset = info_arr[i].off;
> > +		tab->off[i].kptr.btf_id = id;
> > +		tab->off[i].kptr.btf = off_btf;
> > +	}
> > +	tab->nr_off = nr_off;
> > +	return tab;
> > +end:
> > +	while (i--)
> > +		btf_put(tab->off[i].kptr.btf);
> > +	kfree(tab);
> > +	return ERR_PTR(ret);
> > +}
> > +
> >  static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
> >  			      u32 type_id, void *data, u8 bits_offset,
> >  			      struct btf_show *show)
> > diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
> > index 5cd8f5277279..135205d0d560 100644
> > --- a/kernel/bpf/map_in_map.c
> > +++ b/kernel/bpf/map_in_map.c
> > @@ -52,6 +52,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
> >  	inner_map_meta->max_entries = inner_map->max_entries;
> >  	inner_map_meta->spin_lock_off = inner_map->spin_lock_off;
> >  	inner_map_meta->timer_off = inner_map->timer_off;
> > +	inner_map_meta->kptr_off_tab = bpf_map_copy_kptr_off_tab(inner_map);
> >  	if (inner_map->btf) {
> >  		btf_get(inner_map->btf);
> >  		inner_map_meta->btf = inner_map->btf;
> > @@ -71,6 +72,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
> >
> >  void bpf_map_meta_free(struct bpf_map *map_meta)
> >  {
> > +	bpf_map_free_kptr_off_tab(map_meta);
> >  	btf_put(map_meta->btf);
> >  	kfree(map_meta);
> >  }
> > @@ -83,7 +85,8 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
> >  	       meta0->key_size == meta1->key_size &&
> >  	       meta0->value_size == meta1->value_size &&
> >  	       meta0->timer_off == meta1->timer_off &&
> > -	       meta0->map_flags == meta1->map_flags;
> > +	       meta0->map_flags == meta1->map_flags &&
> > +	       bpf_map_equal_kptr_off_tab(meta0, meta1);
> >  }
> >
> >  void *bpf_map_fd_get_ptr(struct bpf_map *map,
> > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> > index e9621cfa09f2..fba49f390ed5 100644
> > --- a/kernel/bpf/syscall.c
> > +++ b/kernel/bpf/syscall.c
> > @@ -6,6 +6,7 @@
> >  #include <linux/bpf_trace.h>
> >  #include <linux/bpf_lirc.h>
> >  #include <linux/bpf_verifier.h>
> > +#include <linux/bsearch.h>
> >  #include <linux/btf.h>
> >  #include <linux/syscalls.h>
> >  #include <linux/slab.h>
> > @@ -473,12 +474,94 @@ static void bpf_map_release_memcg(struct bpf_map *map)
> >  }
> >  #endif
> >
> > +static int bpf_map_kptr_off_cmp(const void *a, const void *b)
> > +{
> > +	const struct bpf_map_value_off_desc *off_desc1 = a, *off_desc2 = b;
> > +
> > +	if (off_desc1->offset < off_desc2->offset)
> > +		return -1;
> > +	else if (off_desc1->offset > off_desc2->offset)
> > +		return 1;
> > +	return 0;
> > +}
> > +
> > +struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset)
> > +{
> > +	/* Since members are iterated in btf_find_field in increasing order,
> > +	 * offsets appended to kptr_off_tab are in increasing order, so we can
> > +	 * do bsearch to find exact match.
> > +	 */
> > +	struct bpf_map_value_off *tab;
> > +
> > +	if (!map_value_has_kptrs(map))
> > +		return NULL;
> > +	tab = map->kptr_off_tab;
> > +	return bsearch(&offset, tab->off, tab->nr_off, sizeof(tab->off[0]), bpf_map_kptr_off_cmp);
> > +}
> > +
> > +void bpf_map_free_kptr_off_tab(struct bpf_map *map)
> > +{
> > +	struct bpf_map_value_off *tab = map->kptr_off_tab;
> > +	int i;
> > +
> > +	if (!map_value_has_kptrs(map))
> > +		return;
> > +	for (i = 0; i < tab->nr_off; i++) {
> > +		struct btf *btf = tab->off[i].kptr.btf;
> > +
> > +		btf_put(btf);
>
> why not just do: btf_put(tab->off[i].kptr.btf);
>

Ok.

> > +	}
> > +	kfree(tab);
> > +	map->kptr_off_tab = NULL;
> > +}
> > +
> > +struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map)
> > +{
> > +	struct bpf_map_value_off *tab = map->kptr_off_tab, *new_tab;
> > +	int size, i, ret;
> > +
> > +	if (!map_value_has_kptrs(map))
> > +		return ERR_PTR(-ENOENT);
> > +	/* Do a deep copy of the kptr_off_tab */
> > +	for (i = 0; i < tab->nr_off; i++)
> > +		btf_get(tab->off[i].kptr.btf);
> > +
> > +	size = offsetof(struct bpf_map_value_off, off[tab->nr_off]);
> > +	new_tab = kmemdup(tab, size, GFP_KERNEL | __GFP_NOWARN);
> > +	if (!new_tab) {
> > +		ret = -ENOMEM;
> > +		goto end;
> > +	}
> > +	return new_tab;
> > +end:
> > +	while (i--)
> > +		btf_put(tab->off[i].kptr.btf);
>
> Why do this get/put dance?
> Isn't it equivalent to do kmemdup first and then the for() btf_get loop?
> kptr_off_tab is not going away and btfs are not going away either.
> There is no race.
>

You are right, we should be able to just do kmemdup + btf_get loop.
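
Roughly like this for the next revision (untested sketch, reusing the names from
the patch above; only the ordering changes, and the error unwinding goes away
since no references are taken before kmemdup succeeds):

struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map)
{
	struct bpf_map_value_off *tab = map->kptr_off_tab, *new_tab;
	int size, i;

	if (!map_value_has_kptrs(map))
		return ERR_PTR(-ENOENT);

	size = offsetof(struct bpf_map_value_off, off[tab->nr_off]);
	new_tab = kmemdup(tab, size, GFP_KERNEL | __GFP_NOWARN);
	if (!new_tab)
		return ERR_PTR(-ENOMEM);
	/* Deep copy: take one btf reference per copied entry. kptr_off_tab
	 * and the btfs it points to are stable here, so duplicating first
	 * and taking references afterwards cannot race with a put.
	 */
	for (i = 0; i < tab->nr_off; i++)
		btf_get(tab->off[i].kptr.btf);
	return new_tab;
}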
> > +	return ERR_PTR(ret);
> > +}
> > +
> > +bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b)
> > +{
> > +	struct bpf_map_value_off *tab_a = map_a->kptr_off_tab, *tab_b = map_b->kptr_off_tab;
> > +	bool a_has_kptr = map_value_has_kptrs(map_a), b_has_kptr = map_value_has_kptrs(map_b);
> > +	int size;
> > +
> > +	if (!a_has_kptr && !b_has_kptr)
> > +		return true;
> > +	if (a_has_kptr != b_has_kptr)
> > +		return false;
> > +	if (tab_a->nr_off != tab_b->nr_off)
> > +		return false;
> > +	size = offsetof(struct bpf_map_value_off, off[tab_a->nr_off]);
> > +	return !memcmp(tab_a, tab_b, size);
> > +}
> > +
> >  /* called from workqueue */
> >  static void bpf_map_free_deferred(struct work_struct *work)
> >  {
> >  	struct bpf_map *map = container_of(work, struct bpf_map, work);
> >
> >  	security_bpf_map_free(map);
> > +	bpf_map_free_kptr_off_tab(map);
> >  	bpf_map_release_memcg(map);
> >  	/* implementation dependent freeing */
> >  	map->ops->map_free(map);
> > @@ -640,7 +723,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
> >  	int err;
> >
> >  	if (!map->ops->map_mmap || map_value_has_spin_lock(map) ||
> > -	    map_value_has_timer(map))
> > +	    map_value_has_timer(map) || map_value_has_kptrs(map))
> >  		return -ENOTSUPP;
> >
> >  	if (!(vma->vm_flags & VM_SHARED))
> > @@ -820,9 +903,33 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
> >  			return -EOPNOTSUPP;
> >  	}
> >
> > -	if (map->ops->map_check_btf)
> > +	map->kptr_off_tab = btf_parse_kptrs(btf, value_type);
> > +	if (map_value_has_kptrs(map)) {
> > +		if (!bpf_capable()) {
> > +			ret = -EPERM;
> > +			goto free_map_tab;
> > +		}
> > +		if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) {
> > +			ret = -EACCES;
> > +			goto free_map_tab;
> > +		}
> > +		if (map->map_type != BPF_MAP_TYPE_HASH &&
> > +		    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
> > +		    map->map_type != BPF_MAP_TYPE_ARRAY) {
> > +			ret = -EOPNOTSUPP;
> > +			goto free_map_tab;
> > +		}
> > +	}
> > +
> > +	if (map->ops->map_check_btf) {
> >  		ret = map->ops->map_check_btf(map, btf, key_type, value_type);
> > +		if (ret < 0)
> > +			goto free_map_tab;
> > +	}
> >
> > +	return ret;
> > +free_map_tab:
> > +	bpf_map_free_kptr_off_tab(map);
> >  	return ret;
> >  }
> >
> > @@ -1639,7 +1746,7 @@ static int map_freeze(const union bpf_attr *attr)
> >  		return PTR_ERR(map);
> >
> >  	if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS ||
> > -	    map_value_has_timer(map)) {
> > +	    map_value_has_timer(map) || map_value_has_kptrs(map)) {
> >  		fdput(f);
> >  		return -ENOTSUPP;
> >  	}
> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > index 71827d14724a..c802e51c4e18 100644
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -3211,7 +3211,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
> >  	return 0;
> >  }
> >
> > -enum stack_access_src {
> > +enum bpf_access_src {
> >  	ACCESS_DIRECT = 1, /* the access is performed by an instruction */
> >  	ACCESS_HELPER = 2, /* the access is performed by a helper */
> >  };
> > @@ -3219,7 +3219,7 @@ enum stack_access_src {
> >  static int check_stack_range_initialized(struct bpf_verifier_env *env,
> >  					 int regno, int off, int access_size,
> >  					 bool zero_size_allowed,
> > -					 enum stack_access_src type,
> > +					 enum bpf_access_src type,
> >  					 struct bpf_call_arg_meta *meta);
> >
> >  static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
> > @@ -3507,9 +3507,87 @@ int check_ptr_off_reg(struct bpf_verifier_env *env,
> >  	return __check_ptr_off_reg(env, reg, regno, false);
> >  }
> >
> > +static int map_kptr_match_type(struct bpf_verifier_env *env,
> > +			       struct bpf_map_value_off_desc *off_desc,
> > +			       struct bpf_reg_state *reg, u32 regno)
> > +{
> > +	const char *targ_name = kernel_type_name(off_desc->kptr.btf, off_desc->kptr.btf_id);
> > +	const char *reg_name = "";
> > +
> > +	if (base_type(reg->type) != PTR_TO_BTF_ID || type_flag(reg->type) != PTR_MAYBE_NULL)
> > +		goto bad_type;
> > +
> > +	if (!btf_is_kernel(reg->btf)) {
> > +		verbose(env, "R%d must point to kernel BTF\n", regno);
> > +		return -EINVAL;
> > +	}
> > +	/* We need to verify reg->type and reg->btf, before accessing reg->btf */
> > +	reg_name = kernel_type_name(reg->btf, reg->btf_id);
> > +
> > +	if (__check_ptr_off_reg(env, reg, regno, true))
> > +		return -EACCES;
> > +
> > +	if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
> > +				  off_desc->kptr.btf, off_desc->kptr.btf_id))
> > +		goto bad_type;
>
> Is full type comparison really needed?

Yes.

> reg->btf should be the same pointer as off_desc->kptr.btf
> and btf_id should match exactly.

This is not true, it can be vmlinux or module BTF.

But if you mean just comparing the pointer and btf_id, we still need to handle
reg->off. We want to support cases like:

struct foo {
	struct bar br;
	struct baz bz;
};

struct foo *v = func(); // PTR_TO_BTF_ID
map->foo = v;      // reg->off is zero, btf and btf_id match the type.
map->bar = &v->br; // reg->off is still zero, but we need to walk and retry with
                   // the first member type of the struct after the comparison fails.
map->baz = &v->bz; // reg->off is non-zero, so the struct needs to be walked to
                   // match the type.

In the ref case, the argument's offset will always be 0, so the third case is
not going to work, but in the unref case we want to allow storing pointers to
structs embedded inside the parent struct.

Please let me know if I misunderstood what you meant.

> Is this future proofing for some day when registers with PTR_TO_BTF_ID type
> will start pointing to the prog's btf?
>
> > +	return 0;
> > +bad_type:
> > +	verbose(env, "invalid kptr access, R%d type=%s%s ", regno,
> > +		reg_type_str(env, reg->type), reg_name);
> > +	verbose(env, "expected=%s%s\n", reg_type_str(env, PTR_TO_BTF_ID), targ_name);
> > +	return -EINVAL;
> > +}
> > +
> > +static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
> > +				 int value_regno, int insn_idx,
> > +				 struct bpf_map_value_off_desc *off_desc)
> > +{
> > +	struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
> > +	int class = BPF_CLASS(insn->code);
> > +	struct bpf_reg_state *val_reg;
> > +
> > +	/* Things we already checked for in check_map_access and caller:
> > +	 *  - Reject cases where variable offset may touch kptr
> > +	 *  - size of access (must be BPF_DW)
> > +	 *  - tnum_is_const(reg->var_off)
> > +	 *  - off_desc->offset == off + reg->var_off.value
> > +	 */
> > +	/* Only BPF_[LDX,STX,ST] | BPF_MEM | BPF_DW is supported */
> > +	if (BPF_MODE(insn->code) != BPF_MEM) {
> > +		verbose(env, "kptr in map can only be accessed using BPF_MEM instruction mode\n");
> > +		return -EACCES;
> > +	}
> > +
> > +	if (class == BPF_LDX) {
> > +		val_reg = reg_state(env, value_regno);
> > +		/* We can simply mark the value_regno receiving the pointer
> > +		 * value from map as PTR_TO_BTF_ID, with the correct type.
> > +		 */
> > +		mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, off_desc->kptr.btf,
> > +				off_desc->kptr.btf_id, PTR_MAYBE_NULL);
> > +		val_reg->id = ++env->id_gen;
>
> why is a non-zero id needed here?

For mark_ptr_or_null_reg. I'll add a comment.
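
Tentative wording for that comment, to give an idea (will adjust in the respin):

		/* Assign a fresh id to the loaded pointer, so that
		 * mark_ptr_or_null_reg can find and convert all copies of
		 * this maybe-NULL kptr once one of them is NULL checked.
		 */
		val_reg->id = ++env->id_gen;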
--
Kartikeya