On Fri, Apr 15, 2022 at 09:33:44PM +0530, Kumar Kartikeya Dwivedi wrote:
> +struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf,
> +					   const struct btf_type *t)
> +{
> +	struct btf_field_info info_arr[BPF_MAP_VALUE_OFF_MAX];
> +	struct bpf_map_value_off *tab;
> +	int ret, i, nr_off;
> +
> +	/* Revisit stack usage when bumping BPF_MAP_VALUE_OFF_MAX */
> +	BUILD_BUG_ON(BPF_MAP_VALUE_OFF_MAX != 8);

Please drop this line and the comment. It establishes a false sense of
safety, as if this were the one place to worry about when the enum changes.
Any time an enum or #define is changed, all code that uses it has to be
audited. The stack increase is a minor concern compared to all the other
side effects that increasing BPF_MAP_VALUE_OFF_MAX would have.

> +
> +	ret = btf_find_field(btf, t, BTF_FIELD_KPTR, info_arr, ARRAY_SIZE(info_arr));
> +	if (ret < 0)
> +		return ERR_PTR(ret);
> +	if (!ret)
> +		return NULL;
> +
> +	nr_off = ret;
> +	tab = kzalloc(offsetof(struct bpf_map_value_off, off[nr_off]), GFP_KERNEL | __GFP_NOWARN);
> +	if (!tab)
> +		return ERR_PTR(-ENOMEM);
> +
> +	for (i = 0; i < nr_off; i++) {
> +		const struct btf_type *t;
> +		struct btf *off_btf;

off_btf is an odd name here. Call it kernel_btf?

> +		s32 id;
> +
> +		t = btf_type_by_id(btf, info_arr[i].type_id);

Please add a comment here to make it clear that 'btf' above is the prog's
BTF and that the search below is trying to find the type in the kernel or
module BTFs.

> +		id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info),
> +				     &off_btf);
> +		if (id < 0) {
> +			ret = id;
> +			goto end;
> +		}
> +
> +		tab->off[i].offset = info_arr[i].off;
> +		tab->off[i].kptr.btf_id = id;
> +		tab->off[i].kptr.btf = off_btf;
> +	}
> +	tab->nr_off = nr_off;
> +	return tab;
> +end:
> +	while (i--)
> +		btf_put(tab->off[i].kptr.btf);
> +	kfree(tab);
> +	return ERR_PTR(ret);
> +}
> +
>  static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
>  			      u32 type_id, void *data, u8 bits_offset,
>  			      struct btf_show *show)
> diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
> index 5cd8f5277279..135205d0d560 100644
> --- a/kernel/bpf/map_in_map.c
> +++ b/kernel/bpf/map_in_map.c
> @@ -52,6 +52,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
>  	inner_map_meta->max_entries = inner_map->max_entries;
>  	inner_map_meta->spin_lock_off = inner_map->spin_lock_off;
>  	inner_map_meta->timer_off = inner_map->timer_off;
> +	inner_map_meta->kptr_off_tab = bpf_map_copy_kptr_off_tab(inner_map);
>  	if (inner_map->btf) {
>  		btf_get(inner_map->btf);
>  		inner_map_meta->btf = inner_map->btf;
> @@ -71,6 +72,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
>
>  void bpf_map_meta_free(struct bpf_map *map_meta)
>  {
> +	bpf_map_free_kptr_off_tab(map_meta);
>  	btf_put(map_meta->btf);
>  	kfree(map_meta);
>  }
> @@ -83,7 +85,8 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
>  	       meta0->key_size == meta1->key_size &&
>  	       meta0->value_size == meta1->value_size &&
>  	       meta0->timer_off == meta1->timer_off &&
> -	       meta0->map_flags == meta1->map_flags;
> +	       meta0->map_flags == meta1->map_flags &&
> +	       bpf_map_equal_kptr_off_tab(meta0, meta1);
>  }
>
>  void *bpf_map_fd_get_ptr(struct bpf_map *map,
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index e9621cfa09f2..fba49f390ed5 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -6,6 +6,7 @@
>  #include <linux/bpf_trace.h>
>  #include <linux/bpf_lirc.h>
>  #include <linux/bpf_verifier.h>
> +#include <linux/bsearch.h>
>  #include <linux/btf.h>
>  #include <linux/syscalls.h>
>  #include <linux/slab.h>
> @@ -473,12 +474,94 @@ static void bpf_map_release_memcg(struct bpf_map *map)
>  }
>  #endif
>
> +static int bpf_map_kptr_off_cmp(const void *a, const void *b)
> +{
> +	const struct bpf_map_value_off_desc *off_desc1 = a, *off_desc2 = b;
> +
> +	if (off_desc1->offset < off_desc2->offset)
> +		return -1;
> +	else if (off_desc1->offset > off_desc2->offset)
> +		return 1;
> +	return 0;
> +}
> +
> +struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset)
> +{
> +	/* Since members are iterated in btf_find_field in increasing order,
> +	 * offsets appended to kptr_off_tab are in increasing order, so we can
> +	 * do bsearch to find exact match.
> +	 */
> +	struct bpf_map_value_off *tab;
> +
> +	if (!map_value_has_kptrs(map))
> +		return NULL;
> +	tab = map->kptr_off_tab;
> +	return bsearch(&offset, tab->off, tab->nr_off, sizeof(tab->off[0]), bpf_map_kptr_off_cmp);
> +}
> +
> +void bpf_map_free_kptr_off_tab(struct bpf_map *map)
> +{
> +	struct bpf_map_value_off *tab = map->kptr_off_tab;
> +	int i;
> +
> +	if (!map_value_has_kptrs(map))
> +		return;
> +	for (i = 0; i < tab->nr_off; i++) {
> +		struct btf *btf = tab->off[i].kptr.btf;
> +
> +		btf_put(btf);

Why not do btf_put(tab->off[i].kptr.btf); directly and drop the local
variable?

> +	}
> +	kfree(tab);
> +	map->kptr_off_tab = NULL;
> +}
> +
> +struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map)
> +{
> +	struct bpf_map_value_off *tab = map->kptr_off_tab, *new_tab;
> +	int size, i, ret;
> +
> +	if (!map_value_has_kptrs(map))
> +		return ERR_PTR(-ENOENT);
> +	/* Do a deep copy of the kptr_off_tab */
> +	for (i = 0; i < tab->nr_off; i++)
> +		btf_get(tab->off[i].kptr.btf);
> +
> +	size = offsetof(struct bpf_map_value_off, off[tab->nr_off]);
> +	new_tab = kmemdup(tab, size, GFP_KERNEL | __GFP_NOWARN);
> +	if (!new_tab) {
> +		ret = -ENOMEM;
> +		goto end;
> +	}
> +	return new_tab;
> +end:
> +	while (i--)
> +		btf_put(tab->off[i].kptr.btf);

Why do this get/put dance? Isn't it equivalent to do the kmemdup first and
then the btf_get() loop? kptr_off_tab is not going away and the btfs are
not going away either, so there is no race and no unwind path is needed.
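Roughly something like this (untested sketch, using only helpers already
present in this patch):

struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map)
{
	struct bpf_map_value_off *tab = map->kptr_off_tab, *new_tab;
	int size, i;

	if (!map_value_has_kptrs(map))
		return ERR_PTR(-ENOENT);
	size = offsetof(struct bpf_map_value_off, off[tab->nr_off]);
	/* Copy first. The original tab still holds its btf references,
	 * so the btfs cannot go away between kmemdup and the loop below.
	 */
	new_tab = kmemdup(tab, size, GFP_KERNEL | __GFP_NOWARN);
	if (!new_tab)
		return ERR_PTR(-ENOMEM);
	/* Take one extra reference per kptr for the new copy */
	for (i = 0; i < tab->nr_off; i++)
		btf_get(tab->off[i].kptr.btf);
	return new_tab;
}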
> +	return ERR_PTR(ret);
> +}
> +
> +bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b)
> +{
> +	struct bpf_map_value_off *tab_a = map_a->kptr_off_tab, *tab_b = map_b->kptr_off_tab;
> +	bool a_has_kptr = map_value_has_kptrs(map_a), b_has_kptr = map_value_has_kptrs(map_b);
> +	int size;
> +
> +	if (!a_has_kptr && !b_has_kptr)
> +		return true;
> +	if (a_has_kptr != b_has_kptr)
> +		return false;
> +	if (tab_a->nr_off != tab_b->nr_off)
> +		return false;
> +	size = offsetof(struct bpf_map_value_off, off[tab_a->nr_off]);
> +	return !memcmp(tab_a, tab_b, size);
> +}
> +
>  /* called from workqueue */
>  static void bpf_map_free_deferred(struct work_struct *work)
>  {
>  	struct bpf_map *map = container_of(work, struct bpf_map, work);
>
>  	security_bpf_map_free(map);
> +	bpf_map_free_kptr_off_tab(map);
>  	bpf_map_release_memcg(map);
>  	/* implementation dependent freeing */
>  	map->ops->map_free(map);
> @@ -640,7 +723,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
>  	int err;
>
>  	if (!map->ops->map_mmap || map_value_has_spin_lock(map) ||
> -	    map_value_has_timer(map))
> +	    map_value_has_timer(map) || map_value_has_kptrs(map))
>  		return -ENOTSUPP;
>
>  	if (!(vma->vm_flags & VM_SHARED))
> @@ -820,9 +903,33 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
>  		return -EOPNOTSUPP;
>  	}
>
> -	if (map->ops->map_check_btf)
> +	map->kptr_off_tab = btf_parse_kptrs(btf, value_type);
> +	if (map_value_has_kptrs(map)) {
> +		if (!bpf_capable()) {
> +			ret = -EPERM;
> +			goto free_map_tab;
> +		}
> +		if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) {
> +			ret = -EACCES;
> +			goto free_map_tab;
> +		}
> +		if (map->map_type != BPF_MAP_TYPE_HASH &&
> +		    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
> +		    map->map_type != BPF_MAP_TYPE_ARRAY) {
> +			ret = -EOPNOTSUPP;
> +			goto free_map_tab;
> +		}
> +	}
> +
> +	if (map->ops->map_check_btf) {
>  		ret = map->ops->map_check_btf(map, btf, key_type, value_type);
> +		if (ret < 0)
> +			goto free_map_tab;
> +	}
>
> +	return ret;
> +free_map_tab:
> +	bpf_map_free_kptr_off_tab(map);
>  	return ret;
>  }
>
> @@ -1639,7 +1746,7 @@ static int map_freeze(const union bpf_attr *attr)
>  		return PTR_ERR(map);
>
>  	if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS ||
> -	    map_value_has_timer(map)) {
> +	    map_value_has_timer(map) || map_value_has_kptrs(map)) {
>  		fdput(f);
>  		return -ENOTSUPP;
>  	}
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 71827d14724a..c802e51c4e18 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -3211,7 +3211,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
>  	return 0;
>  }
>
> -enum stack_access_src {
> +enum bpf_access_src {
>  	ACCESS_DIRECT = 1,	/* the access is performed by an instruction */
>  	ACCESS_HELPER = 2,	/* the access is performed by a helper */
>  };
> @@ -3219,7 +3219,7 @@ enum stack_access_src {
>  static int check_stack_range_initialized(struct bpf_verifier_env *env,
>  					 int regno, int off, int access_size,
>  					 bool zero_size_allowed,
> -					 enum stack_access_src type,
> +					 enum bpf_access_src type,
>  					 struct bpf_call_arg_meta *meta);
>
>  static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
> @@ -3507,9 +3507,87 @@ int check_ptr_off_reg(struct bpf_verifier_env *env,
>  	return __check_ptr_off_reg(env, reg, regno, false);
>  }
>
> +static int map_kptr_match_type(struct bpf_verifier_env *env,
> +			       struct bpf_map_value_off_desc *off_desc,
> +			       struct bpf_reg_state *reg, u32 regno)
> +{
> +	const char *targ_name = kernel_type_name(off_desc->kptr.btf, off_desc->kptr.btf_id);
> +	const char *reg_name = "";
> +
> +	if (base_type(reg->type) != PTR_TO_BTF_ID || type_flag(reg->type) != PTR_MAYBE_NULL)
> +		goto bad_type;
> +
> +	if (!btf_is_kernel(reg->btf)) {
> +		verbose(env, "R%d must point to kernel BTF\n", regno);
> +		return -EINVAL;
> +	}
> +	/* We need to verify reg->type and reg->btf, before accessing reg->btf */
> +	reg_name = kernel_type_name(reg->btf, reg->btf_id);
> +
> +	if (__check_ptr_off_reg(env, reg, regno, true))
> +		return -EACCES;
> +
> +	if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
> +				  off_desc->kptr.btf, off_desc->kptr.btf_id))
> +		goto bad_type;

Is the full type comparison really needed? reg->btf should be the same
pointer as off_desc->kptr.btf, and the btf_id should match exactly. Is this
future proofing for some day when registers with PTR_TO_BTF_ID type will
start pointing to the prog's BTF?

> +	return 0;
> +bad_type:
> +	verbose(env, "invalid kptr access, R%d type=%s%s ", regno,
> +		reg_type_str(env, reg->type), reg_name);
> +	verbose(env, "expected=%s%s\n", reg_type_str(env, PTR_TO_BTF_ID), targ_name);
> +	return -EINVAL;
> +}
> +
> +static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
> +				 int value_regno, int insn_idx,
> +				 struct bpf_map_value_off_desc *off_desc)
> +{
> +	struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
> +	int class = BPF_CLASS(insn->code);
> +	struct bpf_reg_state *val_reg;
> +
> +	/* Things we already checked for in check_map_access and caller:
> +	 *  - Reject cases where variable offset may touch kptr
> +	 *  - size of access (must be BPF_DW)
> +	 *  - tnum_is_const(reg->var_off)
> +	 *  - off_desc->offset == off + reg->var_off.value
> +	 */
> +	/* Only BPF_[LDX,STX,ST] | BPF_MEM | BPF_DW is supported */
> +	if (BPF_MODE(insn->code) != BPF_MEM) {
> +		verbose(env, "kptr in map can only be accessed using BPF_MEM instruction mode\n");
> +		return -EACCES;
> +	}
> +
> +	if (class == BPF_LDX) {
> +		val_reg = reg_state(env, value_regno);
> +		/* We can simply mark the value_regno receiving the pointer
> +		 * value from map as PTR_TO_BTF_ID, with the correct type.
> +		 */
> +		mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, off_desc->kptr.btf,
> +				off_desc->kptr.btf_id, PTR_MAYBE_NULL);
> +		val_reg->id = ++env->id_gen;

Why is a non-zero id needed here?
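If it's for the usual NULL-check tracking, a comment would help. My
understanding (correct me if I'm wrong) is that the fresh id is what lets
mark_ptr_or_null_regs() flip every copy of this register from
PTR_TO_BTF_ID | PTR_MAYBE_NULL to PTR_TO_BTF_ID once one copy is
NULL-checked, e.g. in a prog fragment like this (all names made up):

	struct foo *p, *q;	/* 'struct foo' and 'kptr_field' are hypothetical */

	p = v->kptr_field;	/* PTR_TO_BTF_ID | PTR_MAYBE_NULL, id = N */
	q = p;			/* the copy inherits id = N */
	if (!p)			/* the check marks all regs with id = N as non-NULL */
		return 0;
	/* here both p and q are usable as PTR_TO_BTF_ID */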