Re: [PATCH RFC bpf-next v1 13/32] bpf: Introduce bpf_list_head support for BPF maps

On Sun, Sep 04, 2022 at 10:41:26PM +0200, Kumar Kartikeya Dwivedi wrote:
> Add the basic support on the map side to parse, recognize, verify, and
> build metadata table for a new special field of the type struct
> bpf_list_head. To parameterize the bpf_list_head for a certain value
> type and the list_node member it will accept in that value type, we use
> BTF declaration tags.
> 
> The definition of bpf_list_head in a map value will be done as follows:
> 
> struct foo {
> 	int data;
> 	struct bpf_list_node list;
> };
> 
> struct map_value {
> 	struct bpf_list_head list __contains(struct, foo, node);
> };

kptrs are only for structs.
So I would drop the explicit 1st argument, which is going to be 'struct'
for the foreseeable future, and leave it as:
 struct bpf_list_head list __contains(foo, node);
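
For reference, a sketch of how such a __contains() macro could be built on
btf_decl_tag, assuming the "contains:kind:name:node" encoding from the commit
message (the macro definition isn't part of this patch, so both variants
below are guesses, and they are alternatives, not meant to coexist):

 /* current proposal: __contains(struct, foo, node)
  *	-> btf_decl_tag("contains:struct:foo:node")
  */
 #define __contains(kind, name, node) \
	__attribute__((btf_decl_tag("contains:" #kind ":" #name ":" #node)))

 /* with the implicit 'struct' suggested above:
  *	__contains(foo, node) -> btf_decl_tag("contains:struct:foo:node")
  */
 #define __contains(name, node) \
	__attribute__((btf_decl_tag("contains:struct:" #name ":" #node)))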

There is a typo in struct foo: s/list;/node;/, right?

> Then, the bpf_list_head only allows adding to the list using the
> bpf_list_node 'list' for the type struct foo.
> 
> The 'contains' annotation is a BTF declaration tag composed of four
> parts, "contains:kind:name:node" where the kind and name is then used to
> look up the type in the map BTF. The node defines name of the member in
> this type that has the type struct bpf_list_node, which is actually used
> for linking into the linked list.
> 
> This allows building intrusive linked lists in BPF, using container_of
> to obtain pointer to entry, while being completely type safe from the
> perspective of the verifier. The verifier knows exactly the type of the
> nodes, and knows that list helpers return that type at some fixed offset
> where the bpf_list_node member used for this list exists. The verifier
> also uses this information to disallow adding types that are not
> accepted by a certain list.
> 
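For illustration, the pattern being described would look roughly like this
from the BPF program side once helpers land (bpf_list_pop_front here is a
hypothetical name; nothing in this patch adds it):

 struct bpf_list_node *n = bpf_list_pop_front(&v->list); /* hypothetical */
 struct foo *f = container_of(n, struct foo, node);	 /* type-safe entry */
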
> For now, no elements can be added to such lists. Support for that is
> coming in future patches, hence draining and freeing items is left out
> for now, and just freeing the list_head_off_tab is done, since it is
> still built and populated when bpf_list_head is specified in the map
> value.
> 
> Signed-off-by: Kumar Kartikeya Dwivedi <memxor@xxxxxxxxx>
> ---
>  include/linux/bpf.h                           |  64 +++++--
>  include/linux/btf.h                           |   2 +
>  kernel/bpf/arraymap.c                         |   2 +
>  kernel/bpf/bpf_local_storage.c                |   1 +
>  kernel/bpf/btf.c                              | 173 +++++++++++++++++-
>  kernel/bpf/hashtab.c                          |   1 +
>  kernel/bpf/map_in_map.c                       |   5 +-
>  kernel/bpf/syscall.c                          | 131 +++++++++++--
>  kernel/bpf/verifier.c                         |  21 +++
>  .../testing/selftests/bpf/bpf_experimental.h  |  21 +++
>  10 files changed, 378 insertions(+), 43 deletions(-)
>  create mode 100644 tools/testing/selftests/bpf/bpf_experimental.h
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index d4e6bf789c02..35c2e9caeb98 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -28,6 +28,9 @@
>  #include <linux/btf.h>
>  #include <linux/rcupdate_trace.h>
>  
> +/* Experimental BPF APIs header for type definitions */
> +#include "../../../tools/testing/selftests/bpf/bpf_experimental.h"
> +
>  struct bpf_verifier_env;
>  struct bpf_verifier_log;
>  struct perf_event;
> @@ -164,27 +167,40 @@ struct bpf_map_ops {
>  };
>  
>  enum {
> -	/* Support at most 8 pointers in a BPF map value */
> -	BPF_MAP_VALUE_OFF_MAX = 8,
> -	BPF_MAP_OFF_ARR_MAX   = BPF_MAP_VALUE_OFF_MAX +
> -				1 + /* for bpf_spin_lock */
> -				1,  /* for bpf_timer */
> -};
> -
> -enum bpf_kptr_type {
> +	/* Support at most 8 offsets in a table */
> +	BPF_MAP_VALUE_OFF_MAX		= 8,
> +	/* Support at most 8 pointer in a BPF map value */
> +	BPF_MAP_VALUE_KPTR_MAX		= BPF_MAP_VALUE_OFF_MAX,
> +	/* Support at most 8 list_head in a BPF map value */
> +	BPF_MAP_VALUE_LIST_HEAD_MAX	= BPF_MAP_VALUE_OFF_MAX,
> +	BPF_MAP_OFF_ARR_MAX		= BPF_MAP_VALUE_KPTR_MAX +
> +					  BPF_MAP_VALUE_LIST_HEAD_MAX +
> +					  1 + /* for bpf_spin_lock */
> +					  1,  /* for bpf_timer */
> +};
> +
> +enum bpf_off_type {
>  	BPF_KPTR_UNREF,
>  	BPF_KPTR_REF,
> +	BPF_LIST_HEAD,
>  };
>  
>  struct bpf_map_value_off_desc {
>  	u32 offset;
> -	enum bpf_kptr_type type;
> -	struct {
> -		struct btf *btf;
> -		struct module *module;
> -		btf_dtor_kfunc_t dtor;
> -		u32 btf_id;
> -	} kptr;
> +	enum bpf_off_type type;
> +	union {
> +		struct {
> +			struct btf *btf;
> +			struct module *module;
> +			btf_dtor_kfunc_t dtor;
> +			u32 btf_id;
> +		} kptr; /* for BPF_KPTR_{UNREF,REF} */
> +		struct {
> +			struct btf *btf;
> +			u32 value_type_id;
> +			u32 list_node_off;
> +		} list_head; /* for BPF_LIST_HEAD */
> +	};
>  };
>  
>  struct bpf_map_value_off {
> @@ -215,6 +231,7 @@ struct bpf_map {
>  	u32 map_flags;
>  	int spin_lock_off; /* >=0 valid offset, <0 error */
>  	struct bpf_map_value_off *kptr_off_tab;
> +	struct bpf_map_value_off *list_head_off_tab;

The union in bpf_map_value_off_desc prompts the question of why a
separate array is needed.
Sorting gets uglier.

>  	int timer_off; /* >=0 valid offset, <0 error */
>  	u32 id;
>  	int numa_node;
> @@ -265,6 +282,11 @@ static inline bool map_value_has_kptrs(const struct bpf_map *map)
>  	return !IS_ERR_OR_NULL(map->kptr_off_tab);
>  }
>  
> +static inline bool map_value_has_list_heads(const struct bpf_map *map)
> +{
> +	return !IS_ERR_OR_NULL(map->list_head_off_tab);
> +}
> +
>  static inline void check_and_init_map_value(struct bpf_map *map, void *dst)
>  {
>  	if (unlikely(map_value_has_spin_lock(map)))
> @@ -278,6 +300,13 @@ static inline void check_and_init_map_value(struct bpf_map *map, void *dst)
>  		for (i = 0; i < tab->nr_off; i++)
>  			*(u64 *)(dst + tab->off[i].offset) = 0;
>  	}
> +	if (unlikely(map_value_has_list_heads(map))) {
> +		struct bpf_map_value_off *tab = map->list_head_off_tab;
> +		int i;
> +
> +		for (i = 0; i < tab->nr_off; i++)
> +			memset(dst + tab->off[i].offset, 0, sizeof(struct list_head));
> +	}

Do we really need to distinguish map_value_has_kptrs vs map_value_has_list_heads?
Can they be generalized?
rb_root will be next.
Would that mean yet another array and another set of 'if's everywhere?
And then another special pseudo-map type that will cause a bunch of copy-paste again?
Maybe it's inevitable.
Trying to brainstorm.
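
If they were generalized, one shape it could take (purely a sketch of the
brainstorm, not what this patch does) is a single sorted table of typed
fields, so kptr/list_head/rb_root share one array, one sort, and one loop:

 enum btf_field_type {
	BPF_FIELD_KPTR_UNREF,
	BPF_FIELD_KPTR_REF,
	BPF_FIELD_LIST_HEAD,
	BPF_FIELD_RB_ROOT,	/* future */
 };

 struct btf_field {
	u32 offset;
	enum btf_field_type type;
	union {
		struct {
			struct btf *btf;
			struct module *module;
			btf_dtor_kfunc_t dtor;
			u32 btf_id;
		} kptr;
		struct {
			struct btf *btf;
			u32 value_type_id;
			u32 list_node_off;
		} list_head;
	};
 };

 struct btf_field_tab {
	u32 nr_fields;
	struct btf_field fields[];	/* kept sorted by offset */
 };

Then check_and_init_map_value() and friends become one loop with a switch on
the field type instead of per-kind tables and per-kind 'if's.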

>  }
>  
>  /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and
> @@ -1676,6 +1705,11 @@ struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map);
>  bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b);
>  void bpf_map_free_kptrs(struct bpf_map *map, void *map_value);
>  
> +struct bpf_map_value_off_desc *bpf_map_list_head_off_contains(struct bpf_map *map, u32 offset);
> +void bpf_map_free_list_head_off_tab(struct bpf_map *map);
> +struct bpf_map_value_off *bpf_map_copy_list_head_off_tab(const struct bpf_map *map);
> +bool bpf_map_equal_list_head_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b);
> +
>  struct bpf_map *bpf_map_get(u32 ufd);
>  struct bpf_map *bpf_map_get_with_uref(u32 ufd);
>  struct bpf_map *__bpf_map_get(struct fd f);
> diff --git a/include/linux/btf.h b/include/linux/btf.h
> index 8062f9da7c40..9b62b8b2117e 100644
> --- a/include/linux/btf.h
> +++ b/include/linux/btf.h
> @@ -156,6 +156,8 @@ int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t);
>  int btf_find_timer(const struct btf *btf, const struct btf_type *t);
>  struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf,
>  					  const struct btf_type *t);
> +struct bpf_map_value_off *btf_parse_list_heads(struct btf *btf,
> +					       const struct btf_type *t);
>  bool btf_type_is_void(const struct btf_type *t);
>  s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind);
>  const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,
> diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
> index 832b2659e96e..c7263ee3a35f 100644
> --- a/kernel/bpf/arraymap.c
> +++ b/kernel/bpf/arraymap.c
> @@ -423,6 +423,8 @@ static void array_map_free(struct bpf_map *map)
>  	struct bpf_array *array = container_of(map, struct bpf_array, map);
>  	int i;
>  
> +	bpf_map_free_list_head_off_tab(map);
> +
>  	if (map_value_has_kptrs(map)) {
>  		if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
>  			for (i = 0; i < array->map.max_entries; i++) {
> diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
> index 58cb0c179097..b5ccd76026b6 100644
> --- a/kernel/bpf/bpf_local_storage.c
> +++ b/kernel/bpf/bpf_local_storage.c
> @@ -616,6 +616,7 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
>  		rcu_barrier();
>  		bpf_map_free_kptr_off_tab(&smap->map);
>  	}
> +	bpf_map_free_list_head_off_tab(&smap->map);
>  	kvfree(smap->buckets);
>  	bpf_map_area_free(smap);
>  }
> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> index 6740c3ade8f1..0fb045be3837 100644
> --- a/kernel/bpf/btf.c
> +++ b/kernel/bpf/btf.c
> @@ -3185,6 +3185,7 @@ enum btf_field_type {
>  	BTF_FIELD_SPIN_LOCK,
>  	BTF_FIELD_TIMER,
>  	BTF_FIELD_KPTR,
> +	BTF_FIELD_LIST_HEAD,
>  };
>  
>  enum {
> @@ -3193,9 +3194,17 @@ enum {
>  };
>  
>  struct btf_field_info {
> -	u32 type_id;
>  	u32 off;
> -	enum bpf_kptr_type type;
> +	union {
> +		struct {
> +			u32 type_id;
> +			enum bpf_off_type type;
> +		} kptr;
> +		struct {
> +			u32 value_type_id;
> +			const char *node_name;
> +		} list_head;
> +	};
>  };
>  
>  static int btf_find_struct(const struct btf *btf, const struct btf_type *t,
> @@ -3212,7 +3221,7 @@ static int btf_find_struct(const struct btf *btf, const struct btf_type *t,
>  static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
>  			 u32 off, int sz, struct btf_field_info *info)
>  {
> -	enum bpf_kptr_type type;
> +	enum bpf_off_type type;
>  	u32 res_id;
>  
>  	/* Permit modifiers on the pointer itself */
> @@ -3241,9 +3250,71 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
>  	if (!__btf_type_is_struct(t))
>  		return -EINVAL;
>  
> -	info->type_id = res_id;
>  	info->off = off;
> -	info->type = type;
> +	info->kptr.type_id = res_id;
> +	info->kptr.type = type;
> +	return BTF_FIELD_FOUND;
> +}
> +
> +static const char *btf_find_decl_tag_value(const struct btf *btf,
> +					   const struct btf_type *pt,
> +					   int comp_idx, const char *tag_key)
> +{
> +	int i;
> +
> +	for (i = 1; i < btf_nr_types(btf); i++) {
> +		const struct btf_type *t = btf_type_by_id(btf, i);
> +		int len = strlen(tag_key);
> +
> +		if (!btf_type_is_decl_tag(t))
> +			continue;
> +		/* TODO: Instead of btf_type pt, it would be much better if we had BTF
> +		 * ID of the map value type. This would avoid btf_type_by_id call here.
> +		 */
> +		if (pt != btf_type_by_id(btf, t->type) ||
> +		    btf_type_decl_tag(t)->component_idx != comp_idx)
> +			continue;
> +		if (strncmp(__btf_name_by_offset(btf, t->name_off), tag_key, len))
> +			continue;
> +		return __btf_name_by_offset(btf, t->name_off) + len;
> +	}
> +	return NULL;
> +}
> +
> +static int btf_find_list_head(const struct btf *btf, const struct btf_type *pt,
> +			      int comp_idx, const struct btf_type *t,
> +			      u32 off, int sz, struct btf_field_info *info)
> +{
> +	const char *value_type;
> +	const char *list_node;
> +	s32 id;
> +
> +	if (!__btf_type_is_struct(t))
> +		return BTF_FIELD_IGNORE;
> +	if (t->size != sz)
> +		return BTF_FIELD_IGNORE;
> +	value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:");
> +	if (!value_type)
> +		return -EINVAL;
> +	if (strncmp(value_type, "struct:", sizeof("struct:") - 1))
> +		return -EINVAL;
> +	value_type += sizeof("struct:") - 1;
> +	list_node = strstr(value_type, ":");
> +	if (!list_node)
> +		return -EINVAL;
> +	value_type = kstrndup(value_type, list_node - value_type, GFP_ATOMIC);
> +	if (!value_type)
> +		return -ENOMEM;
> +	id = btf_find_by_name_kind(btf, value_type, BTF_KIND_STRUCT);
> +	kfree(value_type);
> +	if (id < 0)
> +		return id;
> +	list_node++;
> +	if (str_is_empty(list_node))
> +		return -EINVAL;
> +	info->off = off;
> +	info->list_head.value_type_id = id;
> +	info->list_head.node_name = list_node;
>  	return BTF_FIELD_FOUND;
>  }
>  
> @@ -3286,6 +3357,12 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t
>  			if (ret < 0)
>  				return ret;
>  			break;
> +		case BTF_FIELD_LIST_HEAD:
> +			ret = btf_find_list_head(btf, t, i, member_type, off, sz,
> +						 idx < info_cnt ? &info[idx] : &tmp);
> +			if (ret < 0)
> +				return ret;
> +			break;
>  		default:
>  			return -EFAULT;
>  		}
> @@ -3336,6 +3413,12 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
>  			if (ret < 0)
>  				return ret;
>  			break;
> +		case BTF_FIELD_LIST_HEAD:
> +			ret = btf_find_list_head(btf, var, -1, var_type, off, sz,
> +						 idx < info_cnt ? &info[idx] : &tmp);
> +			if (ret < 0)
> +				return ret;
> +			break;
>  		default:
>  			return -EFAULT;
>  		}
> @@ -3372,6 +3455,11 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t,
>  		sz = sizeof(u64);
>  		align = 8;
>  		break;
> +	case BTF_FIELD_LIST_HEAD:
> +		name = "bpf_list_head";
> +		sz = sizeof(struct bpf_list_head);
> +		align = __alignof__(struct bpf_list_head);
> +		break;
>  	default:
>  		return -EFAULT;
>  	}
> @@ -3440,7 +3528,7 @@ struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf,
>  		/* Find type in map BTF, and use it to look up the matching type
>  		 * in vmlinux or module BTFs, by name and kind.
>  		 */
> -		t = btf_type_by_id(btf, info_arr[i].type_id);
> +		t = btf_type_by_id(btf, info_arr[i].kptr.type_id);
>  		id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info),
>  				     &kernel_btf);
>  		if (id < 0) {
> @@ -3451,7 +3539,7 @@ struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf,
>  		/* Find and stash the function pointer for the destruction function that
>  		 * needs to be eventually invoked from the map free path.
>  		 */
> -		if (info_arr[i].type == BPF_KPTR_REF) {
> +		if (info_arr[i].kptr.type == BPF_KPTR_REF) {
>  			const struct btf_type *dtor_func;
>  			const char *dtor_func_name;
>  			unsigned long addr;
> @@ -3494,7 +3582,7 @@ struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf,
>  		}
>  
>  		tab->off[i].offset = info_arr[i].off;
> -		tab->off[i].type = info_arr[i].type;
> +		tab->off[i].type = info_arr[i].kptr.type;
>  		tab->off[i].kptr.btf_id = id;
>  		tab->off[i].kptr.btf = kernel_btf;
>  		tab->off[i].kptr.module = mod;
> @@ -3515,6 +3603,75 @@ struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf,
>  	return ERR_PTR(ret);
>  }
>  
> +struct bpf_map_value_off *btf_parse_list_heads(struct btf *btf, const struct btf_type *t)
> +{
> +	struct btf_field_info info_arr[BPF_MAP_VALUE_OFF_MAX];
> +	struct bpf_map_value_off *tab;
> +	int ret, i, nr_off;
> +
> +	ret = btf_find_field(btf, t, BTF_FIELD_LIST_HEAD, info_arr, ARRAY_SIZE(info_arr));

For example, we could search for both LIST_HEAD and KPTR here to know the size.
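If BTF_FIELD_* became bit flags, a single pass could collect both kinds into
a shared info array (a sketch of the suggestion, not code from this patch):

 ret = btf_find_field(btf, t, BTF_FIELD_KPTR | BTF_FIELD_LIST_HEAD,
		      info_arr, ARRAY_SIZE(info_arr));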

> +	if (ret < 0)
> +		return ERR_PTR(ret);
> +	if (!ret)
> +		return NULL;
> +
> +	nr_off = ret;
> +	tab = kzalloc(offsetof(struct bpf_map_value_off, off[nr_off]), GFP_KERNEL | __GFP_NOWARN);
> +	if (!tab)
> +		return ERR_PTR(-ENOMEM);
> +
> +	for (i = 0; i < nr_off; i++) {
> +		const struct btf_type *t, *n = NULL;
> +		const struct btf_member *member;
> +		u32 offset;
> +		int j;

and here we can process both, since field_info carries the type.

> +
> +		t = btf_type_by_id(btf, info_arr[i].list_head.value_type_id);
> +		/* We've already checked that value_type_id is a struct type. We
> +		 * just need to figure out the offset of the list_node, and
> +		 * verify its type.
> +		 */
> +		ret = -EINVAL;
> +		for_each_member(j, t, member) {
> +			if (strcmp(info_arr[i].list_head.node_name, __btf_name_by_offset(btf, member->name_off)))
> +				continue;
> +			/* Invalid BTF, two members with same name */
> +			if (n) {
> +				/* We also need to btf_put for the current iteration! */
> +				i++;
> +				goto end;
> +			}
> +			n = btf_type_by_id(btf, member->type);
> +			if (!__btf_type_is_struct(n))
> +				goto end;
> +			if (strcmp("bpf_list_node", __btf_name_by_offset(btf, n->name_off)))
> +				goto end;
> +			offset = __btf_member_bit_offset(n, member);
> +			if (offset % 8)
> +				goto end;
> +			offset /= 8;
> +			if (offset % __alignof__(struct bpf_list_node))
> +				goto end;
> +
> +			tab->off[i].offset = info_arr[i].off;
> +			tab->off[i].type = BPF_LIST_HEAD;
> +			btf_get(btf);

Do we need to btf_get? The btf should already be pinned and won't be
released until the prog ends.

> +			tab->off[i].list_head.btf = btf;
> +			tab->off[i].list_head.value_type_id = info_arr[i].list_head.value_type_id;
> +			tab->off[i].list_head.list_node_off = offset;
> +		}
> +		if (!n)
> +			goto end;
> +	}
> +	tab->nr_off = nr_off;
> +	return tab;
> +end:
> +	while (i--)
> +		btf_put(tab->off[i].list_head.btf);
> +	kfree(tab);
> +	return ERR_PTR(ret);
> +}
> +
>  static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
>  			      u32 type_id, void *data, u8 bits_offset,
>  			      struct btf_show *show)
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index bb3f8a63c221..270e0ecf4ba3 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -1518,6 +1518,7 @@ static void htab_map_free(struct bpf_map *map)
>  		prealloc_destroy(htab);
>  	}
>  
> +	bpf_map_free_list_head_off_tab(map);
>  	bpf_map_free_kptr_off_tab(map);
>  	free_percpu(htab->extra_elems);
>  	bpf_map_area_free(htab->buckets);
> diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
> index 135205d0d560..ced2559129ab 100644
> --- a/kernel/bpf/map_in_map.c
> +++ b/kernel/bpf/map_in_map.c
> @@ -53,6 +53,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
>  	inner_map_meta->spin_lock_off = inner_map->spin_lock_off;
>  	inner_map_meta->timer_off = inner_map->timer_off;
>  	inner_map_meta->kptr_off_tab = bpf_map_copy_kptr_off_tab(inner_map);
> +	inner_map_meta->list_head_off_tab = bpf_map_copy_list_head_off_tab(inner_map);
>  	if (inner_map->btf) {
>  		btf_get(inner_map->btf);
>  		inner_map_meta->btf = inner_map->btf;
> @@ -72,6 +73,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
>  
>  void bpf_map_meta_free(struct bpf_map *map_meta)
>  {
> +	bpf_map_free_list_head_off_tab(map_meta);
>  	bpf_map_free_kptr_off_tab(map_meta);
>  	btf_put(map_meta->btf);
>  	kfree(map_meta);
> @@ -86,7 +88,8 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
>  		meta0->value_size == meta1->value_size &&
>  		meta0->timer_off == meta1->timer_off &&
>  		meta0->map_flags == meta1->map_flags &&
> -		bpf_map_equal_kptr_off_tab(meta0, meta1);
> +		bpf_map_equal_kptr_off_tab(meta0, meta1) &&
> +		bpf_map_equal_list_head_off_tab(meta0, meta1);
>  }
>  
>  void *bpf_map_fd_get_ptr(struct bpf_map *map,
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 0311acca19f6..e1749e0d2143 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -495,7 +495,7 @@ static void bpf_map_release_memcg(struct bpf_map *map)
>  }
>  #endif
>  
> -static int bpf_map_kptr_off_cmp(const void *a, const void *b)
> +static int bpf_map_off_cmp(const void *a, const void *b)
>  {
>  	const struct bpf_map_value_off_desc *off_desc1 = a, *off_desc2 = b;
>  
> @@ -506,18 +506,22 @@ static int bpf_map_kptr_off_cmp(const void *a, const void *b)
>  	return 0;
>  }
>  
> -struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset)
> +static struct bpf_map_value_off_desc *
> +__bpf_map_off_contains(struct bpf_map_value_off *off_tab, u32 offset)
>  {
>  	/* Since members are iterated in btf_find_field in increasing order,
> -	 * offsets appended to kptr_off_tab are in increasing order, so we can
> +	 * offsets appended to an off_tab are in increasing order, so we can
>  	 * do bsearch to find exact match.
>  	 */
> -	struct bpf_map_value_off *tab;
> +	return bsearch(&offset, off_tab->off, off_tab->nr_off, sizeof(off_tab->off[0]),
> +		       bpf_map_off_cmp);
> +}
>  
> +struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset)
> +{
>  	if (!map_value_has_kptrs(map))
>  		return NULL;
> -	tab = map->kptr_off_tab;
> -	return bsearch(&offset, tab->off, tab->nr_off, sizeof(tab->off[0]), bpf_map_kptr_off_cmp);
> +	return __bpf_map_off_contains(map->kptr_off_tab, offset);
>  }
>  
>  void bpf_map_free_kptr_off_tab(struct bpf_map *map)
> @@ -563,15 +567,15 @@ struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map)
>  	return new_tab;
>  }
>  
> -bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b)
> +static bool __bpf_map_equal_off_tab(const struct bpf_map_value_off *tab_a,
> +				    const struct bpf_map_value_off *tab_b,
> +				    bool has_a, bool has_b)
>  {
> -	struct bpf_map_value_off *tab_a = map_a->kptr_off_tab, *tab_b = map_b->kptr_off_tab;
> -	bool a_has_kptr = map_value_has_kptrs(map_a), b_has_kptr = map_value_has_kptrs(map_b);
>  	int size;
>  
> -	if (!a_has_kptr && !b_has_kptr)
> +	if (!has_a && !has_b)
>  		return true;
> -	if (a_has_kptr != b_has_kptr)
> +	if (has_a != has_b)
>  		return false;
>  	if (tab_a->nr_off != tab_b->nr_off)
>  		return false;
> @@ -579,6 +583,13 @@ bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_ma
>  	return !memcmp(tab_a, tab_b, size);
>  }
>  
> +bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b)
> +{
> +	return __bpf_map_equal_off_tab(map_a->kptr_off_tab, map_b->kptr_off_tab,
> +				       map_value_has_kptrs(map_a),
> +				       map_value_has_kptrs(map_b));
> +}
> +
>  /* Caller must ensure map_value_has_kptrs is true. Note that this function can
>   * be called on a map value while the map_value is visible to BPF programs, as
>   * it ensures the correct synchronization, and we already enforce the same using
> @@ -606,6 +617,50 @@ void bpf_map_free_kptrs(struct bpf_map *map, void *map_value)
>  	}
>  }
>  
> +struct bpf_map_value_off_desc *bpf_map_list_head_off_contains(struct bpf_map *map, u32 offset)
> +{
> +	if (!map_value_has_list_heads(map))
> +		return NULL;
> +	return __bpf_map_off_contains(map->list_head_off_tab, offset);
> +}
> +
> +void bpf_map_free_list_head_off_tab(struct bpf_map *map)
> +{
> +	struct bpf_map_value_off *tab = map->list_head_off_tab;
> +	int i;
> +
> +	if (!map_value_has_list_heads(map))
> +		return;
> +	for (i = 0; i < tab->nr_off; i++)
> +		btf_put(tab->off[i].list_head.btf);
> +	kfree(tab);
> +	map->list_head_off_tab = NULL;
> +}
> +
> +struct bpf_map_value_off *bpf_map_copy_list_head_off_tab(const struct bpf_map *map)
> +{
> +	struct bpf_map_value_off *tab = map->list_head_off_tab, *new_tab;
> +	int size, i;
> +
> +	if (!map_value_has_list_heads(map))
> +		return ERR_PTR(-ENOENT);
> +	size = offsetof(struct bpf_map_value_off, off[tab->nr_off]);
> +	new_tab = kmemdup(tab, size, GFP_KERNEL | __GFP_NOWARN);
> +	if (!new_tab)
> +		return ERR_PTR(-ENOMEM);
> +	/* Do a deep copy of the list_head_off_tab */
> +	for (i = 0; i < tab->nr_off; i++)
> +		btf_get(tab->off[i].list_head.btf);
> +	return new_tab;
> +}
> +
> +bool bpf_map_equal_list_head_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b)
> +{
> +	return __bpf_map_equal_off_tab(map_a->list_head_off_tab, map_b->list_head_off_tab,
> +				       map_value_has_list_heads(map_a),
> +				       map_value_has_list_heads(map_b));
> +}
> +
>  /* called from workqueue */
>  static void bpf_map_free_deferred(struct work_struct *work)
>  {
> @@ -776,7 +831,8 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
>  	int err;
>  
>  	if (!map->ops->map_mmap || map_value_has_spin_lock(map) ||
> -	    map_value_has_timer(map) || map_value_has_kptrs(map))
> +	    map_value_has_timer(map) || map_value_has_kptrs(map) ||
> +	    map_value_has_list_heads(map))
>  		return -ENOTSUPP;
>  
>  	if (!(vma->vm_flags & VM_SHARED))
> @@ -931,13 +987,14 @@ static void map_off_arr_swap(void *_a, void *_b, int size, const void *priv)
>  
>  static int bpf_map_alloc_off_arr(struct bpf_map *map)
>  {
> +	bool has_list_heads = map_value_has_list_heads(map);
>  	bool has_spin_lock = map_value_has_spin_lock(map);
>  	bool has_timer = map_value_has_timer(map);
>  	bool has_kptrs = map_value_has_kptrs(map);
>  	struct bpf_map_off_arr *off_arr;
>  	u32 i;
>  
> -	if (!has_spin_lock && !has_timer && !has_kptrs) {
> +	if (!has_spin_lock && !has_timer && !has_kptrs && !has_list_heads) {
>  		map->off_arr = NULL;
>  		return 0;
>  	}
> @@ -973,6 +1030,17 @@ static int bpf_map_alloc_off_arr(struct bpf_map *map)
>  		}
>  		off_arr->cnt += tab->nr_off;
>  	}
> +	if (has_list_heads) {
> +		struct bpf_map_value_off *tab = map->list_head_off_tab;
> +		u32 *off = &off_arr->field_off[off_arr->cnt];
> +		u8 *sz = &off_arr->field_sz[off_arr->cnt];
> +
> +		for (i = 0; i < tab->nr_off; i++) {
> +			*off++ = tab->off[i].offset;
> +			*sz++ = sizeof(struct bpf_list_head);
> +		}
> +		off_arr->cnt += tab->nr_off;
> +	}
>  
>  	if (off_arr->cnt == 1)
>  		return 0;
> @@ -1038,11 +1106,11 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
>  	if (map_value_has_kptrs(map)) {
>  		if (!bpf_capable()) {
>  			ret = -EPERM;
> -			goto free_map_tab;
> +			goto free_map_kptr_tab;
>  		}
>  		if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) {
>  			ret = -EACCES;
> -			goto free_map_tab;
> +			goto free_map_kptr_tab;
>  		}
>  		if (map->map_type != BPF_MAP_TYPE_HASH &&
>  		    map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
> @@ -1054,18 +1122,42 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
>  		    map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
>  		    map->map_type != BPF_MAP_TYPE_TASK_STORAGE) {
>  			ret = -EOPNOTSUPP;
> -			goto free_map_tab;
> +			goto free_map_kptr_tab;
> +		}
> +	}
> +
> +	/* We need to take ref on the BTF, so pass it as non-const */
> +	map->list_head_off_tab = btf_parse_list_heads((struct btf *)btf, value_type);
> +	if (map_value_has_list_heads(map)) {
> +		if (!bpf_capable()) {
> +			ret = -EACCES;
> +			goto free_map_list_head_tab;
> +		}
> +		if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) {
> +			ret = -EACCES;
> +			goto free_map_list_head_tab;
> +		}
> +		if (map->map_type != BPF_MAP_TYPE_HASH &&
> +		    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
> +		    map->map_type != BPF_MAP_TYPE_ARRAY &&
> +		    map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
> +		    map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
> +		    map->map_type != BPF_MAP_TYPE_TASK_STORAGE) {
> +			ret = -EOPNOTSUPP;
> +			goto free_map_list_head_tab;
>  		}
>  	}
>  
>  	if (map->ops->map_check_btf) {
>  		ret = map->ops->map_check_btf(map, btf, key_type, value_type);
>  		if (ret < 0)
> -			goto free_map_tab;
> +			goto free_map_list_head_tab;
>  	}
>  
>  	return ret;
> -free_map_tab:
> +free_map_list_head_tab:
> +	bpf_map_free_list_head_off_tab(map);
> +free_map_kptr_tab:
>  	bpf_map_free_kptr_off_tab(map);
>  	return ret;
>  }
> @@ -1889,7 +1981,8 @@ static int map_freeze(const union bpf_attr *attr)
>  		return PTR_ERR(map);
>  
>  	if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS ||
> -	    map_value_has_timer(map) || map_value_has_kptrs(map)) {
> +	    map_value_has_timer(map) || map_value_has_kptrs(map) ||
> +	    map_value_has_list_heads(map)) {
>  		fdput(f);
>  		return -ENOTSUPP;
>  	}
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 571790ac58d4..ab91e5ca7e41 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -3879,6 +3879,20 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
>  			}
>  		}
>  	}
> +	if (map_value_has_list_heads(map)) {
> +		struct bpf_map_value_off *tab = map->list_head_off_tab;
> +		int i;
> +
> +		for (i = 0; i < tab->nr_off; i++) {
> +			u32 p = tab->off[i].offset;
> +
> +			if (reg->smin_value + off < p + sizeof(struct bpf_list_head) &&
> +			    p < reg->umax_value + off + size) {
> +				verbose(env, "bpf_list_head cannot be accessed directly by load/store\n");
> +				return -EACCES;
> +			}
> +		}
> +	}
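
This reads as the usual conservative interval-overlap check; spelled out
(same condition, just with the bounds named):

 s64 start = reg->smin_value + off;		/* lowest possible access start */
 u64 end = reg->umax_value + off + size;	/* highest possible access end */
 /* access may overlap [p, p + sizeof(struct bpf_list_head)) iff: */
 bool overlap = start < p + sizeof(struct bpf_list_head) && p < end;
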
>  	return err;
>  }
>  
> @@ -13165,6 +13179,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
>  		}
>  	}
>  
> +	if (map_value_has_list_heads(map)) {
> +		if (is_tracing_prog_type(prog_type)) {
> +			verbose(env, "tracing progs cannot use bpf_list_head yet\n");
> +			return -EINVAL;
> +		}
> +	}
> +
>  	if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
>  	    !bpf_offload_prog_map_match(prog, map)) {
>  		verbose(env, "offload device mismatch between prog and map\n");
> diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
> new file mode 100644
> index 000000000000..ea1b3b1839d1
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/bpf_experimental.h
> @@ -0,0 +1,21 @@
> +#ifndef __KERNEL__
> +
> +#include <bpf/bpf_tracing.h>
> +#include <bpf/bpf_helpers.h>
> +
> +#else
> +
> +struct bpf_list_head {
> +	__u64 __a;
> +	__u64 __b;
> +} __attribute__((aligned(8)));
> +
> +struct bpf_list_node {
> +	__u64 __a;
> +	__u64 __b;
> +} __attribute__((aligned(8)));
> +
> +#endif
> +
> +#ifndef __KERNEL__
> +#endif
> -- 
> 2.34.1
> 


