Re: [PATCH bpf-next v2 02/20] bpf: Parse bpf_dynptr in map key

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Sat, Jan 25, 2025 at 2:59 AM Hou Tao <houtao@xxxxxxxxxxxxxxx> wrote:
>
> From: Hou Tao <houtao1@xxxxxxxxxx>
>
> To support variable-length key or strings in map key, use bpf_dynptr to
> represent these variable-length objects and save these bpf_dynptr
> fields in the map key. As shown in the examples below, a map key with an
> integer and a string is defined:
>
>         struct pid_name {
>                 int pid;
>                 struct bpf_dynptr name;
>         };
>
> The bpf_dynptr in the map key could also be contained indirectly in a
> struct as shown below:
>
>         struct pid_name_time {
>                 struct pid_name process;
>                 unsigned long long time;
>         };
>
> If the whole map key is a bpf_dynptr, the map could be defined as a
> struct or directly using bpf_dynptr as the map key:
>
>         struct map_key {
>                 struct bpf_dynptr name;
>         };
>
> The bpf program could use bpf_dynptr_init() to initialize the dynptr
> part in the map key, and the userspace application will use
> bpf_dynptr_user_init() or similar API to initialize the dynptr. Just
> like kptrs in map value, the bpf_dynptr field in the map key could also
> be defined in a nested struct which is contained in the map key struct.
>
> The patch updates map_create() accordingly to parse these bpf_dynptr
> fields in map key, just like it does for other special fields in map
> value. To enable bpf_dynptr support in map key, the map_type should be
> BPF_MAP_TYPE_HASH. For now, the max number of bpf_dynptr in a map key
> is limited as 1 and the limitation can be relaxed later.
>
> Signed-off-by: Hou Tao <houtao1@xxxxxxxxxx>
> ---
>  include/linux/bpf.h     | 14 ++++++++++++++
>  kernel/bpf/btf.c        |  4 ++++
>  kernel/bpf/map_in_map.c | 21 +++++++++++++++++----
>  kernel/bpf/syscall.c    | 41 +++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 76 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 0ee14ae30100f..ed58d5dd6b34b 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -271,7 +271,14 @@ struct bpf_map {
>         u64 map_extra; /* any per-map-type extra fields */
>         u32 map_flags;
>         u32 id;
> +       /* BTF record for special fields in map value. bpf_dynptr is disallowed
> +        * at present.
> +        */

Maybe drop 'at present' to fit on one line.
I would also capitalize Value to make the difference more obvious...

>         struct btf_record *record;
> +       /* BTF record for special fields in map key. Only bpf_dynptr is allowed
> +        * at present.

...with this line. Key.

> +        */
> +       struct btf_record *key_record;
>         int numa_node;
>         u32 btf_key_type_id;
>         u32 btf_value_type_id;
> @@ -336,6 +343,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type)
>                 return "bpf_rb_node";
>         case BPF_REFCOUNT:
>                 return "bpf_refcount";
> +       case BPF_DYNPTR:
> +               return "bpf_dynptr";
>         default:
>                 WARN_ON_ONCE(1);
>                 return "unknown";
> @@ -366,6 +375,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type)
>                 return sizeof(struct bpf_rb_node);
>         case BPF_REFCOUNT:
>                 return sizeof(struct bpf_refcount);
> +       case BPF_DYNPTR:
> +               return sizeof(struct bpf_dynptr);
>         default:
>                 WARN_ON_ONCE(1);
>                 return 0;
> @@ -396,6 +407,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type)
>                 return __alignof__(struct bpf_rb_node);
>         case BPF_REFCOUNT:
>                 return __alignof__(struct bpf_refcount);
> +       case BPF_DYNPTR:
> +               return __alignof__(struct bpf_dynptr);
>         default:
>                 WARN_ON_ONCE(1);
>                 return 0;
> @@ -426,6 +439,7 @@ static inline void bpf_obj_init_field(const struct btf_field *field, void *addr)
>         case BPF_KPTR_REF:
>         case BPF_KPTR_PERCPU:
>         case BPF_UPTR:
> +       case BPF_DYNPTR:
>                 break;
>         default:
>                 WARN_ON_ONCE(1);
> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> index b316631b614fa..0ce5180e024a3 100644
> --- a/kernel/bpf/btf.c
> +++ b/kernel/bpf/btf.c
> @@ -3500,6 +3500,7 @@ static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_
>         field_mask_test_name(BPF_RB_ROOT,   "bpf_rb_root");
>         field_mask_test_name(BPF_RB_NODE,   "bpf_rb_node");
>         field_mask_test_name(BPF_REFCOUNT,  "bpf_refcount");
> +       field_mask_test_name(BPF_DYNPTR,    "bpf_dynptr");
>
>         /* Only return BPF_KPTR when all other types with matchable names fail */
>         if (field_mask & (BPF_KPTR | BPF_UPTR) && !__btf_type_is_struct(var_type)) {
> @@ -3538,6 +3539,7 @@ static int btf_repeat_fields(struct btf_field_info *info, int info_cnt,
>                 case BPF_UPTR:
>                 case BPF_LIST_HEAD:
>                 case BPF_RB_ROOT:
> +               case BPF_DYNPTR:
>                         break;
>                 default:
>                         return -EINVAL;
> @@ -3660,6 +3662,7 @@ static int btf_find_field_one(const struct btf *btf,
>         case BPF_LIST_NODE:
>         case BPF_RB_NODE:
>         case BPF_REFCOUNT:
> +       case BPF_DYNPTR:
>                 ret = btf_find_struct(btf, var_type, off, sz, field_type,
>                                       info_cnt ? &info[0] : &tmp);
>                 if (ret < 0)
> @@ -4017,6 +4020,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
>                         break;
>                 case BPF_LIST_NODE:
>                 case BPF_RB_NODE:
> +               case BPF_DYNPTR:
>                         break;
>                 default:
>                         ret = -EFAULT;
> diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
> index 645bd30bc9a9d..564ebcc857564 100644
> --- a/kernel/bpf/map_in_map.c
> +++ b/kernel/bpf/map_in_map.c
> @@ -12,6 +12,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
>         struct bpf_map *inner_map, *inner_map_meta;
>         u32 inner_map_meta_size;
>         CLASS(fd, f)(inner_map_ufd);
> +       int ret;
>
>         inner_map = __bpf_map_get(f);
>         if (IS_ERR(inner_map))
> @@ -45,10 +46,15 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
>                  * invalid/empty/valid, but ERR_PTR in case of errors. During
>                  * equality NULL or IS_ERR is equivalent.
>                  */
> -               struct bpf_map *ret = ERR_CAST(inner_map_meta->record);
> -               kfree(inner_map_meta);
> -               return ret;
> +               ret = PTR_ERR(inner_map_meta->record);
> +               goto free_meta;
>         }
> +       inner_map_meta->key_record = btf_record_dup(inner_map->key_record);
> +       if (IS_ERR(inner_map_meta->key_record)) {
> +               ret = PTR_ERR(inner_map_meta->key_record);
> +               goto free_record;
> +       }
> +
>         /* Note: We must use the same BTF, as we also used btf_record_dup above
>          * which relies on BTF being same for both maps, as some members like
>          * record->fields.list_head have pointers like value_rec pointing into
> @@ -71,6 +77,12 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
>                 inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1;
>         }
>         return inner_map_meta;
> +
> +free_record:
> +       btf_record_free(inner_map_meta->record);
> +free_meta:
> +       kfree(inner_map_meta);
> +       return ERR_PTR(ret);
>  }
>
>  void bpf_map_meta_free(struct bpf_map *map_meta)
> @@ -88,7 +100,8 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
>                 meta0->key_size == meta1->key_size &&
>                 meta0->value_size == meta1->value_size &&
>                 meta0->map_flags == meta1->map_flags &&
> -               btf_record_equal(meta0->record, meta1->record);
> +               btf_record_equal(meta0->record, meta1->record) &&
> +               btf_record_equal(meta0->key_record, meta1->key_record);
>  }
>
>  void *bpf_map_fd_get_ptr(struct bpf_map *map,
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 0daf098e32074..6e14208cca813 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -651,6 +651,7 @@ void btf_record_free(struct btf_record *rec)
>                 case BPF_TIMER:
>                 case BPF_REFCOUNT:
>                 case BPF_WORKQUEUE:
> +               case BPF_DYNPTR:
>                         /* Nothing to release */
>                         break;
>                 default:
> @@ -664,7 +665,9 @@ void btf_record_free(struct btf_record *rec)
>  void bpf_map_free_record(struct bpf_map *map)
>  {
>         btf_record_free(map->record);
> +       btf_record_free(map->key_record);
>         map->record = NULL;
> +       map->key_record = NULL;
>  }
>
>  struct btf_record *btf_record_dup(const struct btf_record *rec)
> @@ -703,6 +706,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
>                 case BPF_TIMER:
>                 case BPF_REFCOUNT:
>                 case BPF_WORKQUEUE:
> +               case BPF_DYNPTR:
>                         /* Nothing to acquire */
>                         break;
>                 default:
> @@ -821,6 +825,8 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
>                 case BPF_RB_NODE:
>                 case BPF_REFCOUNT:
>                         break;
> +               case BPF_DYNPTR:
> +                       break;
>                 default:
>                         WARN_ON_ONCE(1);
>                         continue;
> @@ -830,6 +836,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
>
>  static void bpf_map_free(struct bpf_map *map)
>  {
> +       struct btf_record *key_rec = map->key_record;
>         struct btf_record *rec = map->record;
>         struct btf *btf = map->btf;
>
> @@ -850,6 +857,7 @@ static void bpf_map_free(struct bpf_map *map)
>          * eventually calls bpf_map_free_meta, since inner_map_meta is only a
>          * template bpf_map struct used during verification.
>          */
> +       btf_record_free(key_rec);
>         btf_record_free(rec);
>         /* Delay freeing of btf for maps, as map_free callback may need
>          * struct_meta info which will be freed with btf_put().
> @@ -1180,6 +1188,8 @@ int map_check_no_btf(const struct bpf_map *map,
>         return -ENOTSUPP;
>  }
>
> +#define MAX_DYNPTR_CNT_IN_MAP_KEY 1

I remember we discussed to allow 2 dynptr-s in a key.
And in patch 11 you already do:
+       record = map->key_record;
+       for (i = 0; i < record->cnt; i++) {

so the support for multiple dynptr-s is almost there?

> +
>  static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
>                          const struct btf *btf, u32 btf_key_id, u32 btf_value_id)
>  {
> @@ -1202,6 +1212,37 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
>         if (!value_type || value_size != map->value_size)
>                 return -EINVAL;
>
> +       /* Key BTF type can't be data section */
> +       if (btf_type_is_dynptr(btf, key_type))
> +               map->key_record = btf_new_bpf_dynptr_record();
> +       else if (__btf_type_is_struct(key_type))
> +               map->key_record = btf_parse_fields(btf, key_type, BPF_DYNPTR, map->key_size);
> +       else
> +               map->key_record = NULL;
> +       if (!IS_ERR_OR_NULL(map->key_record)) {
> +               if (map->key_record->cnt > MAX_DYNPTR_CNT_IN_MAP_KEY) {
> +                       ret = -E2BIG;
> +                       goto free_map_tab;
> +               }
> +               if (map->map_type != BPF_MAP_TYPE_HASH) {
> +                       ret = -EOPNOTSUPP;
> +                       goto free_map_tab;
> +               }
> +               if (!bpf_token_capable(token, CAP_BPF)) {
> +                       ret = -EPERM;
> +                       goto free_map_tab;
> +               }
> +               /* Disallow key with dynptr for special map */
> +               if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) {
> +                       ret = -EACCES;
> +                       goto free_map_tab;
> +               }
> +       } else if (IS_ERR(map->key_record)) {
> +               /* Return an error early even the bpf program doesn't use it */
> +               ret = PTR_ERR(map->key_record);
> +               goto free_map_tab;
> +       }
> +
>         map->record = btf_parse_fields(btf, value_type,
>                                        BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
>                                        BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR,
> --
> 2.29.2
>





[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux