On Sun, Mar 20, 2022 at 8:55 AM Kumar Kartikeya Dwivedi <memxor@xxxxxxxxx> wrote:
>
> This commit introduces a new pointer type 'kptr' which can be embedded
> in a map value and holds a PTR_TO_BTF_ID stored by a BPF program during
> its invocation. When storing to such a kptr, the BPF program's
> PTR_TO_BTF_ID register must have the same type as in the map value's
> BTF, and loading a kptr marks the destination register as PTR_TO_BTF_ID
> with the correct kernel BTF and BTF ID.
>
> Such kptrs are unreferenced, i.e. by the time another invocation of the
> BPF program loads this pointer, the object which the pointer points to
> may no longer exist. Since PTR_TO_BTF_ID loads (using BPF_LDX) are
> patched to PROBE_MEM loads by the verifier, it is safe to allow the
> user to still access such an invalid pointer, but passing such pointers
> into BPF helpers and kfuncs should not be permitted. A future patch in
> this series will close this gap.
>
> The flexibility offered by allowing programs to dereference such
> invalid pointers while being safe at runtime frees the verifier from
> doing complex lifetime tracking. As long as the user ensures that the
> object remains valid, the data it reads from the kernel object will be
> valid.
>
> The user indicates that a certain pointer must be treated as a kptr
> capable of accepting stores of PTR_TO_BTF_ID of a certain type, by
> using a BTF type tag 'kptr' on the pointed to type of the pointer.
> This information is then recorded in the object BTF, which is passed
> into the kernel by way of the map's BTF information. The name and kind
> from the map value BTF are used to look up the in-kernel type, and the
> actual BTF and BTF ID are recorded in the map struct in a new
> kptr_off_tab member. For now, only storing pointers to structs is
> permitted.
>
> An example of this specification is shown below:
>
>   #define __kptr __attribute__((btf_type_tag("kptr")))
>
>   struct map_value {
>           ...
>           struct task_struct __kptr *task;
>           ...
>   };
>
> Then, in a BPF program, the user may store a PTR_TO_BTF_ID with the
> type task_struct into the map, and load it later.
>
> Note that the destination register is marked PTR_TO_BTF_ID_OR_NULL:
> since the verifier cannot statically know whether the stored value is
> NULL, it must treat all loads at that map value offset as potentially
> loading a NULL pointer.
>
> Only BPF_LDX, BPF_STX, and BPF_ST with insn->imm = 0 (to denote NULL)
> are allowed to access such a pointer. On BPF_LDX, the destination
> register is updated to be a PTR_TO_BTF_ID, and on BPF_STX, it is
> checked whether the source register type is a PTR_TO_BTF_ID with the
> same BTF type as specified in the map BTF. The access size must always
> be BPF_DW.
>
> For the map in map support, the kptr_off_tab for the outer map is
> copied from the inner map's kptr_off_tab. A deep copy was chosen
> instead of introducing a refcount on kptr_off_tab, because the copy
> only needs to be done when parameterizing using inner_map_fd in the
> map in map case, and would be unnecessary for all other users.
>
> It is not permitted to use the MAP_FREEZE command or mmap() on a BPF
> map having kptrs, similar to the bpf_timer case.
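Btw, for anyone following along, my understanding of the intended usage
from the BPF side is roughly the below (untested sketch; the map
definition, program type, and names are my own, only struct map_value
comes from the example above, and headers/includes are omitted):

  struct {
          __uint(type, BPF_MAP_TYPE_HASH);
          __uint(max_entries, 1);
          __type(key, int);
          __type(value, struct map_value);
  } kptr_map SEC(".maps");

  SEC("tp_btf/task_newtask")
  int BPF_PROG(record_task, struct task_struct *task, u64 clone_flags)
  {
          struct map_value *v;
          int key = 0;

          v = bpf_map_lookup_elem(&kptr_map, &key);
          if (!v)
                  return 0;
          /* BPF_STX: src reg is PTR_TO_BTF_ID of type task_struct,
           * matching the map value BTF, so the store is allowed.
           */
          v->task = task;
          /* BPF_LDX: dst reg becomes PTR_TO_BTF_ID_OR_NULL, so a
           * NULL check is needed before dereferencing. The deref
           * itself is a PROBE_MEM-patched load, safe even if the
           * task is gone by now.
           */
          task = v->task;
          if (task)
                  bpf_printk("stored task pid=%d", task->pid);
          return 0;
  }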
>
> Signed-off-by: Kumar Kartikeya Dwivedi <memxor@xxxxxxxxx>
> ---
>  include/linux/bpf.h     |  29 +++++++-
>  include/linux/btf.h     |   2 +
>  kernel/bpf/btf.c        | 161 ++++++++++++++++++++++++++++++++++------
>  kernel/bpf/map_in_map.c |   5 +-
>  kernel/bpf/syscall.c    | 112 +++++++++++++++++++++++++++-
>  kernel/bpf/verifier.c   | 120 ++++++++++++++++++++++++++++++
>  6 files changed, 401 insertions(+), 28 deletions(-)
>

[...]

> +static int btf_find_field_kptr(const struct btf *btf, const struct btf_type *t,
> +                              u32 off, int sz, struct btf_field_info *info)
> +{
> +        /* For PTR, sz is always == 8 */
> +        if (!btf_type_is_ptr(t))
> +                return BTF_FIELD_IGNORE;
> +        t = btf_type_by_id(btf, t->type);
> +
> +        if (!btf_type_is_type_tag(t))
> +                return BTF_FIELD_IGNORE;
> +        /* Reject extra tags */
> +        if (btf_type_is_type_tag(btf_type_by_id(btf, t->type)))
> +                return -EINVAL;

Can we have tag -> const -> tag -> volatile -> tag chains in BTF?
Wouldn't you wrongly assume there are no more tags with just this one
check?

> +        if (strcmp("kptr", __btf_name_by_offset(btf, t->name_off)))
> +                return -EINVAL;
> +
> +        /* Get the base type */
> +        if (btf_type_is_modifier(t))
> +                t = btf_type_skip_modifiers(btf, t->type, NULL);
> +        /* Only pointer to struct is allowed */
> +        if (!__btf_type_is_struct(t))
> +                return -EINVAL;
> +
> +        info->type = t;
> +        info->off = off;
> +        return BTF_FIELD_FOUND;
> }
>
> static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t,
>                                  const char *name, int sz, int align, int field_type,
> -                                struct btf_field_info *info)
> +                                struct btf_field_info *info, int info_cnt)
> {
>         const struct btf_member *member;
> +        int ret, idx = 0;
>         u32 i, off;
> -        int ret;
>
>         for_each_member(i, t, member) {
>                 const struct btf_type *member_type = btf_type_by_id(btf,
> @@ -3210,24 +3242,35 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t
>                 switch (field_type) {
>                 case BTF_FIELD_SPIN_LOCK:
>                 case BTF_FIELD_TIMER:
> -                        ret = btf_find_field_struct(btf, member_type, off, sz, info);
> +                        ret = btf_find_field_struct(btf, member_type, off, sz, &info[idx]);
> +                        if (ret < 0)
> +                                return ret;
> +                        break;
> +                case BTF_FIELD_KPTR:
> +                        ret = btf_find_field_kptr(btf, member_type, off, sz, &info[idx]);
>                         if (ret < 0)
>                                 return ret;
>                         break;
>                 default:
>                         return -EFAULT;
>                 }
> +
> +                if (ret == BTF_FIELD_FOUND && idx >= info_cnt)

Hm, haven't you already written into info[info_cnt] by this point? I
see that the caller passes (info_cnt - 1) to leave room for that, but
why such tricks when you can have a temporary struct btf_field_info on
the stack, write into it, and only copy it into info[idx] if ret ==
BTF_FIELD_FOUND and idx < info_cnt?
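Something like this (completely untested):

        struct btf_field_info tmp;
        ...
        ret = btf_find_field_kptr(btf, member_type, off, sz, &tmp);
        if (ret < 0)
                return ret;
        if (ret == BTF_FIELD_IGNORE)
                continue;
        /* check bounds *before* touching info[] */
        if (idx >= info_cnt)
                return -E2BIG;
        info[idx++] = tmp;

Then callers can pass the real ARRAY_SIZE() and the "- 1" trick below
goes away entirely.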
> +                        return -E2BIG;
> +                else if (ret == BTF_FIELD_IGNORE)
> +                        continue;
> +                ++idx;
>         }
> -        return 0;
> +        return idx;
> }
>
> static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
>                                 const char *name, int sz, int align, int field_type,
> -                                struct btf_field_info *info)
> +                                struct btf_field_info *info, int info_cnt)
> {
>         const struct btf_var_secinfo *vsi;
> +        int ret, idx = 0;
>         u32 i, off;
> -        int ret;
>
>         for_each_vsi(i, t, vsi) {
>                 const struct btf_type *var = btf_type_by_id(btf, vsi->type);
> @@ -3245,19 +3288,30 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
>                 switch (field_type) {
>                 case BTF_FIELD_SPIN_LOCK:
>                 case BTF_FIELD_TIMER:
> -                        ret = btf_find_field_struct(btf, var_type, off, sz, info);
> +                        ret = btf_find_field_struct(btf, var_type, off, sz, &info[idx]);
> +                        if (ret < 0)
> +                                return ret;
> +                        break;
> +                case BTF_FIELD_KPTR:
> +                        ret = btf_find_field_kptr(btf, var_type, off, sz, &info[idx]);
>                         if (ret < 0)
>                                 return ret;
>                         break;
>                 default:
>                         return -EFAULT;
>                 }
> +
> +                if (ret == BTF_FIELD_FOUND && idx >= info_cnt)

Same here: this can already have written past the end of the array?

> +                        return -E2BIG;
> +                if (ret == BTF_FIELD_IGNORE)
> +                        continue;
> +                ++idx;
>         }
> -        return 0;
> +        return idx;
> }
>
> static int btf_find_field(const struct btf *btf, const struct btf_type *t,
> -                          int field_type, struct btf_field_info *info)
> +                          int field_type, struct btf_field_info *info, int info_cnt)
> {
>         const char *name;
>         int sz, align;
> @@ -3273,14 +3327,20 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t,
>                 sz = sizeof(struct bpf_timer);
>                 align = __alignof__(struct bpf_timer);
>                 break;
> +        case BTF_FIELD_KPTR:
> +                name = NULL;
> +                sz = sizeof(u64);
> +                align = __alignof__(u64);

__alignof__(u64) can be 4 on a 32-bit arch; is that ok?

> +                break;
>         default:
>                 return -EFAULT;
>         }
>
> +        /* The maximum allowed fields of a certain type will be info_cnt - 1 */
>         if (__btf_type_is_struct(t))
> -                return btf_find_struct_field(btf, t, name, sz, align, field_type, info);
> +                return btf_find_struct_field(btf, t, name, sz, align, field_type, info, info_cnt - 1);

Why the -1? To avoid overwriting past the end of the array?
>         else if (btf_type_is_datasec(t))
> -                return btf_find_datasec_var(btf, t, name, sz, align, field_type, info);
> +                return btf_find_datasec_var(btf, t, name, sz, align, field_type, info, info_cnt - 1);
>         return -EINVAL;
> }
>
> @@ -3290,24 +3350,79 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t,
>  */
> int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
> {
> -        struct btf_field_info info = { .off = -ENOENT };
> +        /* btf_find_field requires array of size max + 1 */

Ok, right, as I suspected above. But see also my suggestion above for
avoiding these weird implicit expectations altogether.

> +        struct btf_field_info info_arr[2];
>         int ret;
>
> -        ret = btf_find_field(btf, t, BTF_FIELD_SPIN_LOCK, &info);
> +        ret = btf_find_field(btf, t, BTF_FIELD_SPIN_LOCK, info_arr, ARRAY_SIZE(info_arr));
>         if (ret < 0)
>                 return ret;
> -        return info.off;
> +        if (!ret)
> +                return -ENOENT;
> +        return info_arr[0].off;
> }
>
> int btf_find_timer(const struct btf *btf, const struct btf_type *t)
> {
> -        struct btf_field_info info = { .off = -ENOENT };
> +        /* btf_find_field requires array of size max + 1 */
> +        struct btf_field_info info_arr[2];
>         int ret;
>
> -        ret = btf_find_field(btf, t, BTF_FIELD_TIMER, &info);
> +        ret = btf_find_field(btf, t, BTF_FIELD_TIMER, info_arr, ARRAY_SIZE(info_arr));
>         if (ret < 0)
>                 return ret;
> -        return info.off;
> +        if (!ret)
> +                return -ENOENT;
> +        return info_arr[0].off;
> +}
> +
> +struct bpf_map_value_off *btf_find_kptr(const struct btf *btf,
> +                                        const struct btf_type *t)
> +{
> +        /* btf_find_field requires array of size max + 1 */
> +        struct btf_field_info info_arr[BPF_MAP_VALUE_OFF_MAX + 1];
> +        struct bpf_map_value_off *tab;
> +        int ret, i, nr_off;
> +
> +        /* Revisit stack usage when bumping BPF_MAP_VALUE_OFF_MAX */
> +        BUILD_BUG_ON(BPF_MAP_VALUE_OFF_MAX != 8);

You can store a u32 type_id instead of the full btf_type pointer in
struct btf_field_info; the type lookup in the loop below is cheap and
won't fail.

> +
> +        ret = btf_find_field(btf, t, BTF_FIELD_KPTR, info_arr, ARRAY_SIZE(info_arr));
> +        if (ret < 0)
> +                return ERR_PTR(ret);
> +        if (!ret)
> +                return NULL;
> +

[...]
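To be concrete about the type_id idea, something like (untested):

        struct btf_field_info {
                u32 type_id;
                u32 off;
        };

with btf_find_field_kptr() recording the resolved ID (skip_modifiers
also handles the non-modifier case and just returns the same ID):

        t = btf_type_skip_modifiers(btf, t->type, &res_id);
        ...
        info->type_id = res_id;

and btf_find_kptr() re-resolving it in its loop:

        t = btf_type_by_id(btf, info_arr[i].type_id);

As a bonus, that roughly halves the stack footprint of info_arr on
64-bit.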
> +
> +bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b)
> +{
> +        struct bpf_map_value_off *tab_a = map_a->kptr_off_tab, *tab_b = map_b->kptr_off_tab;
> +        bool a_has_kptr = map_value_has_kptr(map_a), b_has_kptr = map_value_has_kptr(map_b);
> +        int size;
> +
> +        if (!a_has_kptr && !b_has_kptr)
> +                return true;
> +        if ((a_has_kptr && !b_has_kptr) || (!a_has_kptr && b_has_kptr))
> +                return false;

This is just:

        if (a_has_kptr != b_has_kptr)
                return false;

> +        if (tab_a->nr_off != tab_b->nr_off)
> +                return false;
> +        size = offsetof(struct bpf_map_value_off, off[tab_a->nr_off]);
> +        return !memcmp(tab_a, tab_b, size);
> +}
> +
> /* called from workqueue */
> static void bpf_map_free_deferred(struct work_struct *work)
> {
>         struct bpf_map *map = container_of(work, struct bpf_map, work);
>
>         security_bpf_map_free(map);
> +        bpf_map_free_kptr_off_tab(map);
>         bpf_map_release_memcg(map);
>         /* implementation dependent freeing */
>         map->ops->map_free(map);
> @@ -640,7 +724,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
>         int err;
>
>         if (!map->ops->map_mmap || map_value_has_spin_lock(map) ||
> -            map_value_has_timer(map))
> +            map_value_has_timer(map) || map_value_has_kptr(map))
>                 return -ENOTSUPP;
>
>         if (!(vma->vm_flags & VM_SHARED))
> @@ -820,9 +904,31 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
>                 return -EOPNOTSUPP;
>         }
>
> -        if (map->ops->map_check_btf)
> +        map->kptr_off_tab = btf_find_kptr(btf, value_type);

btf_find_kptr() is confusingly named. It can certainly find more than
one kptr, so at the very least it should be btf_find_kptrs(). Combining
that with Joanne's suggestion, btf_parse_kptrs() would indeed be
better.

> +        if (map_value_has_kptr(map)) {
> +                if (!bpf_capable())
> +                        return -EPERM;
> +                if (map->map_flags & BPF_F_RDONLY_PROG) {
> +                        ret = -EACCES;
> +                        goto free_map_tab;
> +                }
> +                if (map->map_type != BPF_MAP_TYPE_HASH &&
> +                    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
> +                    map->map_type != BPF_MAP_TYPE_ARRAY) {

What about PERCPU_ARRAY, for instance? Is there anything fundamentally
wrong with supporting this for local storage maps as well?

> +                        ret = -EOPNOTSUPP;
> +                        goto free_map_tab;
> +                }
> +        }
> +
> +        if (map->ops->map_check_btf) {
>                 ret = map->ops->map_check_btf(map, btf, key_type, value_type);
> +                if (ret < 0)
> +                        goto free_map_tab;
> +        }
>
> +        return ret;
> +free_map_tab:
> +        bpf_map_free_kptr_off_tab(map);
>         return ret;
> }
>
> @@ -1639,7 +1745,7 @@ static int map_freeze(const union bpf_attr *attr)
>                 return PTR_ERR(map);
>
>         if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS ||
> -            map_value_has_timer(map)) {
> +            map_value_has_timer(map) || map_value_has_kptr(map)) {
>                 fdput(f);
>                 return -ENOTSUPP;
>         }
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 4ce9a528fb63..744b7362e52e 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -3507,6 +3507,94 @@ int check_ptr_off_reg(struct bpf_verifier_env *env,
>         return __check_ptr_off_reg(env, reg, regno, false);
> }
>
> +static int map_kptr_match_type(struct bpf_verifier_env *env,
> +                               struct bpf_map_value_off_desc *off_desc,
> +                               struct bpf_reg_state *reg, u32 regno)
> +{
> +        const char *targ_name = kernel_type_name(off_desc->btf, off_desc->btf_id);
> +        const char *reg_name = "";
> +
> +        if (reg->type != PTR_TO_BTF_ID && reg->type != PTR_TO_BTF_ID_OR_NULL)

base_type(reg->type) != PTR_TO_BTF_ID ?
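(Going back to bpf_map_equal_kptr_off_tab() for a moment: with that !=
check the whole body becomes, roughly:

        if (a_has_kptr != b_has_kptr)
                return false;
        if (!a_has_kptr)
                return true;
        if (tab_a->nr_off != tab_b->nr_off)
                return false;
        return !memcmp(tab_a, tab_b,
                       offsetof(struct bpf_map_value_off, off[tab_a->nr_off]));

which also drops the need for the local size variable.)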
> +                goto bad_type;
> +
> +        if (!btf_is_kernel(reg->btf)) {
> +                verbose(env, "R%d must point to kernel BTF\n", regno);
> +                return -EINVAL;
> +        }
> +        /* We need to verify reg->type and reg->btf, before accessing reg->btf */
> +        reg_name = kernel_type_name(reg->btf, reg->btf_id);
> +
> +        if (__check_ptr_off_reg(env, reg, regno, true))
> +                return -EACCES;
> +
> +        if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
> +                                  off_desc->btf, off_desc->btf_id))
> +                goto bad_type;
> +        return 0;
> +bad_type:
> +        verbose(env, "invalid kptr access, R%d type=%s%s ", regno,
> +                reg_type_str(env, reg->type), reg_name);
> +        verbose(env, "expected=%s%s\n", reg_type_str(env, PTR_TO_BTF_ID), targ_name);

Why two separate verbose() calls? You can easily combine them (and they
should go on a single line anyway, given it's one error message).

> +        return -EINVAL;
> +}
> +

[...]
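I.e., something like the below (assuming reg_type_str() doesn't return
a shared per-env buffer that the second call in the same statement
would clobber; if it does, that would explain the split and deserves a
comment):

        verbose(env, "invalid kptr access, R%d type=%s%s expected=%s%s\n",
                regno, reg_type_str(env, reg->type), reg_name,
                reg_type_str(env, PTR_TO_BTF_ID), targ_name);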