Re: [PATCH bpf-next v9 3/4] bpf: Add kfunc bpf_rcu_read_lock/unlock()

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Nov 22, 2022 at 8:54 PM Yonghong Song <yhs@xxxxxx> wrote:
>
> Add two kfunc's bpf_rcu_read_lock() and bpf_rcu_read_unlock(). These two kfunc's
> can be used for all program types. The following is an example about how
> rcu pointer are used w.r.t. bpf_rcu_read_lock()/bpf_rcu_read_unlock().
>
>   struct task_struct {
>     ...
>     struct task_struct              *last_wakee;
>     struct task_struct __rcu        *real_parent;
>     ...
>   };
>
> Let us say prog does 'task = bpf_get_current_task_btf()' to get a
> 'task' pointer. The basic rules are:
>   - 'real_parent = task->real_parent' should be inside bpf_rcu_read_lock
>     region.  this is to simulate rcu_dereference() operation. The
>     'real_parent' is marked as MEM_RCU only if (1). task->real_parent is
>     inside bpf_rcu_read_lock region, and (2). task is a trusted ptr. So
>     MEM_RCU marked ptr can be 'trusted' inside the bpf_rcu_read_lock region.
>   - 'last_wakee = real_parent->last_wakee' should be inside bpf_rcu_read_lock
>     region since it tries to access rcu protected memory.
>   - the ptr 'last_wakee' will be marked as PTR_UNTRUSTED since in general
>     it is not clear whether the object pointed by 'last_wakee' is valid or
>     not even inside bpf_rcu_read_lock region.
>
> To prevent rcu pointer leaks outside the rcu read lock region.
> The verifier will clear all rcu pointer register state to unknown, i.e.,
> scalar_value, at bpf_rcu_read_unlock() kfunc call site,
> so later dereference becomes impossible.
>
> The current implementation does not support nested rcu read lock
> region in the prog.
>
> Signed-off-by: Yonghong Song <yhs@xxxxxx>
> ---
>  include/linux/bpf.h          |   3 +
>  include/linux/bpf_verifier.h |   4 +-
>  kernel/bpf/btf.c             |   3 +
>  kernel/bpf/helpers.c         |  12 +++
>  kernel/bpf/verifier.c        | 155 ++++++++++++++++++++++++++++-------
>  5 files changed, 147 insertions(+), 30 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 43fd7eeeeabb..c6aa6912ea16 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -572,6 +572,9 @@ enum bpf_type_flag {
>          */
>         PTR_TRUSTED             = BIT(12 + BPF_BASE_TYPE_BITS),
>
> +       /* MEM is tagged with rcu and memory access needs rcu_read_lock protection. */
> +       MEM_RCU                 = BIT(13 + BPF_BASE_TYPE_BITS),
> +
>         __BPF_TYPE_FLAG_MAX,
>         __BPF_TYPE_LAST_FLAG    = __BPF_TYPE_FLAG_MAX - 1,
>  };
> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> index 545152ac136c..1f3ce54e50ed 100644
> --- a/include/linux/bpf_verifier.h
> +++ b/include/linux/bpf_verifier.h
> @@ -344,6 +344,7 @@ struct bpf_verifier_state {
>                 u32 id;
>         } active_lock;
>         bool speculative;
> +       bool active_rcu_lock;
>
>         /* first and last insn idx of this verifier state */
>         u32 first_insn_idx;
> @@ -445,6 +446,7 @@ struct bpf_insn_aux_data {
>         u32 seen; /* this insn was processed by the verifier at env->pass_cnt */
>         bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
>         bool zext_dst; /* this insn zero extends dst reg */
> +       bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */
>         u8 alu_state; /* used in combination with alu_limit */
>
>         /* below fields are initialized once */
> @@ -680,7 +682,7 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog)
>         }
>  }
>
> -#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | PTR_TRUSTED)
> +#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | MEM_RCU | PTR_TRUSTED)
>
>  static inline bool bpf_type_has_unsafe_modifiers(u32 type)
>  {
> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> index 1a59cc7ad730..68df0df27302 100644
> --- a/kernel/bpf/btf.c
> +++ b/kernel/bpf/btf.c
> @@ -6237,6 +6237,9 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
>                                 /* check __percpu tag */
>                                 if (strcmp(tag_value, "percpu") == 0)
>                                         tmp_flag = MEM_PERCPU;
> +                               /* check __rcu tag */
> +                               if (strcmp(tag_value, "rcu") == 0)
> +                                       tmp_flag = MEM_RCU;
>                         }
>
>                         stype = btf_type_skip_modifiers(btf, mtype->type, &id);
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index ae565b495f3d..eda9824694bf 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -1971,6 +1971,16 @@ void *bpf_rdonly_cast(void *obj__ign, u32 btf_id__k)
>         return obj__ign;
>  }
>
> +void bpf_rcu_read_lock(void)
> +{
> +       rcu_read_lock();
> +}
> +
> +void bpf_rcu_read_unlock(void)
> +{
> +       rcu_read_unlock();
> +}
> +

I think the check from selftest:

/* rcu_tag_btf_id < 0 implies rcu tag support not available in vmlinux btf */
rcu_tag_btf_id = btf__find_by_name_kind(vmlinux_btf, "rcu", BTF_KIND_TYPE_TAG);

should be done by the kernel as well.
And if the kernel is not compiled with clang the verifier
should probably disallow bpf_rcu_read_lock/unlock kfuncs.

Otherwise the same bpf prog will work differently
depending on whether the kernel was compiled with gcc or clang.

>  __diag_pop();
>
>  BTF_SET8_START(generic_btf_ids)
> @@ -2012,6 +2022,8 @@ BTF_ID(func, bpf_cgroup_release)
>  BTF_SET8_START(common_btf_ids)
>  BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx)
>  BTF_ID_FLAGS(func, bpf_rdonly_cast)
> +BTF_ID_FLAGS(func, bpf_rcu_read_lock)
> +BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
>  BTF_SET8_END(common_btf_ids)
>
>  static const struct btf_kfunc_id_set common_kfunc_set = {
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 068cc885903c..f76c341fea82 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -527,6 +527,14 @@ static bool is_callback_calling_function(enum bpf_func_id func_id)
>                func_id == BPF_FUNC_user_ringbuf_drain;
>  }
>
> +static bool is_storage_get_function(enum bpf_func_id func_id)
> +{
> +       return func_id == BPF_FUNC_sk_storage_get ||
> +              func_id == BPF_FUNC_inode_storage_get ||
> +              func_id == BPF_FUNC_task_storage_get ||
> +              func_id == BPF_FUNC_cgrp_storage_get;
> +}
> +
>  static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id,
>                                         const struct bpf_map *map)
>  {
> @@ -589,11 +597,12 @@ static const char *reg_type_str(struct bpf_verifier_env *env,
>                         strncpy(postfix, "_or_null", 16);
>         }
>
> -       snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s",
> +       snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s",
>                  type & MEM_RDONLY ? "rdonly_" : "",
>                  type & MEM_RINGBUF ? "ringbuf_" : "",
>                  type & MEM_USER ? "user_" : "",
>                  type & MEM_PERCPU ? "percpu_" : "",
> +                type & MEM_RCU ? "rcu_" : "",
>                  type & PTR_UNTRUSTED ? "untrusted_" : "",
>                  type & PTR_TRUSTED ? "trusted_" : ""
>         );
> @@ -1220,6 +1229,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
>                 dst_state->frame[i] = NULL;
>         }
>         dst_state->speculative = src->speculative;
> +       dst_state->active_rcu_lock = src->active_rcu_lock;
>         dst_state->curframe = src->curframe;
>         dst_state->active_lock.ptr = src->active_lock.ptr;
>         dst_state->active_lock.id = src->active_lock.id;
> @@ -4258,6 +4268,25 @@ static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno)
>         return reg->type == PTR_TO_FLOW_KEYS;
>  }
>
> +static bool is_trusted_reg(const struct bpf_reg_state *reg)
> +{
> +       /* A referenced register is always trusted. */
> +       if (reg->ref_obj_id)
> +               return true;
> +
> +       /* If a register is not referenced, it is trusted if it has either the
> +        * MEM_ALLOC or PTR_TRUSTED type modifiers, and no others. Some of the

The comment needs to be adjusted.

> +        * other type modifiers may be safe, but we elect to take an opt-in
> +        * approach here as some (e.g. PTR_UNTRUSTED and PTR_MAYBE_NULL) are
> +        * not.
> +        *
> +        * Eventually, we should make PTR_TRUSTED the single source of truth
> +        * for whether a register is trusted.
> +        */
> +       return type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS &&
> +              !bpf_type_has_unsafe_modifiers(reg->type);
> +}
> +
>  static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
>                                    const struct bpf_reg_state *reg,
>                                    int off, int size, bool strict)
> @@ -4737,9 +4766,29 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
>         if (type_flag(reg->type) & PTR_UNTRUSTED)
>                 flag |= PTR_UNTRUSTED;
>
> -       /* Any pointer obtained from walking a trusted pointer is no longer trusted. */
> +       /* By default any pointer obtained from walking a trusted pointer is
> +        * no longer trusted except the rcu case below.
> +        */
>         flag &= ~PTR_TRUSTED;
>
> +       if (flag & MEM_RCU) {
> +               /* Mark value register as MEM_RCU only if it is protected by
> +                * bpf_rcu_read_lock() and the ptr reg is trusted. MEM_RCU
> +                * itself can already indicate trustedness inside the rcu
> +                * read lock region. But Mark it as PTR_TRUSTED as well
> +                * similar to MEM_ALLOC.

'similar to MEM_ALLOC' part is not true yet.
Let's not get ahead of ourselves :)

> +                */
> +               if (!env->cur_state->active_rcu_lock || !is_trusted_reg(reg))
> +                       flag &= ~MEM_RCU;
> +               else
> +                       flag |= PTR_TRUSTED;
> +       } else if (reg->type & MEM_RCU) {
> +               /* ptr (reg) is marked as MEM_RCU, but value reg is not marked
> +                * as MEM_RCU. Mark the value reg as PTR_UNTRUSTED conservatively.
> +                */
> +               flag |= PTR_UNTRUSTED;

The part about 'value reg' doesn't look correct.
This part of the code has no idea about 'value reg' yet.
We just checked 'flag & MEM_RCU', so it's the flag that doesn't have
MEM_RCU set which means that the field of the structure we're
dereferencing doesn't have __rcu tag.
I think it's better to adjust this comment.

> +       }
> +
>         if (atype == BPF_READ && value_regno >= 0)
>                 mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag);
>
> @@ -5897,6 +5946,7 @@ static const struct bpf_reg_types btf_ptr_types = {
>         .types = {
>                 PTR_TO_BTF_ID,
>                 PTR_TO_BTF_ID | PTR_TRUSTED,
> +               PTR_TO_BTF_ID | MEM_RCU | PTR_TRUSTED,
>         },
>  };
>  static const struct bpf_reg_types percpu_btf_ptr_types = {
> @@ -6075,6 +6125,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
>         case PTR_TO_BTF_ID:
>         case PTR_TO_BTF_ID | MEM_ALLOC:
>         case PTR_TO_BTF_ID | PTR_TRUSTED:
> +       case PTR_TO_BTF_ID | MEM_RCU | PTR_TRUSTED:
>         case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED:
>                 /* When referenced PTR_TO_BTF_ID is passed to release function,
>                  * it's fixed offset must be 0. In the other cases, fixed offset
> @@ -7539,6 +7590,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
>                 return err;
>         }
>
> +       if (env->cur_state->active_rcu_lock) {
> +               if (fn->might_sleep) {
> +                       verbose(env, "sleepable helper %s#%din rcu_read_lock region\n",
> +                               func_id_name(func_id), func_id);
> +                       return -EINVAL;
> +               }
> +
> +               if (env->prog->aux->sleepable && is_storage_get_function(func_id))
> +                       env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
> +       }
> +
>         meta.func_id = func_id;
>         /* check args */
>         for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
> @@ -7966,25 +8028,6 @@ static bool is_kfunc_arg_kptr_get(struct bpf_kfunc_call_arg_meta *meta, int arg)
>         return arg == 0 && (meta->kfunc_flags & KF_KPTR_GET);
>  }
>
> -static bool is_trusted_reg(const struct bpf_reg_state *reg)
> -{
> -       /* A referenced register is always trusted. */
> -       if (reg->ref_obj_id)
> -               return true;
> -
> -       /* If a register is not referenced, it is trusted if it has either the
> -        * MEM_ALLOC or PTR_TRUSTED type modifiers, and no others. Some of the
> -        * other type modifiers may be safe, but we elect to take an opt-in
> -        * approach here as some (e.g. PTR_UNTRUSTED and PTR_MAYBE_NULL) are
> -        * not.
> -        *
> -        * Eventually, we should make PTR_TRUSTED the single source of truth
> -        * for whether a register is trusted.
> -        */
> -       return type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS &&
> -              !bpf_type_has_unsafe_modifiers(reg->type);
> -}
> -
>  static bool __kfunc_param_match_suffix(const struct btf *btf,
>                                        const struct btf_param *arg,
>                                        const char *suffix)
> @@ -8163,6 +8206,8 @@ enum special_kfunc_type {
>         KF_bpf_list_pop_back,
>         KF_bpf_cast_to_kern_ctx,
>         KF_bpf_rdonly_cast,
> +       KF_bpf_rcu_read_lock,
> +       KF_bpf_rcu_read_unlock,
>  };
>
>  BTF_SET_START(special_kfunc_set)
> @@ -8185,6 +8230,18 @@ BTF_ID(func, bpf_list_pop_front)
>  BTF_ID(func, bpf_list_pop_back)
>  BTF_ID(func, bpf_cast_to_kern_ctx)
>  BTF_ID(func, bpf_rdonly_cast)
> +BTF_ID(func, bpf_rcu_read_lock)
> +BTF_ID(func, bpf_rcu_read_unlock)
> +
> +static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)
> +{
> +       return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_lock];
> +}
> +
> +static bool is_kfunc_bpf_rcu_read_unlock(struct bpf_kfunc_call_arg_meta *meta)
> +{
> +       return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_unlock];
> +}
>
>  static enum kfunc_ptr_arg_type
>  get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
> @@ -8817,6 +8874,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>         const struct btf_type *t, *func, *func_proto, *ptr_type;
>         struct bpf_reg_state *regs = cur_regs(env);
>         const char *func_name, *ptr_type_name;
> +       bool sleepable, rcu_lock, rcu_unlock;
>         struct bpf_kfunc_call_arg_meta meta;
>         u32 i, nargs, func_id, ptr_type_id;
>         int err, insn_idx = *insn_idx_p;
> @@ -8858,11 +8916,38 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>                 return -EACCES;
>         }
>
> -       if (is_kfunc_sleepable(&meta) && !env->prog->aux->sleepable) {
> +       sleepable = is_kfunc_sleepable(&meta);
> +       if (sleepable && !env->prog->aux->sleepable) {
>                 verbose(env, "program must be sleepable to call sleepable kfunc %s\n", func_name);
>                 return -EACCES;
>         }
>
> +       rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
> +       rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
> +       if (env->cur_state->active_rcu_lock) {
> +               struct bpf_func_state *state;
> +               struct bpf_reg_state *reg;
> +
> +               if (rcu_lock) {
> +                       verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
> +                       return -EINVAL;
> +               } else if (rcu_unlock) {
> +                       bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
> +                               if (reg->type & MEM_RCU)
> +                                       __mark_reg_unknown(env, reg);
> +                       }));

That feels too drastic.
rcu_unlock will mark all pointers as scalar,
but the prog can still do bpf_rdonly_cast and read them.
Why force the prog to jump through such hoops?
Are we trying to prevent some kind of programming mistake?

Maybe clear MEM_RCU flag here and add PTR_UNTRUSTED instead?

> +                       env->cur_state->active_rcu_lock = false;
> +               } else if (sleepable) {
> +                       verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name);
> +                       return -EACCES;
> +               }
> +       } else if (rcu_lock) {
> +               env->cur_state->active_rcu_lock = true;
> +       } else if (rcu_unlock) {
> +               verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name);
> +               return -EINVAL;
> +       }
> +
>         /* Check the arguments */
>         err = check_kfunc_args(env, &meta);
>         if (err < 0)
> @@ -11754,6 +11839,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
>                 return -EINVAL;
>         }
>
> +       if (env->cur_state->active_rcu_lock) {
> +               verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_rcu_read_lock-ed region\n");
> +               return -EINVAL;
> +       }
> +
>         if (regs[ctx_reg].type != PTR_TO_CTX) {
>                 verbose(env,
>                         "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
> @@ -13019,6 +13109,9 @@ static bool states_equal(struct bpf_verifier_env *env,
>             old->active_lock.id != cur->active_lock.id)
>                 return false;
>
> +       if (old->active_rcu_lock != cur->active_rcu_lock)
> +               return false;
> +
>         /* for states to be equal callsites have to be the same
>          * and all frame states need to be equivalent
>          */
> @@ -13706,6 +13799,11 @@ static int do_check(struct bpf_verifier_env *env)
>                                         return -EINVAL;
>                                 }
>
> +                               if (env->cur_state->active_rcu_lock) {
> +                                       verbose(env, "bpf_rcu_read_unlock is missing\n");
> +                                       return -EINVAL;
> +                               }
> +
>                                 /* We must do check_reference_leak here before
>                                  * prepare_func_exit to handle the case when
>                                  * state->curframe > 0, it may be a callback
> @@ -14802,6 +14900,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
>                 case PTR_TO_BTF_ID:
>                 case PTR_TO_BTF_ID | PTR_UNTRUSTED:
>                 case PTR_TO_BTF_ID | PTR_TRUSTED:
> +               case PTR_TO_BTF_ID | MEM_RCU | PTR_TRUSTED:
>                 /* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime, unlike
>                  * PTR_TO_BTF_ID, and an active ref_obj_id, but the same cannot
>                  * be said once it is marked PTR_UNTRUSTED, hence we must handle

wait a sec.
Why are we converting PTR_TRUSTED, MEM_RCU, MEM_ALLOC pointers into
BPF_PROBE_MEM ?
The mistake slipped in earlier, but let's fix it first.

BPF_REG_TRUSTED_MODIFIERS should stay as normal LDX.

> @@ -15494,14 +15593,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
>                         goto patch_call_imm;
>                 }
>
> -               if (insn->imm == BPF_FUNC_task_storage_get ||
> -                   insn->imm == BPF_FUNC_sk_storage_get ||
> -                   insn->imm == BPF_FUNC_inode_storage_get ||
> -                   insn->imm == BPF_FUNC_cgrp_storage_get) {
> -                       if (env->prog->aux->sleepable)
> -                               insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
> -                       else
> +               if (is_storage_get_function(insn->imm)) {
> +                       if (!env->prog->aux->sleepable ||
> +                           env->insn_aux_data[i + delta].storage_get_func_atomic)
>                                 insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC);
> +                       else
> +                               insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
>                         insn_buf[1] = *insn;
>                         cnt = 2;
>
> --
> 2.30.2
>



[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux