To simplify the design and support the common practice, no
nested bpf_rcu_read_lock() is allowed. During verification,
each paired bpf_rcu_read_lock()/unlock() has a unique
region id, starting from 1. Each rcu ptr register also
remembers the region id when the ptr reg is initialized.
The following is a simple example to illustrate the
rcu lock regions and usage of rcu ptr's.
... <=== rcu lock region 0
bpf_rcu_read_lock() <=== rcu lock region 1
rcu_ptr1 = ... <=== rcu_ptr1 with region 1
... using rcu_ptr1 ...
bpf_rcu_read_unlock()
... <=== rcu lock region -1
bpf_rcu_read_lock() <=== rcu lock region 2
rcu_ptr2 = ... <=== rcu_ptr2 with region 2
... using rcu_ptr2 ...
... using rcu_ptr1 ... <=== wrong, region 1 rcu_ptr in region 2
bpf_rcu_read_unlock()
Outside the rcu lock region, the rcu lock region id is 0 or negative of
previous valid rcu lock region id, so the next valid rcu lock region
id can be easily computed.
Note that rcu protection is not needed for a non-sleepable program. But
it is still supported, to make development across sleepable and
non-sleepable programs easier.
For non-sleepable program, the following insns can be inside the rcu
lock region:
- any non call insns except BPF_ABS/BPF_IND
- non sleepable helpers or kfuncs
Also, bpf_*_storage_get() helper's 5th hidden argument (for memory
allocation flag) should be GFP_ATOMIC.
If a pointer (PTR_TO_BTF_ID) is marked as rcu, then any use of
this pointer and the load which gets this pointer needs to be
protected by bpf_rcu_read_lock(). The following shows a couple
of examples:
struct task_struct {
...
struct task_struct __rcu *real_parent;
struct css_set __rcu *cgroups;
...
};
struct css_set {
...
struct cgroup *dfl_cgrp;
...
};
...
task = bpf_get_current_task_btf();
cgroups = task->cgroups;
dfl_cgroup = cgroups->dfl_cgrp;
... using dfl_cgroup ...
The bpf_rcu_read_lock/unlock() should be added like below to
avoid verification failures.
task = bpf_get_current_task_btf();
bpf_rcu_read_lock();
cgroups = task->cgroups;
dfl_cgroup = cgroups->dfl_cgrp;
bpf_rcu_read_unlock();
... using dfl_cgroup ...
The following is another example for task->real_parent.
task = bpf_get_current_task_btf();
bpf_rcu_read_lock();
real_parent = task->real_parent;
... bpf_task_storage_get(&map, real_parent, 0, 0);
bpf_rcu_read_unlock();
Signed-off-by: Yonghong Song <yhs@xxxxxx>
---
include/linux/bpf.h | 1 +
include/linux/bpf_verifier.h | 7 +++
kernel/bpf/btf.c | 32 ++++++++++++-
kernel/bpf/verifier.c | 92 +++++++++++++++++++++++++++++++-----
4 files changed, 120 insertions(+), 12 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b4bbcafd1c9b..98af0c9ec721 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -761,6 +761,7 @@ struct bpf_prog_ops {
struct btf_struct_access_info {
u32 next_btf_id;
enum bpf_type_flag flag;
+ bool is_rcu;
};
struct bpf_verifier_ops {
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 1a32baa78ce2..5d703637bb12 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -179,6 +179,10 @@ struct bpf_reg_state {
*/
s32 subreg_def;
enum bpf_reg_liveness live;
+ /* 0: not rcu ptr; > 0: rcu ptr, id of the rcu read lock region where
+ * the rcu ptr reg is initialized.
+ */
+ int active_rcu_lock;
/* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */
bool precise;
};
@@ -324,6 +328,8 @@ struct bpf_verifier_state {
u32 insn_idx;
u32 curframe;
u32 active_spin_lock;
+ /* <= 0: not in rcu read lock region; > 0: the rcu lock region id */
+ int active_rcu_lock;
bool speculative;
/* first and last insn idx of this verifier state */
@@ -424,6 +430,7 @@ struct bpf_insn_aux_data {
u32 seen; /* this insn was processed by the verifier at env->pass_cnt */
bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
bool zext_dst; /* this insn zero extends dst reg */
+ bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */
u8 alu_state; /* used in combination with alu_limit */
/* below fields are initialized once */
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index d2ee1669a2f3..c5a9569f2ae0 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5831,6 +5831,7 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
if (btf_type_is_ptr(mtype)) {
const struct btf_type *stype, *t;
enum bpf_type_flag tmp_flag = 0;
+ bool is_rcu = false;
u32 id;
if (msize != size || off != moff) {
@@ -5850,12 +5851,16 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
/* check __percpu tag */
if (strcmp(tag_value, "percpu") == 0)
tmp_flag = MEM_PERCPU;
+ /* check __rcu tag */
+ if (strcmp(tag_value, "rcu") == 0)
+ is_rcu = true;
}
stype = btf_type_skip_modifiers(btf, mtype->type, &id);
if (btf_type_is_struct(stype)) {
info->next_btf_id = id;
info->flag = tmp_flag;
+ info->is_rcu = is_rcu;
return WALK_PTR;
}
}
@@ -6317,7 +6322,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
{
enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
bool rel = false, kptr_get = false, trusted_args = false;
- bool sleepable = false;
+ bool sleepable = false, rcu_lock = false, rcu_unlock = false;
struct bpf_verifier_log *log = &env->log;
u32 i, nargs, ref_id, ref_obj_id = 0;
bool is_kfunc = btf_is_kernel(btf);
@@ -6356,6 +6361,31 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
kptr_get = kfunc_meta->flags & KF_KPTR_GET;
trusted_args = kfunc_meta->flags & KF_TRUSTED_ARGS;
sleepable = kfunc_meta->flags & KF_SLEEPABLE;
+ rcu_lock = kfunc_meta->flags & KF_RCU_LOCK;
+ rcu_unlock = kfunc_meta->flags & KF_RCU_UNLOCK;
+ }
+
+ /* checking rcu read lock/unlock */
+ if (env->cur_state->active_rcu_lock > 0) {
+ if (rcu_lock) {
+ bpf_log(log, "nested rcu read lock (kernel function %s)\n", func_name);
+ return -EINVAL;
+ } else if (rcu_unlock) {
+ /* change active_rcu_lock to its corresponding negative value to
+ * preserve the previous lock region id.
+ */
+ env->cur_state->active_rcu_lock = -env->cur_state->active_rcu_lock;
+ } else if (sleepable) {
+ bpf_log(log, "kernel func %s is sleepable within rcu_read_lock region\n",
+ func_name);
+ return -EINVAL;
+ }
+ } else if (rcu_lock) {
+ /* a new lock region started, increase the region id. */
+ env->cur_state->active_rcu_lock = (-env->cur_state->active_rcu_lock) + 1;
+ } else if (rcu_unlock) {
+ bpf_log(log, "unmatched rcu read unlock (kernel function %s)\n", func_name);
+ return -EINVAL;
}