[PATCH bpf-next v2 5/8] bpf: Add bpf_rcu_read_lock() verifier support

To simplify the design and match common practice, nested
bpf_rcu_read_lock() regions are not allowed. During
verification, each paired bpf_rcu_read_lock()/unlock() region
gets a unique region id, starting from 1. Each rcu ptr register
also remembers the id of the region in which the ptr reg was
initialized. The following simple example illustrates the rcu
lock regions and the usage of rcu ptrs.

     ...                    <=== rcu lock region 0
     bpf_rcu_read_lock()    <=== rcu lock region 1
     rcu_ptr1 = ...         <=== rcu_ptr1 with region 1
     ... using rcu_ptr1 ...
     bpf_rcu_read_unlock()
     ...                    <=== rcu lock region -1
     bpf_rcu_read_lock()    <=== rcu lock region 2
     rcu_ptr2 = ...         <=== rcu_ptr2 with region 2
     ... using rcu_ptr2 ...
     ... using rcu_ptr1 ... <=== wrong, region 1 rcu_ptr in region 2
     bpf_rcu_read_unlock()

Outside any rcu lock region, the region id is 0 (initially) or
the negative of the previous valid region id, so the next valid
region id can be computed easily.
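
For example, following the above scheme, the region id
transitions as:

     id = 0                 <=== initial state
     bpf_rcu_read_lock()    <=== id = (-0) + 1 = 1
     bpf_rcu_read_unlock()  <=== id = -1
     bpf_rcu_read_lock()    <=== id = (-(-1)) + 1 = 2
     bpf_rcu_read_unlock()  <=== id = -2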

Note that rcu protection is not needed for non-sleepable
programs, but it is supported anyway to make development across
sleepable and non-sleepable programs easier. For non-sleepable
programs, the following insns can be inside the rcu lock region:
  - any non-call insns except BPF_ABS/BPF_IND
  - non-sleepable helpers and kfuncs
Also, the bpf_*_storage_get() helpers' 5th hidden argument (the
memory allocation flag) should be GFP_ATOMIC; see the example
below.
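
As an example, in a sleepable program the verifier would reject
the first call below and patch the second one to use GFP_ATOMIC
(bpf_copy_from_user() is a sleepable helper; the map and
variable names are illustrative):
  bpf_rcu_read_lock();
  /* rejected: sleepable helper inside the rcu lock region */
  bpf_copy_from_user(&val, sizeof(val), user_ptr);
  /* allowed: hidden 5th arg is patched to GFP_ATOMIC */
  bpf_task_storage_get(&map, task, 0, BPF_LOCAL_STORAGE_GET_F_CREATE);
  bpf_rcu_read_unlock();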

If a pointer (PTR_TO_BTF_ID) is marked as rcu (i.e., tagged
__rcu), then both the load which produces the pointer and any
subsequent use of it need to be protected by
bpf_rcu_read_lock(). The following shows a couple of examples:
  struct task_struct {
        ...
        struct task_struct __rcu        *real_parent;
        struct css_set __rcu            *cgroups;
        ...
  };
  struct css_set {
        ...
        struct cgroup *dfl_cgrp;
        ...
  };
  ...
  task = bpf_get_current_task_btf();
  cgroups = task->cgroups;
  dfl_cgroup = cgroups->dfl_cgrp;
  ... using dfl_cgroup ...

bpf_rcu_read_lock()/unlock() should be added as below to avoid
verification failures.
  task = bpf_get_current_task_btf();
  bpf_rcu_read_lock();
  cgroups = task->cgroups;
  dfl_cgroup = cgroups->dfl_cgrp;
  bpf_rcu_read_unlock();
  ... using dfl_cgroup ...

The following is another example for task->real_parent.
  task = bpf_get_current_task_btf();
  bpf_rcu_read_lock();
  real_parent = task->real_parent;
  ... bpf_task_storage_get(&map, real_parent, 0, 0);
  bpf_rcu_read_unlock();
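
For reference, the second example might look like the following
minimal sketch as a complete program (the includes, attach
point, map definition and __ksym kfunc declarations are
illustrative, following the selftest conventions, and are not
part of this patch):
  #include <vmlinux.h>
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  /* kfuncs added by this series */
  void bpf_rcu_read_lock(void) __ksym;
  void bpf_rcu_read_unlock(void) __ksym;

  struct {
          __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
          __uint(map_flags, BPF_F_NO_PREALLOC);
          __type(key, int);
          __type(value, long);
  } map SEC(".maps");

  SEC("fentry.s/do_nanosleep")  /* sleepable fentry */
  int BPF_PROG(real_parent_example)
  {
          struct task_struct *task, *real_parent;

          task = bpf_get_current_task_btf();
          bpf_rcu_read_lock();
          /* real_parent is __rcu tagged, so the load and the use
           * below must stay inside the rcu lock region.
           */
          real_parent = task->real_parent;
          bpf_task_storage_get(&map, real_parent, 0, 0);
          bpf_rcu_read_unlock();
          return 0;
  }

  char _license[] SEC("license") = "GPL";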

Signed-off-by: Yonghong Song <yhs@xxxxxx>
---
 include/linux/bpf.h          |  1 +
 include/linux/bpf_verifier.h |  7 +++
 kernel/bpf/btf.c             | 32 ++++++++++++-
 kernel/bpf/verifier.c        | 92 +++++++++++++++++++++++++++++++-----
 4 files changed, 120 insertions(+), 12 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b4bbcafd1c9b..98af0c9ec721 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -761,6 +761,7 @@ struct bpf_prog_ops {
 struct btf_struct_access_info {
 	u32 next_btf_id;
 	enum bpf_type_flag flag;
+	bool is_rcu;
 };
 
 struct bpf_verifier_ops {
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 1a32baa78ce2..5d703637bb12 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -179,6 +179,10 @@ struct bpf_reg_state {
 	 */
 	s32 subreg_def;
 	enum bpf_reg_liveness live;
+	/* 0: not rcu ptr; > 0: rcu ptr, id of the rcu read lock region where
+	 * the rcu ptr reg is initialized.
+	 */
+	int active_rcu_lock;
 	/* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */
 	bool precise;
 };
@@ -324,6 +328,8 @@ struct bpf_verifier_state {
 	u32 insn_idx;
 	u32 curframe;
 	u32 active_spin_lock;
+	/* <= 0: not in rcu read lock region; > 0: the rcu lock region id */
+	int active_rcu_lock;
 	bool speculative;
 
 	/* first and last insn idx of this verifier state */
@@ -424,6 +430,7 @@ struct bpf_insn_aux_data {
 	u32 seen; /* this insn was processed by the verifier at env->pass_cnt */
 	bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
 	bool zext_dst; /* this insn zero extends dst reg */
+	bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */
 	u8 alu_state; /* used in combination with alu_limit */
 
 	/* below fields are initialized once */
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index d2ee1669a2f3..c5a9569f2ae0 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5831,6 +5831,7 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
 		if (btf_type_is_ptr(mtype)) {
 			const struct btf_type *stype, *t;
 			enum bpf_type_flag tmp_flag = 0;
+			bool is_rcu = false;
 			u32 id;
 
 			if (msize != size || off != moff) {
@@ -5850,12 +5851,16 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
 				/* check __percpu tag */
 				if (strcmp(tag_value, "percpu") == 0)
 					tmp_flag = MEM_PERCPU;
+				/* check __rcu tag */
+				if (strcmp(tag_value, "rcu") == 0)
+					is_rcu = true;
 			}
 
 			stype = btf_type_skip_modifiers(btf, mtype->type, &id);
 			if (btf_type_is_struct(stype)) {
 				info->next_btf_id = id;
 				info->flag = tmp_flag;
+				info->is_rcu = is_rcu;
 				return WALK_PTR;
 			}
 		}
@@ -6317,7 +6322,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 {
 	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
 	bool rel = false, kptr_get = false, trusted_args = false;
-	bool sleepable = false;
+	bool sleepable = false, rcu_lock = false, rcu_unlock = false;
 	struct bpf_verifier_log *log = &env->log;
 	u32 i, nargs, ref_id, ref_obj_id = 0;
 	bool is_kfunc = btf_is_kernel(btf);
@@ -6356,6 +6361,31 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 		kptr_get = kfunc_meta->flags & KF_KPTR_GET;
 		trusted_args = kfunc_meta->flags & KF_TRUSTED_ARGS;
 		sleepable = kfunc_meta->flags & KF_SLEEPABLE;
+		rcu_lock = kfunc_meta->flags & KF_RCU_LOCK;
+		rcu_unlock = kfunc_meta->flags & KF_RCU_UNLOCK;
+	}
+
+	/* checking rcu read lock/unlock */
+	if (env->cur_state->active_rcu_lock > 0) {
+		if (rcu_lock) {
+			bpf_log(log, "nested rcu read lock (kernel function %s)\n", func_name);
+			return -EINVAL;
+		} else if (rcu_unlock) {
+			/* change active_rcu_lock to its corresponding negative value to
+			 * preserve the previous lock region id.
+			 */
+			env->cur_state->active_rcu_lock = -env->cur_state->active_rcu_lock;
+		} else if (sleepable) {
+			bpf_log(log, "kernel func %s is sleepable within rcu_read_lock region\n",
+				func_name);
+			return -EINVAL;
+		}
+	} else if (rcu_lock) {
+		/* a new lock region started, increase the region id. */
+		env->cur_state->active_rcu_lock = (-env->cur_state->active_rcu_lock) + 1;
+	} else if (rcu_unlock) {
+		bpf_log(log, "unmatched rcu read unlock (kernel function %s)\n", func_name);
+		return -EINVAL;
 	}
 
 	/* check that BTF function arguments match actual types that the
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4d50f9568245..85151f2a655a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -23,6 +23,7 @@
 #include <linux/error-injection.h>
 #include <linux/bpf_lsm.h>
 #include <linux/btf_ids.h>
+#include <linux/trace_events.h>
 #include <linux/poison.h>
 
 #include "disasm.h"
@@ -513,6 +514,14 @@ static bool is_callback_calling_function(enum bpf_func_id func_id)
 	       func_id == BPF_FUNC_user_ringbuf_drain;
 }
 
+static bool is_storage_get_function(enum bpf_func_id func_id)
+{
+	return func_id == BPF_FUNC_sk_storage_get ||
+	       func_id == BPF_FUNC_inode_storage_get ||
+	       func_id == BPF_FUNC_task_storage_get ||
+	       func_id == BPF_FUNC_cgrp_storage_get;
+}
+
 static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id,
 					const struct bpf_map *map)
 {
@@ -1203,6 +1212,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	dst_state->speculative = src->speculative;
 	dst_state->curframe = src->curframe;
 	dst_state->active_spin_lock = src->active_spin_lock;
+	dst_state->active_rcu_lock = src->active_rcu_lock;
 	dst_state->branches = src->branches;
 	dst_state->parent = src->parent;
 	dst_state->first_insn_idx = src->first_insn_idx;
@@ -1687,6 +1697,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env,
 	reg->var_off = tnum_unknown;
 	reg->frameno = 0;
 	reg->precise = !env->bpf_capable;
+	reg->active_rcu_lock = 0;
 	__mark_reg_unbounded(reg);
 }
 
@@ -1727,7 +1738,7 @@ static void mark_btf_ld_reg(struct bpf_verifier_env *env,
 			    struct bpf_reg_state *regs, u32 regno,
 			    enum bpf_reg_type reg_type,
 			    struct btf *btf, u32 btf_id,
-			    enum bpf_type_flag flag)
+			    enum bpf_type_flag flag, bool set_rcu_lock)
 {
 	if (reg_type == SCALAR_VALUE) {
 		mark_reg_unknown(env, regs, regno);
@@ -1737,6 +1748,9 @@ static void mark_btf_ld_reg(struct bpf_verifier_env *env,
 	regs[regno].type = PTR_TO_BTF_ID | flag;
 	regs[regno].btf = btf;
 	regs[regno].btf_id = btf_id;
+	/* the reg rcu lock region id equals the current rcu lock region id */
+	if (set_rcu_lock)
+		regs[regno].active_rcu_lock = env->cur_state->active_rcu_lock;
 }
 
 #define DEF_NOT_SUBREG	(0)
@@ -3940,7 +3954,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
 		 * value from map as PTR_TO_BTF_ID, with the correct type.
 		 */
 		mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf,
-				kptr_field->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED);
+				kptr_field->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED, false);
 		/* For mark_ptr_or_null_reg */
 		val_reg->id = ++env->id_gen;
 	} else if (class == BPF_STX) {
@@ -4681,6 +4695,18 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 		return -EACCES;
 	}
 
+	/* If the reg's rcu region id does not equal the current rcu region id,
+	 * the rcu access is not properly protected: either it is outside a
+	 * valid rcu region, or it is in a different rcu region.
+	 */
+	if (env->prog->aux->sleepable && reg->active_rcu_lock &&
+	    reg->active_rcu_lock != env->cur_state->active_rcu_lock) {
+		verbose(env,
+			"R%d is ptr_%s accessing rcu-protected memory with off=%d, not rcu protected\n",
+			regno, tname, off);
+		return -EACCES;
+	}
+
 	if (env->ops->btf_struct_access) {
 		ret = env->ops->btf_struct_access(&env->log, reg->btf, t,
 						  off, size, atype, &info);
@@ -4697,6 +4723,16 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 	if (ret < 0)
 		return ret;
 
+	/* The value is a rcu pointer. For a sleepable program, the load needs to be
+	 * in a rcu lock region, similar to rcu_dereference().
+	 */
+	if (info.is_rcu && env->prog->aux->sleepable && env->cur_state->active_rcu_lock <= 0) {
+		verbose(env,
+			"R%d is rcu dereference ptr_%s with off=%d, not in rcu_read_lock region\n",
+			regno, tname, off);
+		return -EACCES;
+	}
+
 	/* If this is an untrusted pointer, all pointers formed by walking it
 	 * also inherit the untrusted flag.
 	 */
@@ -4705,7 +4741,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 
 	if (atype == BPF_READ && value_regno >= 0)
 		mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, info.next_btf_id,
-				info.flag);
+				info.flag, info.is_rcu && env->prog->aux->sleepable);
 
 	return 0;
 }
@@ -4761,7 +4797,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
 
 	if (value_regno >= 0)
 		mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, info.next_btf_id,
-				info.flag);
+				info.flag, info.is_rcu && env->prog->aux->sleepable);
 
 	return 0;
 }
@@ -5874,6 +5910,17 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
 	if (arg_type & PTR_MAYBE_NULL)
 		type &= ~PTR_MAYBE_NULL;
 
+	/* If an rcu pointer is passed as a helper argument, the helper must be
+	 * called in the same rcu lock region where the rcu ptr reg was initialized.
+	 */
+	if (env->prog->aux->sleepable && reg->active_rcu_lock &&
+	    reg->active_rcu_lock != env->cur_state->active_rcu_lock) {
+		verbose(env,
+			"R%d with arg type %s needs rcu protection\n",
+			regno, reg_type_str(env, reg->type));
+		return -EACCES;
+	}
+
 	for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
 		expected = compatible->types[i];
 		if (expected == NOT_INIT)
@@ -7418,6 +7465,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		return err;
 	}
 
+	if (env->cur_state->active_rcu_lock > 0) {
+		if (bpf_lsm_sleepable_func_proto(func_id) ||
+		    bpf_tracing_sleepable_func_proto(func_id)) {
+			verbose(env, "sleepable helper %s#%d in rcu_read_lock region\n",
+				func_id_name(func_id), func_id);
+			return -EINVAL;
+		}
+
+		if (env->prog->aux->sleepable && is_storage_get_function(func_id))
+			env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
+	}
+
 	meta.func_id = func_id;
 	/* check args */
 	for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
@@ -10605,6 +10664,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		return -EINVAL;
 	}
 
+	if (env->prog->aux->sleepable && env->cur_state->active_rcu_lock > 0) {
+		verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_rcu_read_lock-ed region\n");
+		return -EINVAL;
+	}
+
 	if (regs[ctx_reg].type != PTR_TO_CTX) {
 		verbose(env,
 			"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
@@ -11869,6 +11933,9 @@ static bool states_equal(struct bpf_verifier_env *env,
 	if (old->active_spin_lock != cur->active_spin_lock)
 		return false;
 
+	if (old->active_rcu_lock != cur->active_rcu_lock)
+		return false;
+
 	/* for states to be equal callsites have to be the same
 	 * and all frame states need to be equivalent
 	 */
@@ -12553,6 +12620,11 @@ static int do_check(struct bpf_verifier_env *env)
 					return -EINVAL;
 				}
 
+				if (env->cur_state->active_rcu_lock > 0) {
+					verbose(env, "bpf_rcu_read_unlock is missing\n");
+					return -EINVAL;
+				}
+
 				/* We must do check_reference_leak here before
 				 * prepare_func_exit to handle the case when
 				 * state->curframe > 0, it may be a callback
@@ -14289,14 +14361,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			goto patch_call_imm;
 		}
 
-		if (insn->imm == BPF_FUNC_task_storage_get ||
-		    insn->imm == BPF_FUNC_sk_storage_get ||
-		    insn->imm == BPF_FUNC_inode_storage_get ||
-		    insn->imm == BPF_FUNC_cgrp_storage_get) {
-			if (env->prog->aux->sleepable)
-				insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
-			else
+		if (is_storage_get_function(insn->imm)) {
+			if (!env->prog->aux->sleepable ||
+			    env->insn_aux_data[i + delta].storage_get_func_atomic)
 				insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC);
+			else
+				insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
 			insn_buf[1] = *insn;
 			cnt = 2;
 
-- 
2.30.2




