[PATCH rfc bpf-next 6/8] bpf: add poke dependency tracking for prog array maps

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This work adds program tracking to prog array maps. This is needed so
that upon prog array updates/deletions we can fix up all programs which
make use of this tail call map. We add ops->map_poke_{un,}track() helpers
to maps to maintain the list of programs and ops->map_poke_run() for
triggering the actual update. bpf_array_aux is extended to contain the
list head of tracked programs and a poke_mutex which serializes program
patching during updates/deletions. bpf_free_used_maps() will untrack the
program shortly before dropping the reference to the map.

The prog_array_map_poke_run() is triggered during updates/deletions and
walks the maintained prog list. It checks in each program's poke_tab whether
the map and key match and, if so, runs the actual bpf_arch_text_poke() to
patch in the nop or the new jmp location. Depending on the type of update,
we use one of BPF_MOD_{NOP_TO_JUMP,JUMP_TO_NOP,JUMP_TO_JUMP}.

Signed-off-by: Daniel Borkmann <daniel@xxxxxxxxxxxxx>
---
 include/linux/bpf.h   |  36 +++++++++++++
 kernel/bpf/arraymap.c | 120 +++++++++++++++++++++++++++++++++++++++++-
 kernel/bpf/core.c     |   9 +++-
 3 files changed, 162 insertions(+), 3 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0ff06a0d0058..62a369fb8d98 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -21,6 +21,7 @@ struct bpf_verifier_env;
 struct bpf_verifier_log;
 struct perf_event;
 struct bpf_prog;
+struct bpf_prog_aux;
 struct bpf_map;
 struct sock;
 struct seq_file;
@@ -63,6 +64,12 @@ struct bpf_map_ops {
 			     const struct btf_type *key_type,
 			     const struct btf_type *value_type);
 
+	/* Prog poke tracking helpers. */
+	int (*map_poke_track)(struct bpf_map *map, struct bpf_prog_aux *aux);
+	void (*map_poke_untrack)(struct bpf_map *map, struct bpf_prog_aux *aux);
+	void (*map_poke_run)(struct bpf_map *map, u32 key, struct bpf_prog *old,
+			     struct bpf_prog *new);
+
 	/* Direct value access helpers. */
 	int (*map_direct_value_addr)(const struct bpf_map *map,
 				     u64 *imm, u32 off);
@@ -584,6 +591,9 @@ struct bpf_array_aux {
 	 */
 	enum bpf_prog_type type;
 	bool jited;
+	/* Programs with direct jumps into programs part of this array. */
+	struct list_head poke_progs;
+	struct mutex poke_mutex;
 };
 
 struct bpf_array {
@@ -1325,4 +1335,30 @@ enum bpf_text_poke_type {
 int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 		       void *addr1, void *addr2);
 
+static inline void bpf_map_poke_lock(struct bpf_map *map)
+	__acquires(&container_of(map, struct bpf_array, map)->aux->poke_mutex)
+{ /* Lock the array's poke_mutex when an aux is attached to it. */
+#ifdef CONFIG_BPF_SYSCALL
+	struct bpf_array_aux *aux;
+
+	aux = container_of(map, struct bpf_array, map)->aux;
+	if (aux) /* skip locking when no aux is attached */
+		mutex_lock(&aux->poke_mutex);
+#endif
+	__acquire(&aux->poke_mutex); /* NOTE(review): aux is #ifdef-only */
+}
+
+static inline void bpf_map_poke_unlock(struct bpf_map *map)
+	__releases(&container_of(map, struct bpf_array, map)->aux->poke_mutex)
+{ /* Unlock the array's poke_mutex when an aux is attached to it. */
+#ifdef CONFIG_BPF_SYSCALL
+	struct bpf_array_aux *aux;
+
+	aux = container_of(map, struct bpf_array, map)->aux;
+	if (aux) /* mirrors the check done in bpf_map_poke_lock() */
+		mutex_unlock(&aux->poke_mutex);
+#endif
+	__release(&aux->poke_mutex); /* NOTE(review): aux is #ifdef-only */
+}
+
 #endif /* _LINUX_BPF_H */
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 88c1363b2925..b9ef993278c6 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -540,10 +540,14 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
 	if (IS_ERR(new_ptr))
 		return PTR_ERR(new_ptr);
 
+	bpf_map_poke_lock(map);
 	old_ptr = xchg(array->ptrs + index, new_ptr);
+	if (map->ops->map_poke_run)
+		map->ops->map_poke_run(map, index, old_ptr, new_ptr);
+	bpf_map_poke_unlock(map);
+
 	if (old_ptr)
 		map->ops->map_fd_put_ptr(old_ptr);
-
 	return 0;
 }
 
@@ -556,7 +560,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
 	if (index >= array->map.max_entries)
 		return -E2BIG;
 
+	bpf_map_poke_lock(map);
 	old_ptr = xchg(array->ptrs + index, NULL);
+	if (map->ops->map_poke_run)
+		map->ops->map_poke_run(map, index, old_ptr, NULL);
+	bpf_map_poke_unlock(map);
+
 	if (old_ptr) {
 		map->ops->map_fd_put_ptr(old_ptr);
 		return 0;
@@ -625,6 +634,104 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
 	rcu_read_unlock();
 }
 
+struct prog_poke_elem { /* one tracked program of a prog array */
+	struct list_head list; /* linked into bpf_array_aux poke_progs */
+	struct bpf_prog_aux *aux; /* aux of the tracked program */
+};
+
+static int prog_array_map_poke_track(struct bpf_map *map,
+				     struct bpf_prog_aux *prog_aux)
+{ /* Append prog_aux to the map's poke_progs; returns 0 or -ENOMEM. */
+	struct bpf_array_aux *aux;
+	struct prog_poke_elem *elem;
+
+	elem = kmalloc(sizeof(*elem), GFP_KERNEL);
+	if (!elem)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&elem->list);
+	elem->aux = prog_aux;
+
+	aux = container_of(map, struct bpf_array, map)->aux;
+	mutex_lock(&aux->poke_mutex); /* list changes happen under it */
+	list_add_tail(&elem->list, &aux->poke_progs); /* NOTE(review): no dedup */
+	mutex_unlock(&aux->poke_mutex);
+	return 0;
+}
+
+static void prog_array_map_poke_untrack(struct bpf_map *map,
+					struct bpf_prog_aux *prog_aux)
+{ /* Unlink and free every poke_progs entry that refers to prog_aux. */
+	struct prog_poke_elem *elem, *tmp;
+	struct bpf_array_aux *aux;
+
+	aux = container_of(map, struct bpf_array, map)->aux;
+	mutex_lock(&aux->poke_mutex);
+	list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
+		if (elem->aux == prog_aux) { /* no break: drop all matches */
+			list_del_init(&elem->list);
+			kfree(elem);
+		}
+	}
+	mutex_unlock(&aux->poke_mutex);
+}
+
+static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
+				    struct bpf_prog *old,
+				    struct bpf_prog *new)
+{ /* Repatch the tracked programs' tail call sites for (map, key). */
+	enum bpf_text_poke_type type;
+	struct prog_poke_elem *elem;
+	struct bpf_array_aux *aux;
+
+	if (!old && new)
+		type = BPF_MOD_NOP_TO_JUMP;
+	else if (old && !new)
+		type = BPF_MOD_JUMP_TO_NOP;
+	else if (old && new)
+		type = BPF_MOD_JUMP_TO_JUMP;
+	else /* neither old nor new: nothing to patch */
+		return;
+
+	aux = container_of(map, struct bpf_array, map)->aux;
+	list_for_each_entry(elem, &aux->poke_progs, list) {
+		struct bpf_prog *prog = elem->aux->prog;
+		struct bpf_jit_poke_descriptor *poke;
+		int i, ret;
+
+		/* A prog's kallsyms entry is removed from an RCU callback,
+		 * but untracking is only possible from sleepable context, so
+		 * bpf_arch_text_poke() might no longer recognize the address
+		 * as BPF text. Such programs are unreachable, hence we skip
+		 * them here. Programs whose refcount drops to zero while
+		 * patching is in progress are okay as well since we're
+		 * protected by poke_mutex and programs get untracked before
+		 * their JIT buffer is freed. If that happens and the RCU
+		 * callback also removed the kallsyms entry in the meantime,
+		 * bpf_arch_text_poke() bails out with -EINVAL without
+		 * patching, which is fine. Any other error is a bug.
+		 */
+		if (bpf_prog_under_eviction(prog))
+			continue;
+		for (i = 0; i < prog->aux->size_poke_tab; i++) {
+			poke = &prog->aux->poke_tab[i];
+			if (!READ_ONCE(poke->ip_stable))
+				continue;
+			if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
+				continue;
+			if (poke->tc.map != map || poke->tc.key != key)
+				continue;
+			ret = bpf_arch_text_poke(poke->ip, type,
+						 old ? (u8 *)old->bpf_func +
+						 poke->adj_off : NULL,
+						 new ? (u8 *)new->bpf_func +
+						 poke->adj_off : NULL);
+			/* Only -EINVAL is tolerated, see comment above. */
+			BUG_ON(ret < 0 && ret != -EINVAL);
+		}
+	}
+}
+
 static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
 {
 	struct bpf_array_aux *aux;
@@ -634,6 +741,9 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
 	if (!aux)
 		return ERR_PTR(-ENOMEM);
 
+	INIT_LIST_HEAD(&aux->poke_progs);
+	mutex_init(&aux->poke_mutex);
+
 	map = array_map_alloc(attr);
 	if (IS_ERR(map)) {
 		kfree(aux);
@@ -646,9 +756,14 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
 
 static void prog_array_map_free(struct bpf_map *map)
 {
+	struct prog_poke_elem *elem, *tmp;
 	struct bpf_array_aux *aux;
 
 	aux = container_of(map, struct bpf_array, map)->aux;
+	list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
+		list_del_init(&elem->list);
+		kfree(elem);
+	}
 	kfree(aux);
 	fd_array_map_free(map);
 }
@@ -657,6 +772,9 @@ const struct bpf_map_ops prog_array_map_ops = {
 	.map_alloc_check = fd_array_map_alloc_check,
 	.map_alloc = prog_array_map_alloc,
 	.map_free = prog_array_map_free,
+	.map_poke_track = prog_array_map_poke_track,
+	.map_poke_untrack = prog_array_map_poke_untrack,
+	.map_poke_run = prog_array_map_poke_run,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
 	.map_delete_elem = fd_array_map_delete_elem,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 853dc4538442..39750ef3ef31 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2046,11 +2046,16 @@ static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux)
 
 static void bpf_free_used_maps(struct bpf_prog_aux *aux)
 {
+	struct bpf_map *map;
 	int i;
 
 	bpf_free_cgroup_storage(aux);
-	for (i = 0; i < aux->used_map_cnt; i++)
-		bpf_map_put(aux->used_maps[i]);
+	for (i = 0; i < aux->used_map_cnt; i++) {
+		map = aux->used_maps[i];
+		if (map->ops->map_poke_untrack)
+			map->ops->map_poke_untrack(map, aux);
+		bpf_map_put(map);
+	}
 	kfree(aux->used_maps);
 }
 
-- 
2.21.0




[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux