From: Hou Tao <houtao1@xxxxxxxxxx> When updating or deleting an inner map in map array or map htab, the map may still be accessed by non-sleepable program or sleepable program. However bpf_map_fd_put_ptr() decreases the ref-count of the inner map directly through bpf_map_put(), if the ref-count is the last ref-count (which is true for most cases), the inner map will be free by ops->map_free() in a kworker. But for now, most .map_free() callbacks don't use synchronize_rcu() or its variants to wait for the elapse of a RCU grace period, so after the invocation of ops->map_free completes, the bpf program which is accessing the inner map may incur use-after-free problem. Fix it by telling bpf_map_free_deferred() to wait for both one RCU grace period and one tasks trace RCU grace period before calling ops->map_free() for inner map. Fixes: bba1dc0b55ac ("bpf: Remove redundant synchronize_rcu.") Fixes: 638e4b825d52 ("bpf: Allows per-cpu maps and map-in-map in sleepable programs") Signed-off-by: Hou Tao <houtao1@xxxxxxxxxx> --- include/linux/bpf.h | 1 + kernel/bpf/map_in_map.c | 11 ++++++++--- kernel/bpf/syscall.c | 5 +++++ 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7aaf625191fca..ec3c90202ffe6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -292,6 +292,7 @@ struct bpf_map { } owner; bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ + bool free_after_mult_rcu_gp; s64 __percpu *elem_count; }; diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 7d5754d18fd04..cf33630655661 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -129,10 +129,15 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map, void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool deferred) { - /* ptr->ops->map_free() has to go through one - * rcu grace period by itself. + struct bpf_map *inner_map = ptr; + + /* The inner map may still be used by both non-sleepable and sleepable + * bpf program, so free it after one RCU grace period and one tasks + * trace RCU grace period. */ - bpf_map_put(ptr); + if (deferred) + WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true); + bpf_map_put(inner_map); } u32 bpf_map_fd_sys_lookup_elem(void *ptr) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a144eb286974b..e2d2701ce2c45 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -35,6 +35,7 @@ #include <linux/rcupdate_trace.h> #include <linux/memcontrol.h> #include <linux/trace_events.h> +#include <linux/rcupdate_wait.h> #include <net/netfilter/nf_bpf_link.h> #include <net/netkit.h> @@ -696,6 +697,10 @@ static void bpf_map_free_deferred(struct work_struct *work) security_bpf_map_free(map); bpf_map_release_memcg(map); + + if (READ_ONCE(map->free_after_mult_rcu_gp)) + synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace); + /* implementation dependent freeing */ map->ops->map_free(map); /* Delay freeing of btf_record for maps, as map_free -- 2.29.2