From: Alexei Starovoitov <ast@xxxxxxxxxx> User space might be creating and destroying a lot of hash maps. Synchronous rcu_barrier-s in a destruction path of hash map delay freeing of hash buckets and other map memory and may cause artificial OOM situation under stress. Optimize rcu_barrier usage between bpf hash map and bpf_mem_alloc: - remove rcu_barrier from hash map, since htab doesn't use call_rcu directly and there are no callback to wait for. - bpf_mem_alloc has call_rcu_in_progress flag that indicates pending callbacks. Use it to avoid barriers in fast path. - When barriers are needed copy bpf_mem_alloc into temp structure and wait for rcu barrier-s in the worker to let the rest of hash map freeing to proceed. Signed-off-by: Alexei Starovoitov <ast@xxxxxxxxxx> --- include/linux/bpf_mem_alloc.h | 2 + kernel/bpf/hashtab.c | 6 +-- kernel/bpf/memalloc.c | 80 ++++++++++++++++++++++++++++------- 3 files changed, 69 insertions(+), 19 deletions(-) diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index 653ed1584a03..3e164b8efaa9 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -3,6 +3,7 @@ #ifndef _BPF_MEM_ALLOC_H #define _BPF_MEM_ALLOC_H #include <linux/compiler_types.h> +#include <linux/workqueue.h> struct bpf_mem_cache; struct bpf_mem_caches; @@ -10,6 +11,7 @@ struct bpf_mem_caches; struct bpf_mem_alloc { struct bpf_mem_caches __percpu *caches; struct bpf_mem_cache __percpu *cache; + struct work_struct work; }; int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index a77b9c4a4e48..0fe3f136cbbe 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1546,10 +1546,10 @@ static void htab_map_free(struct bpf_map *map) * There is no need to synchronize_rcu() here to protect map elements. */ - /* some of free_htab_elem() callbacks for elements of this map may - * not have executed. Wait for them. + /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it + * underneath and is reponsible for waiting for callbacks to finish + * during bpf_mem_alloc_destroy(). */ - rcu_barrier(); if (!htab_is_prealloc(htab)) { delete_all_elements(htab); } else { diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 38fbd15c130a..5cc952da7d41 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -414,10 +414,9 @@ static void drain_mem_cache(struct bpf_mem_cache *c) { struct llist_node *llnode, *t; - /* The caller has done rcu_barrier() and no progs are using this - * bpf_mem_cache, but htab_map_free() called bpf_mem_cache_free() for - * all remaining elements and they can be in free_by_rcu or in - * waiting_for_gp lists, so drain those lists now. + /* No progs are using this bpf_mem_cache, but htab_map_free() called + * bpf_mem_cache_free() for all remaining elements and they can be in + * free_by_rcu or in waiting_for_gp lists, so drain those lists now. */ llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu)) free_one(c, llnode); @@ -429,42 +428,91 @@ static void drain_mem_cache(struct bpf_mem_cache *c) free_one(c, llnode); } +static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) +{ + free_percpu(ma->cache); + free_percpu(ma->caches); + ma->cache = NULL; + ma->caches = NULL; +} + +static void free_mem_alloc(struct bpf_mem_alloc *ma) +{ + /* waiting_for_gp lists was drained, but __free_rcu might + * still execute. Wait for it now before we freeing percpu caches. + */ + rcu_barrier_tasks_trace(); + rcu_barrier(); + free_mem_alloc_no_barrier(ma); +} + +static void free_mem_alloc_deferred(struct work_struct *work) +{ + struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work); + + free_mem_alloc(ma); + kfree(ma); +} + +static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress) +{ + struct bpf_mem_alloc *copy; + + if (!rcu_in_progress) { + /* Fast path. No callbacks are pending, hence no need to do + * rcu_barrier-s. + */ + free_mem_alloc_no_barrier(ma); + return; + } + + copy = kmalloc(sizeof(*ma), GFP_KERNEL); + if (!copy) { + /* Slow path with inline barrier-s */ + free_mem_alloc(ma); + return; + } + + /* Defer barriers into worker to let the rest of map memory to be freed */ + copy->cache = ma->cache; + ma->cache = NULL; + copy->caches = ma->caches; + ma->caches = NULL; + INIT_WORK(©->work, free_mem_alloc_deferred); + queue_work(system_unbound_wq, ©->work); +} + void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) { struct bpf_mem_caches *cc; struct bpf_mem_cache *c; - int cpu, i; + int cpu, i, rcu_in_progress; if (ma->cache) { + rcu_in_progress = 0; for_each_possible_cpu(cpu) { c = per_cpu_ptr(ma->cache, cpu); drain_mem_cache(c); + rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } /* objcg is the same across cpus */ if (c->objcg) obj_cgroup_put(c->objcg); - /* c->waiting_for_gp list was drained, but __free_rcu might - * still execute. Wait for it now before we free 'c'. - */ - rcu_barrier_tasks_trace(); - rcu_barrier(); - free_percpu(ma->cache); - ma->cache = NULL; + destroy_mem_alloc(ma, rcu_in_progress); } if (ma->caches) { + rcu_in_progress = 0; for_each_possible_cpu(cpu) { cc = per_cpu_ptr(ma->caches, cpu); for (i = 0; i < NUM_CACHES; i++) { c = &cc->cache[i]; drain_mem_cache(c); + rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } } if (c->objcg) obj_cgroup_put(c->objcg); - rcu_barrier_tasks_trace(); - rcu_barrier(); - free_percpu(ma->caches); - ma->caches = NULL; + destroy_mem_alloc(ma, rcu_in_progress); } } -- 2.30.2