Hi, On 11/11/2023 9:39 AM, Yonghong Song wrote: > Kirill Shutemov reported significant percpu memory consumption increase after > booting in 288-cpu VM ([1]) due to commit 41a5db8d8161 ("bpf: Add support for > non-fix-size percpu mem allocation"). The percpu memory consumption is > increased from 111MB to 969MB. The number is from /proc/meminfo. > > I tried to reproduce the issue with my local VM which at most supports up to > 255 cpus. With 252 cpus, without the above commit, the percpu memory > consumption immediately after boot is 57MB while with the above commit the > percpu memory consumption is 231MB. > > This is not good since so far percpu memory from bpf memory allocator is not > widely used yet. Let us change pre-allocation in init stage to on-demand > allocation when verifier detects there is a need for percpu memory for bpf > program. With this change, percpu memory consumption after boot can be reduced > significantly. > > [1] https://lore.kernel.org/lkml/20231109154934.4saimljtqx625l3v@xxxxxxxxxxxxxxxxx/ > > Fixes: 41a5db8d8161 ("bpf: Add support for non-fix-size percpu mem allocation") > Reported-and-tested-by: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx> > Signed-off-by: Yonghong Song <yonghong.song@xxxxxxxxx> > --- > include/linux/bpf.h | 2 +- > kernel/bpf/core.c | 8 +++----- > kernel/bpf/verifier.c | 20 ++++++++++++++++++-- > 3 files changed, 22 insertions(+), 8 deletions(-) > > Changelog: > v2 -> v3: > - Use dedicated mutex lock (bpf_percpu_ma_lock) > v1 -> v2: > - Add proper Reported-and-tested-by tag. > - Do a check of !bpf_global_percpu_ma_set before acquiring verifier_lock. 
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index 35bff17396c0..6762dac3ef76 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -56,7 +56,7 @@ extern struct idr btf_idr; > extern spinlock_t btf_idr_lock; > extern struct kobject *btf_kobj; > extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma; > -extern bool bpf_global_ma_set, bpf_global_percpu_ma_set; > +extern bool bpf_global_ma_set; > > typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64); > typedef int (*bpf_iter_init_seq_priv_t)(void *private_data, > diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c > index 08626b519ce2..cd3afe57ece3 100644 > --- a/kernel/bpf/core.c > +++ b/kernel/bpf/core.c > @@ -64,8 +64,8 @@ > #define OFF insn->off > #define IMM insn->imm > > -struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma; > -bool bpf_global_ma_set, bpf_global_percpu_ma_set; > +struct bpf_mem_alloc bpf_global_ma; > +bool bpf_global_ma_set; > > /* No hurry in this branch > * > @@ -2934,9 +2934,7 @@ static int __init bpf_global_ma_init(void) > > ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false); > bpf_global_ma_set = !ret; > - ret = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true); > - bpf_global_percpu_ma_set = !ret; > - return !bpf_global_ma_set || !bpf_global_percpu_ma_set; > + return ret; > } > late_initcall(bpf_global_ma_init); > #endif > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c > index a2267d5ed14e..6da370a047fe 100644 > --- a/kernel/bpf/verifier.c > +++ b/kernel/bpf/verifier.c > @@ -26,6 +26,7 @@ > #include <linux/poison.h> > #include <linux/module.h> > #include <linux/cpumask.h> > +#include <linux/bpf_mem_alloc.h> > #include <net/xdp.h> > > #include "disasm.h" > @@ -41,6 +42,9 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { > #undef BPF_LINK_TYPE > }; > > +struct bpf_mem_alloc bpf_global_percpu_ma; > +static bool bpf_global_percpu_ma_set; > + > /* bpf_check() is a static code analyzer that walks eBPF program > * 
instruction by instruction and updates register/stack state. > * All paths of conditional branches are analyzed until 'bpf_exit' insn. > @@ -336,6 +340,7 @@ struct bpf_kfunc_call_arg_meta { > struct btf *btf_vmlinux; > > static DEFINE_MUTEX(bpf_verifier_lock); > +static DEFINE_MUTEX(bpf_percpu_ma_lock); > > static const struct bpf_line_info * > find_linfo(const struct bpf_verifier_env *env, u32 insn_off) > @@ -12091,8 +12096,19 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, > if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set) > return -ENOMEM; > > - if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && !bpf_global_percpu_ma_set) > - return -ENOMEM; > + if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { > + if (!bpf_global_percpu_ma_set) { > + mutex_lock(&bpf_percpu_ma_lock); > + if (!bpf_global_percpu_ma_set) { > + err = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true); > + if (!err) > + bpf_global_percpu_ma_set = true; > + } A question here: since bpf_global_percpu_ma_set is read outside the lock, do we need a memory barrier to guarantee the memory ordering between the initialization of bpf_global_percpu_ma and the store to bpf_global_percpu_ma_set? > + mutex_unlock(&bpf_percpu_ma_lock); > + if (err) > + return err; > + } > + } > > if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) { > verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");