On 11/14/2023 11:23 AM, Yonghong Song wrote: > > On 11/13/23 4:42 AM, Hou Tao wrote: >> Hi, >> >> On 11/11/2023 9:39 AM, Yonghong Song wrote: >>> Kirill Shutemov reported significant percpu memory consumption >>> increase after >>> booting in 288-cpu VM ([1]) due to commit 41a5db8d8161 ("bpf: Add >>> support for >>> non-fix-size percpu mem allocation"). The percpu memory consumption is >>> increased from 111MB to 969MB. The number is from /proc/meminfo. >>> >>> I tried to reproduce the issue with my local VM which at most >>> supports up to >>> 255 cpus. With 252 cpus, without the above commit, the percpu memory >>> consumption immediately after boot is 57MB while with the above >>> commit the >>> percpu memory consumption is 231MB. >>> >>> This is not good since so far percpu memory from bpf memory >>> allocator is not >>> widely used yet. Let us change pre-allocation in init stage to >>> on-demand >>> allocation when verifier detects there is a need of percpu memory >>> for bpf >>> program. With this change, percpu memory consumption after boot can >>> be reduced >>> significantly. >>> >>> [1] >>> https://lore.kernel.org/lkml/20231109154934.4saimljtqx625l3v@xxxxxxxxxxxxxxxxx/ >>> >>> Fixes: 41a5db8d8161 ("bpf: Add support for non-fix-size percpu mem >>> allocation") >>> Reported-and-tested-by: Kirill A. Shutemov >>> <kirill.shutemov@xxxxxxxxxxxxxxx> >>> Signed-off-by: Yonghong Song <yonghong.song@xxxxxxxxx> >>> --- >>> include/linux/bpf.h | 2 +- >>> kernel/bpf/core.c | 8 +++----- >>> kernel/bpf/verifier.c | 20 ++++++++++++++++++-- >>> 3 files changed, 22 insertions(+), 8 deletions(-) >>> >>> Changelog: >>> v2 -> v3: >>> - Use dedicated mutex lock (bpf_percpu_ma_lock) >>> v1 -> v2: >>> - Add proper Reported-and-tested-by tag. >>> - Do a check of !bpf_global_percpu_ma_set before acquiring >>> verifier_lock. 
>>> >>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h >>> index 35bff17396c0..6762dac3ef76 100644 >>> --- a/include/linux/bpf.h >>> +++ b/include/linux/bpf.h >>> @@ -56,7 +56,7 @@ extern struct idr btf_idr; >>> extern spinlock_t btf_idr_lock; >>> extern struct kobject *btf_kobj; >>> extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma; >>> -extern bool bpf_global_ma_set, bpf_global_percpu_ma_set; >>> +extern bool bpf_global_ma_set; >>> typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64); >>> typedef int (*bpf_iter_init_seq_priv_t)(void *private_data, >>> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c >>> index 08626b519ce2..cd3afe57ece3 100644 >>> --- a/kernel/bpf/core.c >>> +++ b/kernel/bpf/core.c >>> @@ -64,8 +64,8 @@ >>> #define OFF insn->off >>> #define IMM insn->imm >>> -struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma; >>> -bool bpf_global_ma_set, bpf_global_percpu_ma_set; >>> +struct bpf_mem_alloc bpf_global_ma; >>> +bool bpf_global_ma_set; >>> /* No hurry in this branch >>> * >>> @@ -2934,9 +2934,7 @@ static int __init bpf_global_ma_init(void) >>> ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false); >>> bpf_global_ma_set = !ret; >>> - ret = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true); >>> - bpf_global_percpu_ma_set = !ret; >>> - return !bpf_global_ma_set || !bpf_global_percpu_ma_set; >>> + return ret; >>> } >>> late_initcall(bpf_global_ma_init); >>> #endif >>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c >>> index a2267d5ed14e..6da370a047fe 100644 >>> --- a/kernel/bpf/verifier.c >>> +++ b/kernel/bpf/verifier.c >>> @@ -26,6 +26,7 @@ >>> #include <linux/poison.h> >>> #include <linux/module.h> >>> #include <linux/cpumask.h> >>> +#include <linux/bpf_mem_alloc.h> >>> #include <net/xdp.h> >>> #include "disasm.h" >>> @@ -41,6 +42,9 @@ static const struct bpf_verifier_ops * const >>> bpf_verifier_ops[] = { >>> #undef BPF_LINK_TYPE >>> }; >>> +struct bpf_mem_alloc bpf_global_percpu_ma; >>> +static bool 
bpf_global_percpu_ma_set; >>> + >>> /* bpf_check() is a static code analyzer that walks eBPF program >>> * instruction by instruction and updates register/stack state. >>> * All paths of conditional branches are analyzed until 'bpf_exit' >>> insn. >>> @@ -336,6 +340,7 @@ struct bpf_kfunc_call_arg_meta { >>> struct btf *btf_vmlinux; >>> static DEFINE_MUTEX(bpf_verifier_lock); >>> +static DEFINE_MUTEX(bpf_percpu_ma_lock); >>> static const struct bpf_line_info * >>> find_linfo(const struct bpf_verifier_env *env, u32 insn_off) >>> @@ -12091,8 +12096,19 @@ static int check_kfunc_call(struct >>> bpf_verifier_env *env, struct bpf_insn *insn, >>> if (meta.func_id == >>> special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set) >>> return -ENOMEM; >>> - if (meta.func_id == >>> special_kfunc_list[KF_bpf_percpu_obj_new_impl] && >>> !bpf_global_percpu_ma_set) >>> - return -ENOMEM; >>> + if (meta.func_id == >>> special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { >>> + if (!bpf_global_percpu_ma_set) { >>> + mutex_lock(&bpf_percpu_ma_lock); >>> + if (!bpf_global_percpu_ma_set) { >>> + err = >>> bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true); >>> + if (!err) >>> + bpf_global_percpu_ma_set = true; >>> + } >> A dumb question here: do we need some memory barrier to guarantee the >> memory order between bpf_global_percpu_ma_set and bpf_global_percpu_ma ? > > We should be fine. There is a control dependence on '!err' for > 'bpf_global_percpu_ma_set = true'. I see. Thanks for the explanation. > >>> + mutex_unlock(&bpf_percpu_ma_lock); >>> + if (err) >>> + return err; >>> + } >>> + } >>> if (((u64)(u32)meta.arg_constant.value) != >>> meta.arg_constant.value) { >>> verbose(env, "local type ID argument must be >>> in range [0, U32_MAX]\n"); > > .