Re: [PATCH bpf-next v3] bpf: Do not allocate percpu memory at init stage

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




On 11/14/2023 11:23 AM, Yonghong Song wrote:
>
> On 11/13/23 4:42 AM, Hou Tao wrote:
>> Hi,
>>
>> On 11/11/2023 9:39 AM, Yonghong Song wrote:
>>> Kirill Shutemov reported significant percpu memory consumption
>>> increase after
>>> booting in 288-cpu VM ([1]) due to commit 41a5db8d8161 ("bpf: Add
>>> support for
>>> non-fix-size percpu mem allocation"). The percpu memory consumption is
>>> increased from 111MB to 969MB. The number is from /proc/meminfo.
>>>
>>> I tried to reproduce the issue with my local VM which at most
>>> supports upto
>>> 255 cpus. With 252 cpus, without the above commit, the percpu memory
>>> consumption immediately after boot is 57MB while with the above
>>> commit the
>>> percpu memory consumption is 231MB.
>>>
>>> This is not good since so far percpu memory from bpf memory
>>> allocator is not
>>> widely used yet. Let us change pre-allocation in init stage to
>>> on-demand
>>> allocation when verifier detects there is a need of percpu memory
>>> for bpf
>>> program. With this change, percpu memory consumption after boot can
>>> be reduced
>>> signicantly.
>>>
>>>    [1]
>>> https://lore.kernel.org/lkml/20231109154934.4saimljtqx625l3v@xxxxxxxxxxxxxxxxx/
>>>
>>> Fixes: 41a5db8d8161 ("bpf: Add support for non-fix-size percpu mem
>>> allocation")
>>> Reported-and-tested-by: Kirill A. Shutemov
>>> <kirill.shutemov@xxxxxxxxxxxxxxx>
>>> Signed-off-by: Yonghong Song <yonghong.song@xxxxxxxxx>
>>> ---
>>>   include/linux/bpf.h   |  2 +-
>>>   kernel/bpf/core.c     |  8 +++-----
>>>   kernel/bpf/verifier.c | 20 ++++++++++++++++++--
>>>   3 files changed, 22 insertions(+), 8 deletions(-)
>>>
>>> Changelog:
>>>    v2 -> v3:
>>>      - Use dedicated mutex lock (bpf_percpu_ma_lock)
>>>    v1 -> v2:
>>>      - Add proper Reported-and-tested-by tag.
>>>      - Do a check of !bpf_global_percpu_ma_set before acquiring
>>> verifier_lock.
>>>
>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>>> index 35bff17396c0..6762dac3ef76 100644
>>> --- a/include/linux/bpf.h
>>> +++ b/include/linux/bpf.h
>>> @@ -56,7 +56,7 @@ extern struct idr btf_idr;
>>>   extern spinlock_t btf_idr_lock;
>>>   extern struct kobject *btf_kobj;
>>>   extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
>>> -extern bool bpf_global_ma_set, bpf_global_percpu_ma_set;
>>> +extern bool bpf_global_ma_set;
>>>     typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64);
>>>   typedef int (*bpf_iter_init_seq_priv_t)(void *private_data,
>>> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
>>> index 08626b519ce2..cd3afe57ece3 100644
>>> --- a/kernel/bpf/core.c
>>> +++ b/kernel/bpf/core.c
>>> @@ -64,8 +64,8 @@
>>>   #define OFF    insn->off
>>>   #define IMM    insn->imm
>>>   -struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
>>> -bool bpf_global_ma_set, bpf_global_percpu_ma_set;
>>> +struct bpf_mem_alloc bpf_global_ma;
>>> +bool bpf_global_ma_set;
>>>     /* No hurry in this branch
>>>    *
>>> @@ -2934,9 +2934,7 @@ static int __init bpf_global_ma_init(void)
>>>         ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
>>>       bpf_global_ma_set = !ret;
>>> -    ret = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
>>> -    bpf_global_percpu_ma_set = !ret;
>>> -    return !bpf_global_ma_set || !bpf_global_percpu_ma_set;
>>> +    return ret;
>>>   }
>>>   late_initcall(bpf_global_ma_init);
>>>   #endif
>>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>>> index a2267d5ed14e..6da370a047fe 100644
>>> --- a/kernel/bpf/verifier.c
>>> +++ b/kernel/bpf/verifier.c
>>> @@ -26,6 +26,7 @@
>>>   #include <linux/poison.h>
>>>   #include <linux/module.h>
>>>   #include <linux/cpumask.h>
>>> +#include <linux/bpf_mem_alloc.h>
>>>   #include <net/xdp.h>
>>>     #include "disasm.h"
>>> @@ -41,6 +42,9 @@ static const struct bpf_verifier_ops * const
>>> bpf_verifier_ops[] = {
>>>   #undef BPF_LINK_TYPE
>>>   };
>>>   +struct bpf_mem_alloc bpf_global_percpu_ma;
>>> +static bool bpf_global_percpu_ma_set;
>>> +
>>>   /* bpf_check() is a static code analyzer that walks eBPF program
>>>    * instruction by instruction and updates register/stack state.
>>>    * All paths of conditional branches are analyzed until 'bpf_exit'
>>> insn.
>>> @@ -336,6 +340,7 @@ struct bpf_kfunc_call_arg_meta {
>>>   struct btf *btf_vmlinux;
>>>     static DEFINE_MUTEX(bpf_verifier_lock);
>>> +static DEFINE_MUTEX(bpf_percpu_ma_lock);
>>>     static const struct bpf_line_info *
>>>   find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
>>> @@ -12091,8 +12096,19 @@ static int check_kfunc_call(struct
>>> bpf_verifier_env *env, struct bpf_insn *insn,
>>>                   if (meta.func_id ==
>>> special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
>>>                       return -ENOMEM;
>>>   -                if (meta.func_id ==
>>> special_kfunc_list[KF_bpf_percpu_obj_new_impl] &&
>>> !bpf_global_percpu_ma_set)
>>> -                    return -ENOMEM;
>>> +                if (meta.func_id ==
>>> special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
>>> +                    if (!bpf_global_percpu_ma_set) {
>>> +                        mutex_lock(&bpf_percpu_ma_lock);
>>> +                        if (!bpf_global_percpu_ma_set) {
>>> +                            err =
>>> bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
>>> +                            if (!err)
>>> +                                bpf_global_percpu_ma_set = true;
>>> +                        }
>> A dumb question here: do we need some memory barrier to guarantee the
>> memory order between bpf_global_percpu_ma_set and bpf_global_percpu_ma ?
>
> We should be fine. There is a control dependence on '!err' for
> 'bpf_global_percpu_ma_set = true'.

I see. Thanks for the explanation.
>
>>> +                        mutex_unlock(&bpf_percpu_ma_lock);
>>> +                        if (err)
>>> +                            return err;
>>> +                    }
>>> +                }
>>>                     if (((u64)(u32)meta.arg_constant.value) !=
>>> meta.arg_constant.value) {
>>>                       verbose(env, "local type ID argument must be
>>> in range [0, U32_MAX]\n");
>
> .





[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux