A new item 'bpf' is introduced into memory.stat, so that we can get the
memory consumed by bpf. Currently only the memory of bpf-map is
accounted. The accounting of this new item is implemented with
scope-based accounting, which is similar to set_active_memcg(). Within
such a scope, allocated memory is accounted (and freed memory
unaccounted) to a specific item, which is selected by
set_active_memcg_item().

The result in cgroup v1 is as follows:

$ cat /sys/fs/cgroup/memory/foo/memory.stat | grep bpf
bpf 109056000
total_bpf 109056000

After the map is removed, the counter drops back to zero:

$ cat /sys/fs/cgroup/memory/foo/memory.stat | grep bpf
bpf 0
total_bpf 0

'bpf' may not be exactly 0 immediately after the bpf-map is destroyed,
because there may still be cached objects.

Note that there's no kmemcg in the root memory cgroup, so the item
'bpf' will always be 0 in the root memory cgroup. If a bpf-map is
charged into the root memcg directly, its memory size will not be
accounted, so 'total_bpf' can't be used to monitor system-wide bpf
memory consumption yet.

Signed-off-by: Yafang Shao <laoar.shao@xxxxxxxxx>
---
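Note for reviewers (not for the commit log): below is a minimal sketch
of the calling convention the new scope API is meant to support. The
helper my_charged_alloc() is hypothetical and only illustrates an
allocation site; the real wrappers are the bpf_map_*() helpers changed
in this patch.

	/* Hypothetical allocation site, for illustration only. */
	static void *my_charged_alloc(struct bpf_map *map, size_t size)
	{
		struct mem_cgroup *memcg, *old_memcg;
		int old_item;
		void *ptr;

		memcg = bpf_map_get_memcg(map);
		/* Charge the map's memcg rather than current's ... */
		old_memcg = set_active_memcg(memcg);
		/* ... and account the allocation to the 'bpf' item. */
		old_item = set_active_memcg_item(MEMCG_BPF);

		ptr = kmalloc(size, GFP_KERNEL | __GFP_ACCOUNT);

		/* Leave the scopes in reverse order. */
		set_active_memcg_item(old_item);
		set_active_memcg(old_memcg);
		bpf_map_put_memcg(memcg);

		return ptr;
	}

The matching free must be wrapped in the same MEMCG_BPF scope (as is
done for bpf_map_kfree() and friends below), so that the uncharge is
attributed to the same item.
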
 include/linux/bpf.h        | 10 ++++++++--
 include/linux/memcontrol.h |  1 +
 include/linux/sched.h      |  1 +
 include/linux/sched/mm.h   | 24 ++++++++++++++++++++++++
 kernel/bpf/memalloc.c      | 10 ++++++++++
 kernel/bpf/ringbuf.c       |  4 ++++
 kernel/bpf/syscall.c       | 40 ++++++++++++++++++++++++++++++++++++++--
 kernel/fork.c              |  1 +
 mm/memcontrol.c            | 20 ++++++++++++++++++++
 9 files changed, 107 insertions(+), 4 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f7a4cfc..9eda143 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1725,7 +1725,13 @@ void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
 void bpf_map_kvfree(const void *ptr);
 void bpf_map_free_percpu(void __percpu *ptr);
 
-#define bpf_map_kfree_rcu(ptr, rhf...) kvfree_rcu(ptr, ## rhf)
+#define bpf_map_kfree_rcu(ptr, rhf...) do {			\
+	int old_item;						\
+								\
+	old_item = set_active_memcg_item(MEMCG_BPF);		\
+	kvfree_rcu(ptr, ## rhf);				\
+	set_active_memcg_item(old_item);			\
+} while (0)
 
 #else
 
 static inline void *
@@ -1771,7 +1777,7 @@ static inline void bpf_map_free_percpu(void __percpu *ptr)
 
 #define bpf_map_kfree_rcu(ptr, rhf...) kvfree_rcu(ptr, ## rhf)
 
-#endif
+#endif /* CONFIG_MEMCG_KMEM */
 
 extern int sysctl_unprivileged_bpf_disabled;
 
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d4a0ad3..f345467 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -37,6 +37,7 @@ enum memcg_stat_item {
 	MEMCG_KMEM,
 	MEMCG_ZSWAP_B,
 	MEMCG_ZSWAPPED,
+	MEMCG_BPF,
 	MEMCG_NR_STAT,
 };
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e7b2f8a..79362da 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1423,6 +1423,7 @@ struct task_struct {
 
 	/* Used by memcontrol for targeted memcg charge: */
 	struct mem_cgroup		*active_memcg;
+	int				active_item;
 #endif
 
 #ifdef CONFIG_BLK_CGROUP
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 2a24361..3a334c7 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -363,6 +363,7 @@ static inline void memalloc_pin_restore(unsigned int flags)
 
 #ifdef CONFIG_MEMCG
 DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
+DECLARE_PER_CPU(int, int_active_item);
 /**
  * set_active_memcg - Starts the remote memcg charging scope.
  * @memcg: memcg to charge.
@@ -389,12 +390,35 @@ static inline void memalloc_pin_restore(unsigned int flags)
 	return old;
 }
 
+
+static inline int
+set_active_memcg_item(int item)
+{
+	int old_item;
+
+	if (!in_task()) {
+		old_item = this_cpu_read(int_active_item);
+		this_cpu_write(int_active_item, item);
+	} else {
+		old_item = current->active_item;
+		current->active_item = item;
+	}
+
+	return old_item;
+}
+
 #else
 static inline struct mem_cgroup *
 set_active_memcg(struct mem_cgroup *memcg)
 {
 	return NULL;
 }
+
+static inline int
+set_active_memcg_item(int item)
+{
+	return MEMCG_NR_STAT;
+}
 #endif
 
 #ifdef CONFIG_MEMBARRIER
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 5f83be1..51d59d4 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -165,11 +165,14 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
 {
 	struct mem_cgroup *memcg = NULL, *old_memcg;
 	unsigned long flags;
+	int old_item;
 	void *obj;
 	int i;
 
 	memcg = get_memcg(c);
 	old_memcg = set_active_memcg(memcg);
+	old_item = set_active_memcg_item(MEMCG_BPF);
+
 	for (i = 0; i < cnt; i++) {
 		obj = __alloc(c, node);
 		if (!obj)
@@ -194,19 +197,26 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
 		if (IS_ENABLED(CONFIG_PREEMPT_RT))
 			local_irq_restore(flags);
 	}
+
+	set_active_memcg_item(old_item);
 	set_active_memcg(old_memcg);
 	mem_cgroup_put(memcg);
 }
 
 static void free_one(struct bpf_mem_cache *c, void *obj)
 {
+	int old_item;
+
+	old_item = set_active_memcg_item(MEMCG_BPF);
 	if (c->percpu_size) {
 		free_percpu(((void **)obj)[1]);
 		kfree(obj);
+		set_active_memcg_item(old_item);
 		return;
 	}
 
 	kfree(obj);
+	set_active_memcg_item(old_item);
 }
 
 static void __free_rcu(struct rcu_head *head)
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 535e440..72435bd 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -61,7 +61,11 @@ struct bpf_ringbuf_hdr {
 
 static inline void bpf_map_free_page(struct page *page)
 {
+	int old_item;
+
+	old_item = set_active_memcg_item(MEMCG_BPF);
 	__free_page(page);
+	set_active_memcg_item(old_item);
 }
 
 static void bpf_ringbuf_pages_free(struct page **pages, int nr_pages)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b9250c8..703aa6a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -340,11 +340,14 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
 	const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_ACCOUNT;
 	unsigned int flags = 0;
 	unsigned long align = 1;
+	int old_item;
 	void *area;
+	void *ptr;
 
 	if (size >= SIZE_MAX)
 		return NULL;
 
+	old_item = set_active_memcg_item(MEMCG_BPF);
 	/* kmalloc()'ed memory can't be mmap()'ed */
 	if (mmapable) {
 		BUG_ON(!PAGE_ALIGNED(size));
@@ -353,13 +356,18 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
 	} else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
 		area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY,
 				    numa_node);
-		if (area != NULL)
+		if (area != NULL) {
+			set_active_memcg_item(old_item);
 			return area;
+		}
 	}
 
-	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
+	ptr = __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
 			gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL,
 			flags, numa_node, __builtin_return_address(0));
+
+	set_active_memcg_item(old_item);
+	return ptr;
 }
 
 void *bpf_map_area_alloc(u64 size, int numa_node, struct bpf_map *map)
@@ -386,9 +394,13 @@ void *bpf_map_area_mmapable_alloc(u64 size, int numa_node)
 
 void bpf_map_area_free(void *area, struct bpf_map *map)
 {
+	int old_item;
+
 	if (map)
 		bpf_map_release_memcg(map);
+	old_item = set_active_memcg_item(MEMCG_BPF);
 	kvfree(area);
+	set_active_memcg_item(old_item);
 }
 
 static u32 bpf_map_flags_retain_permanent(u32 flags)
@@ -464,11 +476,14 @@ void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
 			   int node)
 {
 	struct mem_cgroup *memcg, *old_memcg;
+	int old_item;
 	void *ptr;
 
 	memcg = bpf_map_get_memcg(map);
 	old_memcg = set_active_memcg(memcg);
+	old_item = set_active_memcg_item(MEMCG_BPF);
 	ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
+	set_active_memcg_item(old_item);
 	set_active_memcg(old_memcg);
 	bpf_map_put_memcg(memcg);
 
@@ -479,10 +494,13 @@ void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
 {
 	struct mem_cgroup *memcg, *old_memcg;
 	void *ptr;
+	int old_item;
 
 	memcg = bpf_map_get_memcg(map);
 	old_memcg = set_active_memcg(memcg);
+	old_item = set_active_memcg_item(MEMCG_BPF);
 	ptr = kzalloc(size, flags | __GFP_ACCOUNT);
+	set_active_memcg_item(old_item);
 	set_active_memcg(old_memcg);
 	bpf_map_put_memcg(memcg);
 
@@ -494,11 +512,14 @@ void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size,
 		       gfp_t flags)
 {
 	struct mem_cgroup *memcg, *old_memcg;
 	void *ptr;
+	int old_item;
 
 	memcg = bpf_map_get_memcg(map);
 	old_memcg = set_active_memcg(memcg);
+	old_item = set_active_memcg_item(MEMCG_BPF);
 	ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT);
 	set_active_memcg(old_memcg);
+	set_active_memcg_item(old_item);
 	bpf_map_put_memcg(memcg);
 
 	return ptr;
@@ -509,10 +530,13 @@ void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
 				    size_t align, gfp_t flags)
 {
 	struct mem_cgroup *memcg, *old_memcg;
 	void __percpu *ptr;
+	int old_item;
 
 	memcg = bpf_map_get_memcg(map);
 	old_memcg = set_active_memcg(memcg);
+	old_item = set_active_memcg_item(MEMCG_BPF);
 	ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT);
+	set_active_memcg_item(old_item);
 	set_active_memcg(old_memcg);
 	bpf_map_put_memcg(memcg);
 
@@ -521,17 +545,29 @@ void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
 
 void bpf_map_kfree(const void *ptr)
 {
+	int old_item;
+
+	old_item = set_active_memcg_item(MEMCG_BPF);
 	kfree(ptr);
+	set_active_memcg_item(old_item);
 }
 
 void bpf_map_kvfree(const void *ptr)
 {
+	int old_item;
+
+	old_item = set_active_memcg_item(MEMCG_BPF);
 	kvfree(ptr);
+	set_active_memcg_item(old_item);
 }
 
 void bpf_map_free_percpu(void __percpu *ptr)
 {
+	int old_item;
+
+	old_item = set_active_memcg_item(MEMCG_BPF);
 	free_percpu(ptr);
+	set_active_memcg_item(old_item);
 }
 #endif
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 90c85b1..dac2429 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1043,6 +1043,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 
 #ifdef CONFIG_MEMCG
 	tsk->active_memcg = NULL;
+	tsk->active_item = 0;
 #endif
 
 #ifdef CONFIG_CPU_SUP_INTEL
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b69979c..9008417 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -82,6 +82,10 @@
 DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
 EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);
 
+/* Active memcg stat item to account from an interrupt context */
+DEFINE_PER_CPU(int, int_active_item);
+EXPORT_PER_CPU_SYMBOL_GPL(int_active_item);
+
 /* Socket memory accounting disabled? */
 static bool cgroup_memory_nosocket __ro_after_init;
 
@@ -923,6 +927,14 @@ static __always_inline struct mem_cgroup *active_memcg(void)
 		return current->active_memcg;
 }
 
+static __always_inline int active_memcg_item(void)
+{
+	if (!in_task())
+		return this_cpu_read(int_active_item);
+
+	return current->active_item;
+}
+
 /**
  * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
  * @mm: mm from which memcg should be extracted. It can be NULL.
@@ -1436,6 +1448,7 @@ struct memory_stat {
 	{ "workingset_restore_anon",	WORKINGSET_RESTORE_ANON	},
 	{ "workingset_restore_file",	WORKINGSET_RESTORE_FILE	},
 	{ "workingset_nodereclaim",	WORKINGSET_NODERECLAIM	},
+	{ "bpf",			MEMCG_BPF		},
 };
 
 /* Translate stat items to the correct unit for memory.stat output */
@@ -2993,6 +3006,11 @@ struct obj_cgroup *get_obj_cgroup_from_page(struct page *page)
 
 static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
 {
+	int item = active_memcg_item();
+
+	WARN_ON_ONCE(item != 0 && (item < MEMCG_SWAP || item >= MEMCG_NR_STAT));
+	if (item)
+		mod_memcg_state(memcg, item, nr_pages);
 	mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
 		if (nr_pages > 0)
@@ -3976,6 +3994,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
 	NR_FILE_DIRTY,
 	NR_WRITEBACK,
 	MEMCG_SWAP,
+	MEMCG_BPF,
 };
 
 static const char *const memcg1_stat_names[] = {
@@ -3989,6 +4008,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
 	"dirty",
 	"writeback",
 	"swap",
+	"bpf",
 };
 
 /* Universal VM events cgroup1 shows, original sort order */
-- 
1.8.3.1