From: Yosry Ahmed <yosryahmed@xxxxxxxxxx> Subject: memcg: add per-memcg total kernel memory stat Currently memcg stats show several types of kernel memory: kernel stack, page tables, sock, vmalloc, and slab. However, there are other allocations with __GFP_ACCOUNT (or supersets such as GFP_KERNEL_ACCOUNT) that are not accounted in any of those stats, a few examples are: - various kvm allocations (e.g. allocated pages to create vcpus) - io_uring - tmp_page in pipes during pipe_write() - bpf ringbuffers - unix sockets Keeping track of the total kernel memory is essential for the ease of migration from cgroup v1 to v2 as there are large discrepancies between v1's kmem.usage_in_bytes and the sum of the available kernel memory stats in v2. Adding separate memcg stats for all __GFP_ACCOUNT kernel allocations is an impractical maintenance burden as there a lot of those all over the kernel code, with more use cases likely to show up in the future. Therefore, add a "kernel" memcg stat that is analogous to kmem page counter, with added benefits such as using rstat infrastructure which aggregates stats more efficiently. Additionally, this provides a lighter alternative in case the legacy kmem is deprecated in the future [yosryahmed@xxxxxxxxxx: v2] Link: https://lkml.kernel.org/r/20220203193856.972500-1-yosryahmed@xxxxxxxxxx Link: https://lkml.kernel.org/r/20220201200823.3283171-1-yosryahmed@xxxxxxxxxx Signed-off-by: Yosry Ahmed <yosryahmed@xxxxxxxxxx> Acked-by: Shakeel Butt <shakeelb@xxxxxxxxxx> Acked-by: Johannes Weiner <hannes@xxxxxxxxxxx> Cc: Michal Hocko <mhocko@xxxxxxxxxx> Cc: Muchun Song <songmuchun@xxxxxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- Documentation/admin-guide/cgroup-v2.rst | 5 ++++ include/linux/memcontrol.h | 1 mm/memcontrol.c | 27 +++++++++++++++++----- 3 files changed, 27 insertions(+), 6 deletions(-) --- a/Documentation/admin-guide/cgroup-v2.rst~memcg-add-per-memcg-total-kernel-memory-stat +++ a/Documentation/admin-guide/cgroup-v2.rst @@ -1301,6 +1301,11 @@ PAGE_SIZE multiple when read back. Amount of memory used to cache filesystem data, including tmpfs and shared memory. + kernel (npn) + Amount of total kernel memory, including + (kernel_stack, pagetables, percpu, vmalloc, slab) in + addition to other kernel memory use cases. + kernel_stack Amount of memory allocated to kernel stacks. --- a/include/linux/memcontrol.h~memcg-add-per-memcg-total-kernel-memory-stat +++ a/include/linux/memcontrol.h @@ -34,6 +34,7 @@ enum memcg_stat_item { MEMCG_SOCK, MEMCG_PERCPU_B, MEMCG_VMALLOC, + MEMCG_KMEM, MEMCG_NR_STAT, }; --- a/mm/memcontrol.c~memcg-add-per-memcg-total-kernel-memory-stat +++ a/mm/memcontrol.c @@ -1371,6 +1371,7 @@ struct memory_stat { static const struct memory_stat memory_stats[] = { { "anon", NR_ANON_MAPPED }, { "file", NR_FILE_PAGES }, + { "kernel", MEMCG_KMEM }, { "kernel_stack", NR_KERNEL_STACK_KB }, { "pagetables", NR_PAGETABLE }, { "percpu", MEMCG_PERCPU_B }, @@ -2114,6 +2115,7 @@ static DEFINE_MUTEX(percpu_charge_mutex) static void drain_obj_stock(struct obj_stock *stock); static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, struct mem_cgroup *root_memcg); +static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages); #else static inline void drain_obj_stock(struct obj_stock *stock) @@ -2124,6 +2126,9 @@ static bool obj_stock_flush_required(str { return false; } +static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages) +{ +} #endif /** @@ -2979,6 +2984,18 @@ static void memcg_free_cache_id(int id) ida_simple_remove(&memcg_cache_ida, id); } +static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages) +{ + mod_memcg_state(memcg, MEMCG_KMEM, nr_pages); + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { + if (nr_pages > 0) + page_counter_charge(&memcg->kmem, nr_pages); + else + page_counter_uncharge(&memcg->kmem, -nr_pages); + } +} + + /* * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg * @objcg: object cgroup to uncharge @@ -2991,8 +3008,7 @@ static void obj_cgroup_uncharge_pages(st memcg = get_mem_cgroup_from_objcg(objcg); - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - page_counter_uncharge(&memcg->kmem, nr_pages); + memcg_account_kmem(memcg, -nr_pages); refill_stock(memcg, nr_pages); css_put(&memcg->css); @@ -3018,8 +3034,7 @@ static int obj_cgroup_charge_pages(struc if (ret) goto out; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - page_counter_charge(&memcg->kmem, nr_pages); + memcg_account_kmem(memcg, nr_pages); out: css_put(&memcg->css); @@ -6801,8 +6816,8 @@ static void uncharge_batch(const struct page_counter_uncharge(&ug->memcg->memory, ug->nr_memory); if (do_memsw_account()) page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory); - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem) - page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem); + if (ug->nr_kmem) + memcg_account_kmem(ug->memcg, -ug->nr_kmem); memcg_oom_recover(ug->memcg); } _