Percpu memory can represent a noticeable chunk of the total memory
consumption, especially on big machines with many CPUs.  Let's track
percpu memory usage for each memcg and display it in memory.stat.

A percpu allocation is usually scattered over multiple pages (and
nodes), and can be significantly smaller than a page.  So let's add
a byte-sized counter on the memcg level: MEMCG_PERCPU_B.  The
byte-sized vmstat infra created for slabs can be perfectly reused
for the percpu case.

Signed-off-by: Roman Gushchin <guro@xxxxxx>
Acked-by: Dennis Zhou <dennis@xxxxxxxxxx>
---
 Documentation/admin-guide/cgroup-v2.rst |  4 ++++
 include/linux/memcontrol.h              |  8 ++++++++
 mm/memcontrol.c                         |  4 +++-
 mm/percpu.c                             | 10 ++++++++++
 4 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index d09471aa7443..0715ae78ca54 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1274,6 +1274,10 @@ PAGE_SIZE multiple when read back.
 	  Amount of memory used for storing in-kernel data
 	  structures.
 
+	  percpu
+		Amount of memory used for storing per-cpu kernel
+		data structures.
+
 	  sock
 		Amount of memory used in network transmission buffers
 
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5a8b62d075e6..d71f8a45918c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -32,11 +32,19 @@ struct kmem_cache;
 enum memcg_stat_item {
 	MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS,
 	MEMCG_SOCK,
+	MEMCG_PERCPU_B,
 	/* XXX: why are these zone and not node counters? */
 	MEMCG_KERNEL_STACK_KB,
 	MEMCG_NR_STAT,
 };
 
+static __always_inline bool memcg_stat_item_in_bytes(int idx)
+{
+	if (idx == MEMCG_PERCPU_B)
+		return true;
+	return vmstat_item_in_bytes(idx);
+}
+
 enum memcg_memory_event {
 	MEMCG_LOW,
 	MEMCG_HIGH,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1b858cd18b52..e8ec2498a6cf 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -783,7 +783,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 	if (mem_cgroup_disabled())
 		return;
 
-	if (vmstat_item_in_bytes(idx))
+	if (memcg_stat_item_in_bytes(idx))
 		threshold <<= PAGE_SHIFT;
 
 	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
@@ -1490,6 +1490,8 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
 	seq_buf_printf(&s, "slab %llu\n",
 		       (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
 			     memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B)));
+	seq_buf_printf(&s, "percpu %llu\n",
+		       (u64)memcg_page_state(memcg, MEMCG_PERCPU_B));
 	seq_buf_printf(&s, "sock %llu\n",
 		       (u64)memcg_page_state(memcg, MEMCG_SOCK) *
 		       PAGE_SIZE);
diff --git a/mm/percpu.c b/mm/percpu.c
index c5b4f232bf37..d7717438ea8b 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1610,6 +1610,11 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
 
 	if (chunk) {
 		chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg;
+
+		rcu_read_lock();
+		mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
+				size * num_possible_cpus());
+		rcu_read_unlock();
 	} else {
 		obj_cgroup_uncharge(objcg, size * num_possible_cpus());
 		obj_cgroup_put(objcg);
@@ -1628,6 +1633,11 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
 
 	obj_cgroup_uncharge(objcg, size * num_possible_cpus());
 
+	rcu_read_lock();
+	mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
+			-(size * num_possible_cpus()));
+	rcu_read_unlock();
+
 	obj_cgroup_put(objcg);
 }
 
-- 
2.26.2
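
As an illustration (not part of the patch), here is a minimal userspace
sketch that reads the new counter back.  It assumes cgroup v2 is mounted
at /sys/fs/cgroup and uses a placeholder cgroup name ("test"); with this
patch applied, memory.stat gains a "percpu" key whose value is in bytes,
like every other memory.stat entry:

/* Sketch: read the "percpu" counter from a cgroup's memory.stat.
 * Assumes cgroup v2 mounted at /sys/fs/cgroup; the cgroup name
 * ("test") is a placeholder.  All memory.stat values are in bytes.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/fs/cgroup/test/memory.stat", "r");
	char line[256];

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		unsigned long long bytes;

		/* match the "percpu <value>" line added by this patch */
		if (sscanf(line, "percpu %llu", &bytes) == 1)
			printf("percpu: %llu bytes\n", bytes);
	}
	fclose(f);
	return 0;
}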