From: Alexey Gladkov <legion@xxxxxxxxxx> The /proc/meminfo file contains information that does not take cgroup restrictions into account. This file is still widely used [1], which means that all the programs reading it will not work correctly inside a container [2][3][4]. Some programs try to respect the cgroup limits, but not all of them implement support for all cgroup versions [5]. Correct information can be obtained from cgroups, but this requires cgroups to be available inside the container and the correct version of cgroups to be supported. There is lxcfs [6], which emulates /proc/meminfo using FUSE to provide cgroup-aware information. This patch can help such projects. This patch adds /proc/self/meminfo that contains a subset of /proc/meminfo respecting cgroup restrictions. We cannot just create /proc/self/meminfo and make a symlink at the old location because this will break the existing AppArmor rules [7]. Therefore, the patch adds a separate file with the same format. [1] https://codesearch.debian.net/search?q=%2Fproc%2Fmeminfo [2] https://sources.debian.org/src/erlang/1:23.2.6+dfsg-1/lib/os_mon/c_src/memsup.c#L300 [3] https://sources.debian.org/src/p7zip/16.02+dfsg-8/CPP/Windows/System.cpp/#L103 [4] https://sources.debian.org/src/systemd/247.3-5/src/oom/oomd.c/#L138 [5] https://sources.debian.org/src/nodejs/12.21.0%7Edfsg-4/deps/uv/src/unix/linux-core.c/#L1059 [6] https://linuxcontainers.org/lxcfs/ [7] https://gitlab.com/apparmor/apparmor/-/blob/master/profiles/apparmor.d/abstractions/base#L98 Signed-off-by: Alexey Gladkov <legion@xxxxxxxxxx> --- fs/proc/base.c | 2 + fs/proc/internal.h | 6 ++ fs/proc/meminfo.c | 160 +++++++++++++++++++++++-------------- include/linux/memcontrol.h | 2 + include/linux/mm.h | 15 ++++ mm/memcontrol.c | 80 +++++++++++++++++++ mm/page_alloc.c | 28 ++++--- 7 files changed, 222 insertions(+), 71 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 58bbf334265b..e95837cf713f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3269,6 +3269,7 @@ static 
const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_SECCOMP_CACHE_DEBUG ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), #endif + ONE("meminfo", S_IRUGO, proc_meminfo_show), }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) @@ -3602,6 +3603,7 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_SECCOMP_CACHE_DEBUG ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), #endif + ONE("meminfo", S_IRUGO, proc_meminfo_show), }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 03415f3fb3a8..a6e8540afbd3 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -241,6 +241,12 @@ extern int proc_net_init(void); static inline int proc_net_init(void) { return 0; } #endif +/* + * meminfo.c + */ +extern int proc_meminfo_show(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk); + /* * proc_self.c */ diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 6fa761c9cc78..3587a79d4b96 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -16,6 +16,9 @@ #ifdef CONFIG_CMA #include <linux/cma.h> #endif +#ifdef CONFIG_MEMCG +#include <linux/memcontrol.h> +#endif #include <asm/page.h> #include "internal.h" @@ -23,91 +26,112 @@ void __attribute__((weak)) arch_report_meminfo(struct seq_file *m) { } +static void proc_fill_meminfo(struct meminfo *mi) +{ + int lru; + long cached; + + si_meminfo(&mi->si); + si_swapinfo(&mi->si); + + for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) + mi->pages[lru] = global_node_page_state(NR_LRU_BASE + lru); + + cached = global_node_page_state(NR_FILE_PAGES) - total_swapcache_pages() - mi->si.bufferram; + if (cached < 0) + cached = 0; + + mi->cached = cached; + mi->swapcached = total_swapcache_pages(); + mi->slab_reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B); + mi->slab_unreclaimable = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B); + 
mi->anon_pages = global_node_page_state(NR_ANON_MAPPED); + mi->mapped = global_node_page_state(NR_FILE_MAPPED); + mi->nr_pagetable = global_node_page_state(NR_PAGETABLE); + mi->dirty_pages = global_node_page_state(NR_FILE_DIRTY); + mi->writeback_pages = global_node_page_state(NR_WRITEBACK); +} + +#ifdef CONFIG_MEMCG +static inline void fill_meminfo(struct meminfo *mi, struct task_struct *task) +{ + mem_fill_meminfo(mi, task); +} +#else +static inline void fill_meminfo(struct meminfo *mi, struct task_struct *task) +{ + proc_fill_meminfo(mi); +} +#endif + static void show_val_kb(struct seq_file *m, const char *s, unsigned long num) { seq_put_decimal_ull_width(m, s, num << (PAGE_SHIFT - 10), 8); seq_write(m, " kB\n", 4); } +static int meminfo_proc_show_mi(struct seq_file *m, struct meminfo *mi) +{ + show_val_kb(m, "MemTotal: ", mi->si.totalram); + show_val_kb(m, "MemFree: ", mi->si.freeram); + show_val_kb(m, "MemAvailable: ", si_mem_available_mi(mi)); + show_val_kb(m, "Buffers: ", mi->si.bufferram); + show_val_kb(m, "Cached: ", mi->cached); + show_val_kb(m, "SwapCached: ", mi->swapcached); + show_val_kb(m, "Active: ", mi->pages[LRU_ACTIVE_ANON] + mi->pages[LRU_ACTIVE_FILE]); + show_val_kb(m, "Inactive: ", mi->pages[LRU_INACTIVE_ANON] + mi->pages[LRU_INACTIVE_FILE]); + show_val_kb(m, "Active(anon): ", mi->pages[LRU_ACTIVE_ANON]); + show_val_kb(m, "Inactive(anon): ", mi->pages[LRU_INACTIVE_ANON]); + show_val_kb(m, "Active(file): ", mi->pages[LRU_ACTIVE_FILE]); + show_val_kb(m, "Inactive(file): ", mi->pages[LRU_INACTIVE_FILE]); + show_val_kb(m, "Unevictable: ", mi->pages[LRU_UNEVICTABLE]); + +#ifdef CONFIG_HIGHMEM + show_val_kb(m, "HighTotal: ", mi->si.totalhigh); + show_val_kb(m, "HighFree: ", mi->si.freehigh); + show_val_kb(m, "LowTotal: ", mi->si.totalram - mi->si.totalhigh); + show_val_kb(m, "LowFree: ", mi->si.freeram - mi->si.freehigh); +#endif + + show_val_kb(m, "SwapTotal: ", mi->si.totalswap); + show_val_kb(m, "SwapFree: ", mi->si.freeswap); + show_val_kb(m, 
"Dirty: ", mi->dirty_pages); + show_val_kb(m, "Writeback: ", mi->writeback_pages); + + show_val_kb(m, "AnonPages: ", mi->anon_pages); + show_val_kb(m, "Mapped: ", mi->mapped); + show_val_kb(m, "Shmem: ", mi->si.sharedram); + show_val_kb(m, "Slab: ", mi->slab_reclaimable + mi->slab_unreclaimable); + show_val_kb(m, "SReclaimable: ", mi->slab_reclaimable); + show_val_kb(m, "SUnreclaim: ", mi->slab_unreclaimable); + show_val_kb(m, "PageTables: ", mi->nr_pagetable); + + return 0; +} + static int meminfo_proc_show(struct seq_file *m, void *v) { - struct sysinfo i; - unsigned long committed; - long cached; - long available; - unsigned long pages[NR_LRU_LISTS]; - unsigned long sreclaimable, sunreclaim; - int lru; - si_meminfo(&i); - si_swapinfo(&i); - committed = vm_memory_committed(); + struct meminfo mi; - cached = global_node_page_state(NR_FILE_PAGES) - - total_swapcache_pages() - i.bufferram; - if (cached < 0) - cached = 0; + proc_fill_meminfo(&mi); + meminfo_proc_show_mi(m, &mi); - for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) - pages[lru] = global_node_page_state(NR_LRU_BASE + lru); - - available = si_mem_available(); - sreclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B); - sunreclaim = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B); - - show_val_kb(m, "MemTotal: ", i.totalram); - show_val_kb(m, "MemFree: ", i.freeram); - show_val_kb(m, "MemAvailable: ", available); - show_val_kb(m, "Buffers: ", i.bufferram); - show_val_kb(m, "Cached: ", cached); - show_val_kb(m, "SwapCached: ", total_swapcache_pages()); - show_val_kb(m, "Active: ", pages[LRU_ACTIVE_ANON] + - pages[LRU_ACTIVE_FILE]); - show_val_kb(m, "Inactive: ", pages[LRU_INACTIVE_ANON] + - pages[LRU_INACTIVE_FILE]); - show_val_kb(m, "Active(anon): ", pages[LRU_ACTIVE_ANON]); - show_val_kb(m, "Inactive(anon): ", pages[LRU_INACTIVE_ANON]); - show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]); - show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]); - show_val_kb(m, "Unevictable: 
", pages[LRU_UNEVICTABLE]); show_val_kb(m, "Mlocked: ", global_zone_page_state(NR_MLOCK)); -#ifdef CONFIG_HIGHMEM - show_val_kb(m, "HighTotal: ", i.totalhigh); - show_val_kb(m, "HighFree: ", i.freehigh); - show_val_kb(m, "LowTotal: ", i.totalram - i.totalhigh); - show_val_kb(m, "LowFree: ", i.freeram - i.freehigh); -#endif - #ifndef CONFIG_MMU show_val_kb(m, "MmapCopy: ", (unsigned long)atomic_long_read(&mmap_pages_allocated)); #endif - show_val_kb(m, "SwapTotal: ", i.totalswap); - show_val_kb(m, "SwapFree: ", i.freeswap); - show_val_kb(m, "Dirty: ", - global_node_page_state(NR_FILE_DIRTY)); - show_val_kb(m, "Writeback: ", - global_node_page_state(NR_WRITEBACK)); - show_val_kb(m, "AnonPages: ", - global_node_page_state(NR_ANON_MAPPED)); - show_val_kb(m, "Mapped: ", - global_node_page_state(NR_FILE_MAPPED)); - show_val_kb(m, "Shmem: ", i.sharedram); - show_val_kb(m, "KReclaimable: ", sreclaimable + + show_val_kb(m, "KReclaimable: ", mi.slab_reclaimable + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE)); - show_val_kb(m, "Slab: ", sreclaimable + sunreclaim); - show_val_kb(m, "SReclaimable: ", sreclaimable); - show_val_kb(m, "SUnreclaim: ", sunreclaim); seq_printf(m, "KernelStack: %8lu kB\n", global_node_page_state(NR_KERNEL_STACK_KB)); #ifdef CONFIG_SHADOW_CALL_STACK seq_printf(m, "ShadowCallStack:%8lu kB\n", global_node_page_state(NR_KERNEL_SCS_KB)); #endif - show_val_kb(m, "PageTables: ", - global_node_page_state(NR_PAGETABLE)); show_val_kb(m, "NFS_Unstable: ", 0); show_val_kb(m, "Bounce: ", @@ -115,7 +139,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "WritebackTmp: ", global_node_page_state(NR_WRITEBACK_TEMP)); show_val_kb(m, "CommitLimit: ", vm_commit_limit()); - show_val_kb(m, "Committed_AS: ", committed); + show_val_kb(m, "Committed_AS: ", vm_memory_committed()); seq_printf(m, "VmallocTotal: %8lu kB\n", (unsigned long)VMALLOC_TOTAL >> 10); show_val_kb(m, "VmallocUsed: ", vmalloc_nr_pages()); @@ -153,6 +177,20 @@ static int 
meminfo_proc_show(struct seq_file *m, void *v) return 0; } +int proc_meminfo_show(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct meminfo mi; + + fill_meminfo(&mi, task); + + meminfo_proc_show_mi(m, &mi); + hugetlb_report_meminfo(m); + arch_report_meminfo(m); + + return 0; +} + static int __init proc_meminfo_init(void) { proc_create_single("meminfo", 0, NULL, meminfo_proc_show); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c193be760709..4a7e2894954f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1119,6 +1119,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); +void mem_fill_meminfo(struct meminfo *mi, struct task_struct *task); + #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 diff --git a/include/linux/mm.h b/include/linux/mm.h index c274f75efcf9..7faeaddd5b88 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2467,6 +2467,20 @@ static inline int early_pfn_to_nid(unsigned long pfn) extern int __meminit early_pfn_to_nid(unsigned long pfn); #endif +struct meminfo { + struct sysinfo si; + unsigned long pages[NR_LRU_LISTS]; + unsigned long cached; + unsigned long swapcached; + unsigned long anon_pages; + unsigned long mapped; + unsigned long nr_pagetable; + unsigned long dirty_pages; + unsigned long writeback_pages; + unsigned long slab_reclaimable; + unsigned long slab_unreclaimable; +}; + extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, @@ -2477,6 +2491,7 @@ extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); extern void __init mmap_init(void); extern void show_mem(unsigned int flags, nodemask_t *nodemask); +extern long si_mem_available_mi(struct meminfo *mi); extern long si_mem_available(void); extern void 
si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 64ada9e650a5..344b546f9e25 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3750,6 +3750,86 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, return nr; } +static void mem_cgroup_nr_pages(struct mem_cgroup *memcg, int nid, unsigned long *pages) +{ + struct mem_cgroup *iter; + int i; + + for_each_mem_cgroup_tree(iter, memcg) { + for (i = 0; i < NR_LRU_LISTS; i++) + pages[i] += mem_cgroup_node_nr_lru_pages(iter, nid, BIT(i), false); + } +} + +static void mem_cgroup_si_meminfo(struct sysinfo *si, struct task_struct *task) +{ + unsigned long memtotal, memused, swapsize; + struct mem_cgroup *memcg; + struct cgroup_subsys_state *css; + + css = task_css(task, memory_cgrp_id); + memcg = mem_cgroup_from_css(css); + + memtotal = READ_ONCE(memcg->memory.max); + + if (memtotal != PAGE_COUNTER_MAX) { + memused = page_counter_read(&memcg->memory); + + si->totalram = memtotal; + si->freeram = (memtotal > memused ? memtotal - memused : 0); + si->sharedram = memcg_page_state(memcg, NR_SHMEM); + + si->bufferram = nr_blockdev_pages(); + si->totalhigh = totalhigh_pages(); + si->freehigh = nr_free_highpages(); + si->mem_unit = PAGE_SIZE; + } else { + si_meminfo(si); + memused = si->totalram - si->freeram; + } + + swapsize = READ_ONCE(memcg->memsw.max); + + if (swapsize != PAGE_COUNTER_MAX) { + unsigned long swaptotal, swapused; + + swaptotal = swapsize - memtotal; + swapused = page_counter_read(&memcg->memsw) - memused; + si->totalswap = swaptotal; + /* Due to global reclaim, memory.memsw.usage can be greater than + * (memory.memsw.max - memory.max). */ + si->freeswap = (swaptotal > swapused ? 
swaptotal - swapused : 0); + } else { + si_swapinfo(si); + } + + css_put(css); +} + +void mem_fill_meminfo(struct meminfo *mi, struct task_struct *task) +{ + struct cgroup_subsys_state *memcg_css = task_css(task, memory_cgrp_id); + struct mem_cgroup *memcg = mem_cgroup_from_css(memcg_css); + int nid; + + memset(&mi->pages, 0, sizeof(mi->pages)); + + mem_cgroup_si_meminfo(&mi->si, task); + + for_each_online_node(nid) + mem_cgroup_nr_pages(memcg, nid, mi->pages); + + mi->slab_reclaimable = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B); + mi->slab_unreclaimable = memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B); + mi->cached = memcg_page_state(memcg, NR_FILE_PAGES); + mi->swapcached = memcg_page_state(memcg, NR_SWAPCACHE); + mi->anon_pages = memcg_page_state(memcg, NR_ANON_MAPPED); + mi->mapped = memcg_page_state(memcg, NR_FILE_MAPPED); + mi->nr_pagetable = memcg_page_state(memcg, NR_PAGETABLE); + mi->dirty_pages = memcg_page_state(memcg, NR_FILE_DIRTY); + mi->writeback_pages = memcg_page_state(memcg, NR_WRITEBACK); +} + static int memcg_numa_stat_show(struct seq_file *m, void *v) { struct numa_stat { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index aaa1655cf682..0a3c9dcd2c13 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5551,18 +5551,13 @@ static inline void show_node(struct zone *zone) printk("Node %d ", zone_to_nid(zone)); } -long si_mem_available(void) +long si_mem_available_mi(struct meminfo *mi) { long available; unsigned long pagecache; unsigned long wmark_low = 0; - unsigned long pages[NR_LRU_LISTS]; unsigned long reclaimable; struct zone *zone; - int lru; - - for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) - pages[lru] = global_node_page_state(NR_LRU_BASE + lru); for_each_zone(zone) wmark_low += low_wmark_pages(zone); @@ -5571,14 +5566,14 @@ long si_mem_available(void) * Estimate the amount of memory available for userspace allocations, * without causing swapping. 
*/ - available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages; + available = mi->si.freeram - totalreserve_pages; /* * Not all the page cache can be freed, otherwise the system will * start swapping. Assume at least half of the page cache, or the * low watermark worth of cache, needs to stay. */ - pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; + pagecache = mi->pages[LRU_ACTIVE_FILE] + mi->pages[LRU_INACTIVE_FILE]; pagecache -= min(pagecache / 2, wmark_low); available += pagecache; @@ -5587,14 +5582,27 @@ long si_mem_available(void) * items that are in use, and cannot be freed. Cap this estimate at the * low watermark. */ - reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) + - global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); + reclaimable = mi->slab_reclaimable + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); available += reclaimable - min(reclaimable / 2, wmark_low); if (available < 0) available = 0; return available; } + +long si_mem_available(void) +{ + struct meminfo mi; + int lru; + + for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) + mi.pages[lru] = global_node_page_state(NR_LRU_BASE + lru); + + mi.si.freeram = global_zone_page_state(NR_FREE_PAGES); + mi.slab_reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B); + + return si_mem_available_mi(&mi); +} EXPORT_SYMBOL_GPL(si_mem_available); void si_meminfo(struct sysinfo *val) -- 2.29.3