Note: this is a simple rebase of a patch I sent a few months ago, which received two acks before the thread petered out: https://www.spinics.net/lists/cgroups/msg40602.html Thanks, On Mon, Jul 15, 2024 at 4:38 PM David Finkel <davidf@xxxxxxxxx> wrote: > > Other mechanisms for querying the peak memory usage of either a process > or v1 memory cgroup allow for resetting the high watermark. Restore > parity with those mechanisms. > > For example: > - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets > the high watermark. > - Writing "5" to the clear_refs pseudo-file in a process's proc > directory resets the peak RSS. > > This change copies the cgroup v1 behavior so any write to the > memory.peak and memory.swap.peak pseudo-files resets the high watermark > to the current usage. > > This behavior is particularly useful for work scheduling systems that > need to track memory usage of worker processes/cgroups per-work-item. > Since memory can't be squeezed like CPU can (the OOM-killer has > opinions), these systems need to track the peak memory usage to compute > system/container fullness when binpacking work items. > > Signed-off-by: David Finkel <davidf@xxxxxxxxx> > --- > Documentation/admin-guide/cgroup-v2.rst | 20 +++--- > mm/memcontrol.c | 23 ++++++ > .../selftests/cgroup/test_memcontrol.c | 72 ++++++++++++++++--- > 3 files changed, 99 insertions(+), 16 deletions(-) > > diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst > index 8fbb0519d556..201d8e5d9f82 100644 > --- a/Documentation/admin-guide/cgroup-v2.rst > +++ b/Documentation/admin-guide/cgroup-v2.rst > @@ -1322,11 +1322,13 @@ PAGE_SIZE multiple when read back. > reclaim induced by memory.reclaim. > > memory.peak > - A read-only single value file which exists on non-root > - cgroups. > + A read-write single value file which exists on non-root cgroups. 
> + > + The max memory usage recorded for the cgroup and its descendants since > + either the creation of the cgroup or the most recent reset. > > - The max memory usage recorded for the cgroup and its > - descendants since the creation of the cgroup. > + Any non-empty write to this file resets it to the current memory usage. > + All content written is completely ignored. > > memory.oom.group > A read-write single value file which exists on non-root > @@ -1652,11 +1654,13 @@ PAGE_SIZE multiple when read back. > Healthy workloads are not expected to reach this limit. > > memory.swap.peak > - A read-only single value file which exists on non-root > - cgroups. > + A read-write single value file which exists on non-root cgroups. > + > + The max swap usage recorded for the cgroup and its descendants since > + the creation of the cgroup or the most recent reset. > > - The max swap usage recorded for the cgroup and its > - descendants since the creation of the cgroup. > + Any non-empty write to this file resets it to the current swap usage. > + All content written is completely ignored. 
> > memory.swap.max > A read-write single value file which exists on non-root > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 8f2f1bb18c9c..abfa547615d6 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -25,6 +25,7 @@ > * Copyright (C) 2020 Alibaba, Inc, Alex Shi > */ > > +#include <linux/cgroup-defs.h> > #include <linux/page_counter.h> > #include <linux/memcontrol.h> > #include <linux/cgroup.h> > @@ -6915,6 +6916,16 @@ static u64 memory_peak_read(struct cgroup_subsys_state *css, > return (u64)memcg->memory.watermark * PAGE_SIZE; > } > > +static ssize_t memory_peak_write(struct kernfs_open_file *of, > + char *buf, size_t nbytes, loff_t off) > +{ > + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); > + > + page_counter_reset_watermark(&memcg->memory); > + > + return nbytes; > +} > + > static int memory_min_show(struct seq_file *m, void *v) > { > return seq_puts_memcg_tunable(m, > @@ -7232,6 +7243,7 @@ static struct cftype memory_files[] = { > .name = "peak", > .flags = CFTYPE_NOT_ON_ROOT, > .read_u64 = memory_peak_read, > + .write = memory_peak_write, > }, > { > .name = "min", > @@ -8201,6 +8213,16 @@ static u64 swap_peak_read(struct cgroup_subsys_state *css, > return (u64)memcg->swap.watermark * PAGE_SIZE; > } > > +static ssize_t swap_peak_write(struct kernfs_open_file *of, > + char *buf, size_t nbytes, loff_t off) > +{ > + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); > + > + page_counter_reset_watermark(&memcg->swap); > + > + return nbytes; > +} > + > static int swap_high_show(struct seq_file *m, void *v) > { > return seq_puts_memcg_tunable(m, > @@ -8283,6 +8305,7 @@ static struct cftype swap_files[] = { > .name = "swap.peak", > .flags = CFTYPE_NOT_ON_ROOT, > .read_u64 = swap_peak_read, > + .write = swap_peak_write, > }, > { > .name = "swap.events", > diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c > index 41ae8047b889..681972de673b 100644 > --- 
a/tools/testing/selftests/cgroup/test_memcontrol.c > +++ b/tools/testing/selftests/cgroup/test_memcontrol.c > @@ -161,12 +161,12 @@ static int alloc_pagecache_50M_check(const char *cgroup, void *arg) > /* > * This test create a memory cgroup, allocates > * some anonymous memory and some pagecache > - * and check memory.current and some memory.stat values. > + * and checks memory.current, memory.peak, and some memory.stat values. > */ > -static int test_memcg_current(const char *root) > +static int test_memcg_current_peak(const char *root) > { > int ret = KSFT_FAIL; > - long current; > + long current, peak, peak_reset; > char *memcg; > > memcg = cg_name(root, "memcg_test"); > @@ -180,12 +180,32 @@ static int test_memcg_current(const char *root) > if (current != 0) > goto cleanup; > > + peak = cg_read_long(memcg, "memory.peak"); > + if (peak != 0) > + goto cleanup; > + > if (cg_run(memcg, alloc_anon_50M_check, NULL)) > goto cleanup; > > + peak = cg_read_long(memcg, "memory.peak"); > + if (peak < MB(50)) > + goto cleanup; > + > + peak_reset = cg_write(memcg, "memory.peak", "\n"); > + if (peak_reset != 0) > + goto cleanup; > + > + peak = cg_read_long(memcg, "memory.peak"); > + if (peak > MB(30)) > + goto cleanup; > + > if (cg_run(memcg, alloc_pagecache_50M_check, NULL)) > goto cleanup; > > + peak = cg_read_long(memcg, "memory.peak"); > + if (peak < MB(50)) > + goto cleanup; > + > ret = KSFT_PASS; > > cleanup: > @@ -817,13 +837,14 @@ static int alloc_anon_50M_check_swap(const char *cgroup, void *arg) > > /* > * This test checks that memory.swap.max limits the amount of > - * anonymous memory which can be swapped out. > + * anonymous memory which can be swapped out. Additionally, it verifies that > + * memory.swap.peak reflects the high watermark and can be reset. 
> */ > -static int test_memcg_swap_max(const char *root) > +static int test_memcg_swap_max_peak(const char *root) > { > int ret = KSFT_FAIL; > char *memcg; > - long max; > + long max, peak; > > if (!is_swap_enabled()) > return KSFT_SKIP; > @@ -840,6 +861,12 @@ static int test_memcg_swap_max(const char *root) > goto cleanup; > } > > + if (cg_read_long(memcg, "memory.swap.peak")) > + goto cleanup; > + > + if (cg_read_long(memcg, "memory.peak")) > + goto cleanup; > + > if (cg_read_strcmp(memcg, "memory.max", "max\n")) > goto cleanup; > > @@ -862,6 +889,27 @@ static int test_memcg_swap_max(const char *root) > if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1) > goto cleanup; > > + peak = cg_read_long(memcg, "memory.peak"); > + if (peak < MB(29)) > + goto cleanup; > + > + peak = cg_read_long(memcg, "memory.swap.peak"); > + if (peak < MB(29)) > + goto cleanup; > + > + if (cg_write(memcg, "memory.swap.peak", "\n")) > + goto cleanup; > + > + if (cg_read_long(memcg, "memory.swap.peak") > MB(10)) > + goto cleanup; > + > + > + if (cg_write(memcg, "memory.peak", "\n")) > + goto cleanup; > + > + if (cg_read_long(memcg, "memory.peak")) > + goto cleanup; > + > if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30))) > goto cleanup; > > @@ -869,6 +917,14 @@ static int test_memcg_swap_max(const char *root) > if (max <= 0) > goto cleanup; > > + peak = cg_read_long(memcg, "memory.peak"); > + if (peak < MB(29)) > + goto cleanup; > + > + peak = cg_read_long(memcg, "memory.swap.peak"); > + if (peak < MB(19)) > + goto cleanup; > + > ret = KSFT_PASS; > > cleanup: > @@ -1295,7 +1351,7 @@ struct memcg_test { > const char *name; > } tests[] = { > T(test_memcg_subtree_control), > - T(test_memcg_current), > + T(test_memcg_current_peak), > T(test_memcg_min), > T(test_memcg_low), > T(test_memcg_high), > @@ -1303,7 +1359,7 @@ struct memcg_test { > T(test_memcg_max), > T(test_memcg_reclaim), > T(test_memcg_oom_events), > - T(test_memcg_swap_max), > + T(test_memcg_swap_max_peak), 
> T(test_memcg_sock), > T(test_memcg_oom_group_leaf_events), > T(test_memcg_oom_group_parent_events), > -- > 2.40.1 > -- David Finkel Senior Principal Software Engineer, Core Services