The current control mechanism for memory cgroup v2 lumps all the memory together irrespective of the type of memory objects. However, there are cases where users may be more concerned about one type of memory usage than others. In order to support finer-grained control of memory usage, the following two new cgroup v2 control files are added: - memory.subset.list Either "" (default), "anon" (anonymous memory) or "file" (file cache). It specifies the type of memory objects we want to monitor. - memory.subset.high The high memory limit for the memory type specified in "memory.subset.list". For simplicity, the limit is for memory usage by all the tasks within the current memory cgroup only. It doesn't include memory usage by other tasks in child memory cgroups. Hence, we can just check the corresponding stat[] array entry of the selected memory type to see if it is above the limit. We currently don't have the capability to specify the type of memory objects to reclaim. When memory reclaim is triggered after reaching the "memory.subset.high" limit, other types of memory objects will also be reclaimed. In the future, we may extend this capability to allow even more fine-grained selection of memory types as well as a combination of them if the need arises. A test program was written to allocate 1 Gbyte of memory and then touch every page of it. This program was then run in a memory cgroup: # echo anon > memory.subset.list # echo 10485760 > memory.subset.high # echo $$ > cgroup.procs # ~/touch-1gb While the test program was running: # grep -w anon memory.stat anon 10817536 The usage was a bit higher than the limit, but that should be OK. 
Without setting the limit, the output would be # grep -w anon memory.stat anon 1074335744 Signed-off-by: Waiman Long <longman@xxxxxxxxxx> --- Documentation/admin-guide/cgroup-v2.rst | 35 +++++++++ include/linux/memcontrol.h | 7 ++ mm/memcontrol.c | 96 ++++++++++++++++++++++++- 3 files changed, 137 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 20f92c16ffbf..0d5b7c77897d 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1080,6 +1080,41 @@ PAGE_SIZE multiple when read back. high limit is used and monitored properly, this limit's utility is limited to providing the final safety net. + memory.subset.high + A read-write single value file which exists on non-root cgroups. + The default is "max". + + Memory usage throttle limit for a subset of memory objects with + types specified in "memory.subset.list". If a cgroup's usage for + those memory objects goes over the high boundary, the processes + of the cgroup are throttled and put under heavy reclaim pressure. + + This throttle limit is not allowed to go higher than + "memory.high" and will be adjusted accordingly when "memory.high" + is changed. Because of that, "memory.subset.list" should always + be set first before assigning a limit to this file. + + Unlike "memory.high", "memory.subset.high" does not count memory + objects usage in child cgroups. + + Going over the high limit never invokes the OOM killer and + under extreme conditions the limit may be breached. + + memory.subset.list + A read-write single value file which exists on non-root cgroups. + The default is "" which means no separate memory subcomponent + tracking and throttling. + + Currently, only the following two primary subcomponent types are + supported: + + - anon (anonymous memory) + - file (filesystem cache, including tmpfs and shared memory) + + The value of this file should either be "", "anon" or "file". 
+ Changing its value resets "memory.subset.high" to be the same + as "memory.high". + memory.oom.group A read-write single value file which exists on non-root cgroups. The default value is "0". diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1f3d880b7ca1..1baf3e4a9eeb 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -212,6 +212,13 @@ struct mem_cgroup { /* Upper bound of normal memory consumption range */ unsigned long high; + /* + * Upper memory consumption bound for a subset of memory object type + * specified in subset_list for the current cgroup only. + */ + unsigned long subset_high; + unsigned long subset_list; + /* Range enforcement for interrupt charges */ struct work_struct high_work; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 532e0e2a4817..7e52adea60d9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2145,6 +2145,14 @@ static void reclaim_high(struct mem_cgroup *memcg, unsigned int nr_pages, gfp_t gfp_mask) { + int mtype = READ_ONCE(memcg->subset_list); + + /* + * Try memory reclaim if subset_high is exceeded. + */ + if (mtype && (memcg_page_state(memcg, mtype) > memcg->subset_high)) + try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); + do { if (page_counter_read(&memcg->memory) <= memcg->high) continue; @@ -2190,6 +2198,7 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, bool may_swap = true; bool drained = false; bool oomed = false; + bool over_subset_high = false; enum oom_status oom_status; if (mem_cgroup_is_root(memcg)) @@ -2323,6 +2332,10 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); + if (memcg->subset_list && + (memcg_page_state(memcg, memcg->subset_list) > memcg->subset_high)) + over_subset_high = true; + /* * If the hierarchy is above the normal consumption range, schedule * reclaim on returning to userland. 
We can perform reclaim here @@ -2333,7 +2346,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, * reclaim, the cost of mismatch is negligible. */ do { - if (page_counter_read(&memcg->memory) > memcg->high) { + if (page_counter_read(&memcg->memory) > memcg->high || + over_subset_high) { /* Don't bother a random interrupted task */ if (in_interrupt()) { schedule_work(&memcg->high_work); @@ -2343,6 +2357,7 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, set_notify_resume(current); break; } + over_subset_high = false; } while ((memcg = parent_mem_cgroup(memcg))); return 0; @@ -4491,6 +4506,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) return ERR_PTR(error); memcg->high = PAGE_COUNTER_MAX; + memcg->subset_high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; if (parent) { memcg->swappiness = mem_cgroup_swappiness(parent); @@ -5447,6 +5463,13 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, memcg->high = high; + /* + * Synchronize subset_high if subset_list not set and lower + * subset_high, if necessary. 
+ */ + if (!memcg->subset_list || (high < memcg->subset_high)) + memcg->subset_high = high; + nr_pages = page_counter_read(&memcg->memory); if (nr_pages > high) try_to_free_mem_cgroup_pages(memcg, nr_pages - high, @@ -5511,6 +5534,65 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, return nbytes; } +static int memory_subset_high_show(struct seq_file *m, void *v) +{ + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->subset_high)); +} + +static ssize_t memory_subset_high_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long high; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &high); + if (err) + return err; + + if (high > memcg->high) + return -EINVAL; + + memcg->subset_high = high; + return nbytes; +} + +static int memory_subset_list_show(struct seq_file *m, void *v) +{ + unsigned long mtype = READ_ONCE(mem_cgroup_from_seq(m)->subset_list); + + seq_puts(m, (mtype == MEMCG_RSS) ? "anon\n" : + (mtype == MEMCG_CACHE) ? 
"file\n" : "\n"); + return 0; +} + +static ssize_t memory_subset_list_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long mtype; + + buf = strstrip(buf); + if (!strcmp(buf, "anon")) + mtype = MEMCG_RSS; + else if (!strcmp(buf, "file")) + mtype = MEMCG_CACHE; + else if (buf[0] == '\0') + mtype = 0; + else + return -EINVAL; + + if (mtype == memcg->subset_list) + return nbytes; + + memcg->subset_list = mtype; + /* Reset subset_high */ + memcg->subset_high = memcg->high; + return nbytes; +} + static int memory_events_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_seq(m); @@ -5699,6 +5781,18 @@ static struct cftype memory_files[] = { .seq_show = memory_oom_group_show, .write = memory_oom_group_write, }, + { + .name = "subset.high", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_subset_high_show, + .write = memory_subset_high_write, + }, + { + .name = "subset.list", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_subset_list_show, + .write = memory_subset_list_write, + }, { } /* terminate */ }; -- 2.18.1