In the effort of supporting cgroups v2 into Kubernetes, I stumped on the lack of the hugetlb controller. When the controller is enabled, it exposes three new files for each hugetlb size on non-root cgroups: - hugetlb.<hugepagesize>.current - hugetlb.<hugepagesize>.max - hugetlb.<hugepagesize>.events The differences with the legacy hierarchy are in the file names and using the value "max" instead of "-1" to disable a limit. The file .limit_in_bytes is renamed to .max. The file .usage_in_bytes is renamed to .usage. .failcnt is not provided as a single file anymore, but its value can be read in the new flat-keyed file .events, through the "max" key. Signed-off-by: Giuseppe Scrivano <gscrivan@xxxxxxxxxx> --- v3: - simplify hugetlb_cgroup_read_u64_max and drop dead code - notify changes to the .events file v2: https://www.spinics.net/lists/cgroups/msg23917.html - dropped max_usage_in_bytes and renamed .stats::failcnt to .events::max v1: https://www.spinics.net/lists/cgroups/msg23893.html Documentation/admin-guide/cgroup-v2.rst | 24 ++++ include/linux/hugetlb.h | 3 +- mm/hugetlb_cgroup.c | 140 ++++++++++++++++++++++-- 3 files changed, 155 insertions(+), 12 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 5361ebec3361..5e08a202da2a 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -61,6 +61,8 @@ v1 is available under Documentation/admin-guide/cgroup-v1/. 5-6. Device 5-7. RDMA 5-7-1. RDMA Interface Files + 5-8. HugeTLB + 5.8-1. HugeTLB Interface Files 5-8. Misc 5-8-1. perf_event 5-N. Non-normative information @@ -2050,6 +2052,28 @@ RDMA Interface Files mlx4_0 hca_handle=1 hca_object=20 ocrdma1 hca_handle=1 hca_object=23 +HugeTLB +------- + +The HugeTLB controller allows to limit the HugeTLB usage per control group and +enforces the controller limit during page fault. + +HugeTLB Interface Files +~~~~~~~~~~~~~~~~~~~~~~~ + + hugetlb.<hugepagesize>.current + Show current usage for "hugepagesize" hugetlb. It exists for all + the cgroup except root. + + hugetlb.<hugepagesize>.max + Set/show the hard limit of "hugepagesize" hugetlb usage. + The default value is "max". It exists for all the cgroup except root. + + hugetlb.<hugepagesize>.events + A read-only flat-keyed file which exists on non-root cgroups. + + max + The number of allocation failure due to HugeTLB limit Misc ---- diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 53fc34f930d0..1c2bacbca044 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -340,7 +340,8 @@ struct hstate { unsigned int surplus_huge_pages_node[MAX_NUMNODES]; #ifdef CONFIG_CGROUP_HUGETLB /* cgroup control files */ - struct cftype cgroup_files[5]; + struct cftype cgroup_files_dfl[5]; + struct cftype cgroup_files_legacy[5]; #endif char name[HSTATE_NAME_LEN]; }; diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 2ac38bdc18a1..888c5739dc41 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -21,6 +21,10 @@ struct hugetlb_cgroup { struct cgroup_subsys_state css; + + /* Handle for "hugetlb.events" */ + struct cgroup_file events_file; + /* * the counter to account for hugepages from hugetlb. */ @@ -202,8 +206,11 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, } rcu_read_unlock(); - if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter)) + if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, + &counter)) { ret = -ENOMEM; + cgroup_file_notify(&h_cg->events_file); + } css_put(&h_cg->css); done: *ptr = h_cg; @@ -283,10 +290,46 @@ static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, } } +static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v) +{ + int idx; + u64 val; + bool write_raw = false; + struct cftype *cft = seq_cft(seq); + unsigned long limit; + struct page_counter *counter; + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); + + idx = MEMFILE_IDX(cft->private); + counter = &h_cg->hugepage[idx]; + + limit = round_down(PAGE_COUNTER_MAX, + 1 << huge_page_order(&hstates[idx])); + + switch (MEMFILE_ATTR(cft->private)) { + case RES_USAGE: + val = (u64)page_counter_read(counter); + seq_printf(seq, "%llu\n", val * PAGE_SIZE); + break; + case RES_LIMIT: + val = (u64)counter->max; + if (val == limit) + seq_puts(seq, "max\n"); + else + seq_printf(seq, "%llu\n", val * PAGE_SIZE); + break; + default: + BUG(); + } + + return 0; +} + static DEFINE_MUTEX(hugetlb_limit_mutex); static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) + char *buf, size_t nbytes, loff_t off, + const char *max) { int ret, idx; unsigned long nr_pages; @@ -296,7 +339,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, return -EINVAL; buf = strstrip(buf); - ret = page_counter_memparse(buf, "-1", &nr_pages); + ret = page_counter_memparse(buf, max, &nr_pages); if (ret) return ret; @@ -316,6 +359,18 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, return ret ?: nbytes; } +static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return hugetlb_cgroup_write(of, buf, nbytes, off, "-1"); +} + +static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return hugetlb_cgroup_write(of, buf, nbytes, off, "max"); +} + static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -350,7 +405,58 @@ static char *mem_fmt(char *buf, int size, unsigned long hsize) return buf; } -static void __init __hugetlb_cgroup_file_init(int idx) +static int hugetlb_events_show(struct seq_file *seq, void *v) +{ + struct page_counter *counter; + struct cftype *cft = seq_cft(seq); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); + + counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)]; + seq_printf(seq, "max %lu\n", counter->failcnt); + + return 0; +} + +static void __init __hugetlb_cgroup_file_dfl_init(int idx) +{ + char buf[32]; + struct cftype *cft; + struct hstate *h = &hstates[idx]; + + /* format the size */ + mem_fmt(buf, 32, huge_page_size(h)); + + /* Add the limit file */ + cft = &h->cgroup_files_dfl[0]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); + cft->seq_show = hugetlb_cgroup_read_u64_max; + cft->write = hugetlb_cgroup_write_dfl; + cft->flags = CFTYPE_NOT_ON_ROOT; + + /* Add the current usage file */ + cft = &h->cgroup_files_dfl[1]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.current", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); + cft->seq_show = hugetlb_cgroup_read_u64_max; + cft->flags = CFTYPE_NOT_ON_ROOT; + + /* Add the events file */ + cft = &h->cgroup_files_dfl[2]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events", buf); + cft->seq_show = hugetlb_events_show; + cft->file_offset = offsetof(struct hugetlb_cgroup, events_file), + cft->flags = CFTYPE_NOT_ON_ROOT; + + /* NULL terminate the last cft */ + cft = &h->cgroup_files_dfl[3]; + memset(cft, 0, sizeof(*cft)); + + WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys, + h->cgroup_files_dfl)); +} + +static void __init __hugetlb_cgroup_file_legacy_init(int idx) { char buf[32]; struct cftype *cft; @@ -360,38 +466,44 @@ static void __init __hugetlb_cgroup_file_init(int idx) mem_fmt(buf, 32, huge_page_size(h)); /* Add the limit file */ - cft = &h->cgroup_files[0]; + cft = &h->cgroup_files_legacy[0]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); cft->read_u64 = hugetlb_cgroup_read_u64; - cft->write = hugetlb_cgroup_write; + cft->write = hugetlb_cgroup_write_legacy; /* Add the usage file */ - cft = &h->cgroup_files[1]; + cft = &h->cgroup_files_legacy[1]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); cft->read_u64 = hugetlb_cgroup_read_u64; /* Add the MAX usage file */ - cft = &h->cgroup_files[2]; + cft = &h->cgroup_files_legacy[2]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; /* Add the failcntfile */ - cft = &h->cgroup_files[3]; + cft = &h->cgroup_files_legacy[3]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; /* NULL terminate the last cft */ - cft = &h->cgroup_files[4]; + cft = &h->cgroup_files_legacy[4]; memset(cft, 0, sizeof(*cft)); WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys, - h->cgroup_files)); + h->cgroup_files_legacy)); +} + +static void __init __hugetlb_cgroup_file_init(int idx) +{ + __hugetlb_cgroup_file_dfl_init(idx); + __hugetlb_cgroup_file_legacy_init(idx); } void __init hugetlb_cgroup_file_init(void) @@ -433,8 +545,14 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) return; } +static struct cftype hugetlb_files[] = { + {} /* terminate */ +}; + struct cgroup_subsys hugetlb_cgrp_subsys = { .css_alloc = hugetlb_cgroup_css_alloc, .css_offline = hugetlb_cgroup_css_offline, .css_free = hugetlb_cgroup_css_free, + .dfl_cftypes = hugetlb_files, + .legacy_cftypes = hugetlb_files, }; -- 2.23.0