On Thu, Nov 21, 2019 at 1:14 PM Giuseppe Scrivano <gscrivan@xxxxxxxxxx> wrote: > > In the effort of supporting cgroups v2 into Kubernetes, I stumbled on > the lack of the hugetlb controller. > > When the controller is enabled, it exposes three new files for each > hugetlb size on non-root cgroups: > > - hugetlb.<hugepagesize>.current > - hugetlb.<hugepagesize>.max > - hugetlb.<hugepagesize>.stat > > The differences with the legacy hierarchy are in the file names and > using the value "max" instead of "-1" to disable a limit. > > The file .limit_in_bytes is renamed to .max. > > The file .usage_in_bytes is renamed to .current. > I could be wrong here, but I think the memcg files are not renamed, so the same file names exist in v1 and v2. Can we follow that example? > .failcnt and .max_usage_in_bytes are not provided as single files > anymore, but their value can be read through the new flat-keyed file > .stat, respectively through the max_usage and failcnt keys. > > Signed-off-by: Giuseppe Scrivano <gscrivan@xxxxxxxxxx> > --- > Documentation/admin-guide/cgroup-v2.rst | 28 +++++ > include/linux/hugetlb.h | 3 +- > mm/hugetlb_cgroup.c | 142 ++++++++++++++++++++++-- > 3 files changed, 162 insertions(+), 11 deletions(-) > > diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst > index 5361ebec3361..662cb39abbf5 100644 > --- a/Documentation/admin-guide/cgroup-v2.rst > +++ b/Documentation/admin-guide/cgroup-v2.rst > @@ -61,6 +61,8 @@ v1 is available under Documentation/admin-guide/cgroup-v1/. > 5-6. Device > 5-7. RDMA > 5-7-1. RDMA Interface Files > + 5-8. HugeTLB > + 5.8-1. HugeTLB Interface Files > 5-8. Misc > 5-8-1. perf_event > 5-N. Non-normative information > @@ -2050,6 +2052,32 @@ RDMA Interface Files > mlx4_0 hca_handle=1 hca_object=20 > ocrdma1 hca_handle=1 hca_object=23 > > +HugeTLB > +------- > + > +The HugeTLB controller allows to limit the HugeTLB usage per control group and > +enforces the controller limit during page fault.
> + > +HugeTLB Interface Files > +~~~~~~~~~~~~~~~~~~~~~~~ > + > + hugetlb.<hugepagesize>.current > + Show current usage for "hugepagesize" hugetlb. The default > + value is "max". It exists for all the cgroup except root. > + > + > + hugetlb.<hugepagesize>.max > + Set/show the hard limit of "hugepagesize" hugetlb usage. > + The default value is "max". It exists for all the cgroup except root. > + > + hugetlb.<hugepagesize>.stat > + A read-only flat-keyed file which exists on non-root cgroups. > + > + max_usage > + Maximum HugeTLB usage recorded > + > + failcnt > + The number of allocation failure due to HugeTLB limit > > Misc > ---- > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h > index 53fc34f930d0..1c2bacbca044 100644 > --- a/include/linux/hugetlb.h > +++ b/include/linux/hugetlb.h > @@ -340,7 +340,8 @@ struct hstate { > unsigned int surplus_huge_pages_node[MAX_NUMNODES]; > #ifdef CONFIG_CGROUP_HUGETLB > /* cgroup control files */ > - struct cftype cgroup_files[5]; > + struct cftype cgroup_files_dfl[5]; > + struct cftype cgroup_files_legacy[5]; > #endif > char name[HSTATE_NAME_LEN]; > }; > diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c > index 2ac38bdc18a1..d4dbf5203324 100644 > --- a/mm/hugetlb_cgroup.c > +++ b/mm/hugetlb_cgroup.c > @@ -283,10 +283,55 @@ static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, > } > } > > +static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v) > +{ > + int idx; > + u64 val; > + bool write_raw = false; > + struct cftype *cft = seq_cft(seq); > + unsigned long limit; > + struct page_counter *counter; > + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); > + > + idx = MEMFILE_IDX(cft->private); > + counter = &h_cg->hugepage[idx]; > + > + switch (MEMFILE_ATTR(cft->private)) { > + case RES_USAGE: > + val = (u64)page_counter_read(counter); > + break; > + case RES_LIMIT: > + val = (u64)counter->max; > + break; > + case RES_MAX_USAGE: > + val = (u64)counter->watermark; 
> + break; > + case RES_FAILCNT: > + val = counter->failcnt; > + write_raw = true; > + break; > + default: > + BUG(); > + } > + > + limit = round_down(PAGE_COUNTER_MAX, > + 1 << huge_page_order(&hstates[idx])); > + > + if (val == limit && !write_raw) > + seq_puts(seq, "max\n"); > + else if (write_raw) > + seq_printf(seq, "%llu\n", val); > + else > + seq_printf(seq, "%llu\n", val * PAGE_SIZE); > + > + return 0; > +} > + > static DEFINE_MUTEX(hugetlb_limit_mutex); > > static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, > - char *buf, size_t nbytes, loff_t off) > + char *buf, size_t nbytes, loff_t off, > + const char *max) > { > int ret, idx; > unsigned long nr_pages; > @@ -296,7 +341,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, > return -EINVAL; > > buf = strstrip(buf); > - ret = page_counter_memparse(buf, "-1", &nr_pages); > + ret = page_counter_memparse(buf, max, &nr_pages); > if (ret) > return ret; > > @@ -316,6 +361,18 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, > return ret ?: nbytes; > } > > +static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of, > + char *buf, size_t nbytes, loff_t off) > +{ > + return hugetlb_cgroup_write(of, buf, nbytes, off, "-1"); > +} > + > +static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of, > + char *buf, size_t nbytes, loff_t off) > +{ > + return hugetlb_cgroup_write(of, buf, nbytes, off, "max"); > +} > + > static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, > char *buf, size_t nbytes, loff_t off) > { > @@ -350,7 +407,60 @@ static char *mem_fmt(char *buf, int size, unsigned long hsize) > return buf; > } > > -static void __init __hugetlb_cgroup_file_init(int idx) > +static int hugetlb_stat_show(struct seq_file *seq, void *v) > +{ > + struct page_counter *counter; > + struct cftype *cft = seq_cft(seq); > + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); > + > + counter = 
&h_cg->hugepage[MEMFILE_IDX(cft->private)]; > + > + seq_printf(seq, "max_usage %llu\n", > + (u64)counter->watermark * PAGE_SIZE); > + seq_printf(seq, "fail_cnt %lu\n", counter->failcnt); > + > + return 0; > +} > + > +static void __init __hugetlb_cgroup_file_dfl_init(int idx) > +{ > + char buf[32]; > + struct cftype *cft; > + struct hstate *h = &hstates[idx]; > + > + /* format the size */ > + mem_fmt(buf, 32, huge_page_size(h)); > + > + /* Add the limit file */ > + cft = &h->cgroup_files_dfl[0]; > + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max", buf); > + cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); > + cft->seq_show = hugetlb_cgroup_read_u64_max; > + cft->write = hugetlb_cgroup_write_dfl; > + cft->flags = CFTYPE_NOT_ON_ROOT; > + > + /* Add the current usage file */ > + cft = &h->cgroup_files_dfl[1]; > + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.current", buf); > + cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); > + cft->seq_show = hugetlb_cgroup_read_u64_max; > + cft->flags = CFTYPE_NOT_ON_ROOT; > + > + /* Add the stat file */ > + cft = &h->cgroup_files_dfl[2]; > + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.stat", buf); > + cft->seq_show = hugetlb_stat_show; > + cft->flags = CFTYPE_NOT_ON_ROOT; > + > + /* NULL terminate the last cft */ > + cft = &h->cgroup_files_dfl[3]; > + memset(cft, 0, sizeof(*cft)); > + > + WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys, > + h->cgroup_files_dfl)); > +} > + > +static void __init __hugetlb_cgroup_file_legacy_init(int idx) > { > char buf[32]; > struct cftype *cft; > @@ -360,38 +470,44 @@ static void __init __hugetlb_cgroup_file_init(int idx) > mem_fmt(buf, 32, huge_page_size(h)); > > /* Add the limit file */ > - cft = &h->cgroup_files[0]; > + cft = &h->cgroup_files_legacy[0]; > snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); > cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); > cft->read_u64 = hugetlb_cgroup_read_u64; > - cft->write = hugetlb_cgroup_write; > + cft->write = hugetlb_cgroup_write_legacy; > > /* Add 
the usage file */ > - cft = &h->cgroup_files[1]; > + cft = &h->cgroup_files_legacy[1]; > snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf); > cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); > cft->read_u64 = hugetlb_cgroup_read_u64; > > /* Add the MAX usage file */ > - cft = &h->cgroup_files[2]; > + cft = &h->cgroup_files_legacy[2]; > snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); > cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); > cft->write = hugetlb_cgroup_reset; > cft->read_u64 = hugetlb_cgroup_read_u64; > > /* Add the failcntfile */ > - cft = &h->cgroup_files[3]; > + cft = &h->cgroup_files_legacy[3]; > snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); > cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); > cft->write = hugetlb_cgroup_reset; > cft->read_u64 = hugetlb_cgroup_read_u64; > > /* NULL terminate the last cft */ > - cft = &h->cgroup_files[4]; > + cft = &h->cgroup_files_legacy[4]; > memset(cft, 0, sizeof(*cft)); > > WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys, > - h->cgroup_files)); > + h->cgroup_files_legacy)); > +} > + > +static void __init __hugetlb_cgroup_file_init(int idx) > +{ > + __hugetlb_cgroup_file_dfl_init(idx); > + __hugetlb_cgroup_file_legacy_init(idx); > } > > void __init hugetlb_cgroup_file_init(void) > @@ -433,8 +549,14 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) > return; > } > > +static struct cftype hugetlb_files[] = { > + {} /* terminate */ > +}; > + > struct cgroup_subsys hugetlb_cgrp_subsys = { > .css_alloc = hugetlb_cgroup_css_alloc, > .css_offline = hugetlb_cgroup_css_offline, > .css_free = hugetlb_cgroup_css_free, > + .dfl_cftypes = hugetlb_files, > + .legacy_cftypes = hugetlb_files, > }; > -- > 2.23.0 >