We have the netlink CGROUPSTATS_CMD_GET interface to get taskstats of a cgroup on v1, but there is no equivalent interface on v2, making it difficult to calculate the per-cgroup cpu load in cadvisor or to implement the cgroup proc interface in lxcfs, like /proc/loadavg. Since these counters are already maintained in the psi subsystem, this patch sums them up and exports them in the cgroup.stat interface. Signed-off-by: Chengming Zhou <zhouchengming@xxxxxxxxxxxxx> --- Documentation/admin-guide/cgroup-v2.rst | 9 +++++++ include/linux/psi.h | 1 + kernel/cgroup/cgroup.c | 3 +++ kernel/sched/psi.c | 34 +++++++++++++++++++++++++ 4 files changed, 47 insertions(+) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 64c62b979f2f..4184e749f687 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -923,6 +923,15 @@ All cgroup core files are prefixed with "cgroup." A dying cgroup can consume system resources not exceeding limits, which were active at the moment of cgroup deletion. + nr_iowait_tasks + Total number of tasks in iowait. + + nr_memstall_tasks + Total number of tasks in memstall. + + nr_running_tasks + Total number of runnable tasks. + cgroup.freeze A read-write single value file which exists on non-root cgroups. Allowed values are "0" and "1". The default is "0". 
diff --git a/include/linux/psi.h b/include/linux/psi.h index 7361023f3fdd..ea98239424ca 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -30,6 +30,7 @@ int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res); int psi_cgroup_alloc(struct cgroup *cgrp); void psi_cgroup_free(struct cgroup *cgrp); void cgroup_move_task(struct task_struct *p, struct css_set *to); +void psi_taskstat_show(struct seq_file *m, struct cgroup *cgrp); struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, size_t nbytes, enum psi_res res); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 9153b20e5cc6..2724ae318a3b 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3502,6 +3502,9 @@ static int cgroup_stat_show(struct seq_file *seq, void *v) seq_printf(seq, "nr_dying_descendants %d\n", cgroup->nr_dying_descendants); +#ifdef CONFIG_PSI + psi_taskstat_show(seq, cgroup); +#endif return 0; } diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 967732c0766c..0ae8bd278ca4 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1000,6 +1000,40 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) task_rq_unlock(rq, task, &rf); } + +void psi_taskstat_show(struct seq_file *m, struct cgroup *cgrp) +{ + struct psi_group *group; + int cpu; + int s; + unsigned int taskstat[NR_PSI_TASK_COUNTS - 1] = { 0, }; + + if (static_branch_likely(&psi_disabled)) + return; + + group = cgroup_ino(cgrp) == 1 ? 
&psi_system : &cgrp->psi; + + for_each_possible_cpu(cpu) { + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + unsigned int tasks[NR_PSI_TASK_COUNTS]; + unsigned int seq; + + do { + seq = read_seqcount_begin(&groupc->seq); + memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); + } while (read_seqcount_retry(&groupc->seq, seq)); + + for (s = 0; s < NR_ONCPU; s++) + taskstat[s] += tasks[s]; + } + + seq_printf(m, "nr_iowait_tasks %u\n" + "nr_memstall_tasks %u\n" + "nr_running_tasks %u\n", + taskstat[NR_IOWAIT], + taskstat[NR_MEMSTALL], + taskstat[NR_RUNNING]); +} #endif /* CONFIG_CGROUPS */ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) -- 2.25.1