cpu.stat displays global metrics, such as cgroup usage. It would also be useful to be able to break these down by cpu; to that end, this patch adds a new interface, 'cpu.stat_percpu', to display the percpu values of these stats. Each line of the output corresponds to a particular metric. The format of each line is the name of the metric, followed by space-delimited percpu values, one for each possible cpu. The reason for this approach (vs having each line correspond to a particular cpu) is to make it easier to display extra subsystem-specific percpu fields. Signed-off-by: Josh Don <joshdon@xxxxxxxxxx> --- include/linux/cgroup-defs.h | 5 + kernel/cgroup/cgroup-internal.h | 1 + kernel/cgroup/cgroup.c | 10 ++ kernel/cgroup/rstat.c | 159 ++++++++++++++++++++++++++++---- 4 files changed, 155 insertions(+), 20 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index db2e147e069f..7778a011f457 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -461,6 +461,11 @@ struct cgroup { struct cgroup_base_stat bstat; struct prev_cputime prev_cputime; /* for printing out cputime */ + /* Per-cpu basic resource statistics. These are NULL on root. */ + struct cgroup_base_stat __percpu *bstat_cpu; + struct cgroup_base_stat __percpu *last_bstat_cpu; + struct prev_cputime __percpu *prev_cputime_cpu; + /* * list of pidlists, up to two for each namespace (one for procs, one * for tasks); created on demand. 
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index bfbeabc17a9d..07e932c4f875 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -254,6 +254,7 @@ int cgroup_rstat_init(struct cgroup *cgrp); void cgroup_rstat_exit(struct cgroup *cgrp); void cgroup_rstat_boot(void); void cgroup_base_stat_cputime_show(struct seq_file *seq); +void cgroup_base_stat_percpu_cputime_show(struct seq_file *seq); /* * namespace.c diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 919194de39c8..4f5ddce529eb 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3604,6 +3604,12 @@ static int cpu_stat_show(struct seq_file *seq, void *v) return ret; } +static int cpu_stat_percpu_show(struct seq_file *seq, void *v) +{ + cgroup_base_stat_percpu_cputime_show(seq); + return 0; +} + #ifdef CONFIG_PSI static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { @@ -5014,6 +5020,10 @@ static struct cftype cgroup_base_files[] = { .name = "cpu.stat", .seq_show = cpu_stat_show, }, + { + .name = "cpu.stat_percpu", + .seq_show = cpu_stat_percpu_show, + }, #ifdef CONFIG_PSI { .name = "io.pressure", diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 1486768f2318..1af37333e5bf 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -253,7 +253,19 @@ int cgroup_rstat_init(struct cgroup *cgrp) if (!cgrp->rstat_cpu) { cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu); if (!cgrp->rstat_cpu) - return -ENOMEM; + goto error_nomem; + + cgrp->last_bstat_cpu = alloc_percpu(struct cgroup_base_stat); + if (!cgrp->last_bstat_cpu) + goto error_nomem; + + cgrp->bstat_cpu = alloc_percpu(struct cgroup_base_stat); + if (!cgrp->bstat_cpu) + goto error_nomem; + + cgrp->prev_cputime_cpu = alloc_percpu(struct prev_cputime); + if (!cgrp->prev_cputime_cpu) + goto error_nomem; } /* ->updated_children list is self terminated */ @@ -265,6 +277,21 @@ int cgroup_rstat_init(struct cgroup *cgrp) } return 0; 
+ +error_nomem: + free_percpu(cgrp->rstat_cpu); + cgrp->rstat_cpu = NULL; + + free_percpu(cgrp->last_bstat_cpu); + cgrp->last_bstat_cpu = NULL; + + free_percpu(cgrp->bstat_cpu); + cgrp->bstat_cpu = NULL; + + free_percpu(cgrp->prev_cputime_cpu); + cgrp->prev_cputime_cpu = NULL; + + return -ENOMEM; } void cgroup_rstat_exit(struct cgroup *cgrp) @@ -284,6 +311,12 @@ void cgroup_rstat_exit(struct cgroup *cgrp) free_percpu(cgrp->rstat_cpu); cgrp->rstat_cpu = NULL; + free_percpu(cgrp->last_bstat_cpu); + cgrp->last_bstat_cpu = NULL; + free_percpu(cgrp->bstat_cpu); + cgrp->bstat_cpu = NULL; + free_percpu(cgrp->prev_cputime_cpu); + cgrp->prev_cputime_cpu = NULL; } void __init cgroup_rstat_boot(void) @@ -319,22 +352,29 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_base_stat cur, delta; + struct cgroup_base_stat *bstat_cpu, *last_bstat_cpu; unsigned seq; /* Root-level stats are sourced from system-wide CPU stats */ if (!parent) return; + /* these are not present on root */ + bstat_cpu = per_cpu_ptr(cgrp->bstat_cpu, cpu); + last_bstat_cpu = per_cpu_ptr(cgrp->last_bstat_cpu, cpu); + /* fetch the current per-cpu values */ do { seq = __u64_stats_fetch_begin(&rstatc->bsync); cur.cputime = rstatc->bstat.cputime; } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); + /* propagate percpu delta to global */ delta = cur; cgroup_base_stat_sub(&delta, &rstatc->last_bstat); cgroup_base_stat_add(&cgrp->bstat, &delta); + cgroup_base_stat_add(bstat_cpu, &delta); cgroup_base_stat_add(&rstatc->last_bstat, &delta); /* propagate global delta to parent (unless that's root) */ @@ -343,6 +383,11 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) cgroup_base_stat_sub(&delta, &cgrp->last_bstat); cgroup_base_stat_add(&parent->bstat, &delta); cgroup_base_stat_add(&cgrp->last_bstat, &delta); + + delta = *bstat_cpu; + 
cgroup_base_stat_sub(&delta, last_bstat_cpu); + cgroup_base_stat_add(per_cpu_ptr(parent->bstat_cpu, cpu), &delta); + cgroup_base_stat_add(last_bstat_cpu, &delta); } } @@ -400,6 +445,30 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp, cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags); } +/* See root_cgroup_cputime. Note that this does not first reset cputime. */ +static void root_cgroup_add_cputime_cpu(struct task_cputime *cputime, int cpu) +{ + struct kernel_cpustat kcpustat; + u64 *cpustat = kcpustat.cpustat; + u64 user = 0; + u64 sys = 0; + + kcpustat_cpu_fetch(&kcpustat, cpu); + + user += cpustat[CPUTIME_USER]; + user += cpustat[CPUTIME_NICE]; + cputime->utime += user; + + sys += cpustat[CPUTIME_SYSTEM]; + sys += cpustat[CPUTIME_IRQ]; + sys += cpustat[CPUTIME_SOFTIRQ]; + cputime->stime += sys; + + cputime->sum_exec_runtime += user; + cputime->sum_exec_runtime += sys; + cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL]; +} + /* * compute the cputime for the root cgroup by getting the per cpu data * at a global level, then categorizing the fields in a manner consistent @@ -414,25 +483,7 @@ static void root_cgroup_cputime(struct task_cputime *cputime) cputime->utime = 0; cputime->sum_exec_runtime = 0; for_each_possible_cpu(i) { - struct kernel_cpustat kcpustat; - u64 *cpustat = kcpustat.cpustat; - u64 user = 0; - u64 sys = 0; - - kcpustat_cpu_fetch(&kcpustat, i); - - user += cpustat[CPUTIME_USER]; - user += cpustat[CPUTIME_NICE]; - cputime->utime += user; - - sys += cpustat[CPUTIME_SYSTEM]; - sys += cpustat[CPUTIME_IRQ]; - sys += cpustat[CPUTIME_SOFTIRQ]; - cputime->stime += sys; - - cputime->sum_exec_runtime += user; - cputime->sum_exec_runtime += sys; - cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL]; + root_cgroup_add_cputime_cpu(cputime, i); } } @@ -464,3 +515,71 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) "system_usec %llu\n", usage, utime, stime); } + +void cgroup_base_stat_percpu_cputime_show(struct seq_file *seq) +{ 
+ static DEFINE_MUTEX(mutex); + static DEFINE_PER_CPU(struct cgroup_base_stat, cached_percpu_stats); + struct cgroup_base_stat *cached_bstat; + struct cgroup *cgrp = seq_css(seq)->cgroup; + u64 val; + int cpu; + + /* protects cached_percpu_stats */ + mutex_lock(&mutex); + + if (cgroup_parent(cgrp)) { + struct cgroup_base_stat *bstat_cpu; + + cgroup_rstat_flush_hold(cgrp); + + for_each_possible_cpu(cpu) { + bstat_cpu = per_cpu_ptr(cgrp->bstat_cpu, cpu); + cached_bstat = per_cpu_ptr(&cached_percpu_stats, cpu); + + cached_bstat->cputime.sum_exec_runtime = + bstat_cpu->cputime.sum_exec_runtime; + cputime_adjust(&bstat_cpu->cputime, + per_cpu_ptr(cgrp->prev_cputime_cpu, cpu), + &cached_bstat->cputime.utime, + &cached_bstat->cputime.stime); + } + + cgroup_rstat_flush_release(); + } else { + for_each_possible_cpu(cpu) { + cached_bstat = per_cpu_ptr(&cached_percpu_stats, cpu); + memset(cached_bstat, 0, sizeof(*cached_bstat)); + root_cgroup_add_cputime_cpu(&cached_bstat->cputime, cpu); + } + } + + seq_puts(seq, "usage_usec"); + for_each_possible_cpu(cpu) { + cached_bstat = per_cpu_ptr(&cached_percpu_stats, cpu); + val = cached_bstat->cputime.sum_exec_runtime; + do_div(val, NSEC_PER_USEC); + seq_printf(seq, " %llu", val); + } + seq_puts(seq, "\n"); + + seq_puts(seq, "user_usec"); + for_each_possible_cpu(cpu) { + cached_bstat = per_cpu_ptr(&cached_percpu_stats, cpu); + val = cached_bstat->cputime.utime; + do_div(val, NSEC_PER_USEC); + seq_printf(seq, " %llu", val); + } + seq_puts(seq, "\n"); + + seq_puts(seq, "system_usec"); + for_each_possible_cpu(cpu) { + cached_bstat = per_cpu_ptr(&cached_percpu_stats, cpu); + val = cached_bstat->cputime.stime; + do_div(val, NSEC_PER_USEC); + seq_printf(seq, " %llu", val); + } + seq_puts(seq, "\n"); + + mutex_unlock(&mutex); +} -- 2.34.1.575.g55b058a8bb-goog