Run delay is already tracked per task and per CPU via
t->sched_info.run_delay and rq->rq_sched_info.run_delay respectively.
Extend this accounting to cgroups as well.

The PSI "some" indicator of cpu.pressure loses insight into how
severely a cgroup is stalled: 100 stalled tasks and a single stalled
task show no difference in "some" pressure at a given point in time.
In other words, "some" is a flat value that is not weighted by
severity (e.g. the number of stalled tasks). A cgroup's aggregated
run_delay, by contrast, grows with every stalled task.

Only cgroup v2 is supported. Like the task accounting, the cgroup
accounting requires CONFIG_SCHED_INFO to be enabled.

Signed-off-by: Abel Wu <wuyun.abel@xxxxxxxxxxxxx>
---
 Documentation/admin-guide/cgroup-v2.rst |  1 +
 include/linux/cgroup-defs.h             |  3 +++
 include/linux/kernel_stat.h             | 14 ++++++++++++++
 kernel/cgroup/rstat.c                   | 17 +++++++++++++++++
 kernel/sched/cputime.c                  | 12 ++++++++++++
 kernel/sched/stats.h                    |  2 ++
 6 files changed, 49 insertions(+)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 315ede811c9d..440c3800c49c 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1100,6 +1100,7 @@ All time durations are in microseconds.
 	- usage_usec
 	- user_usec
 	- system_usec
+	- run_delay_usec (requires CONFIG_SCHED_INFO)
 
 	and the following five when the controller is enabled:

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 1b20d2d8ef7c..287366e60414 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -328,6 +328,9 @@ struct cgroup_base_stat {
 	u64 forceidle_sum;
 #endif
 	u64 ntime;
+#ifdef CONFIG_SCHED_INFO
+	u64 run_delay;
+#endif
 };
 
 /*
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index b97ce2df376f..256b1a55de62 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -31,6 +31,15 @@ enum cpu_usage_stat {
 	CPUTIME_FORCEIDLE,
 #endif
 	NR_STATS,
+
+#ifdef CONFIG_SCHED_INFO
+	/*
+	 * Instead of cputime, run_delay is tracked through
+	 * sched_info by task and rq, so there is no need to
+	 * enlarge the cpustat[] array.
+	 */
+	CPUTIME_RUN_DELAY,
+#endif
 };
 
 struct kernel_cpustat {
@@ -141,4 +150,9 @@ extern void account_idle_ticks(unsigned long ticks);
 extern void __account_forceidle_time(struct task_struct *tsk, u64 delta);
 #endif
 
+#ifdef CONFIG_SCHED_INFO
+extern void account_run_delay_time(struct task_struct *tsk, u64 delta);
+extern u64 get_cpu_run_delay(int cpu);
+#endif
+
 #endif /* _LINUX_KERNEL_STAT_H */
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index dc6acab00d69..c7f9397a714e 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -445,6 +445,9 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
 	dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
 #endif
 	dst_bstat->ntime += src_bstat->ntime;
+#ifdef CONFIG_SCHED_INFO
+	dst_bstat->run_delay += src_bstat->run_delay;
+#endif
 }
 
 static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
@@ -457,6 +460,9 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
 	dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
 #endif
 	dst_bstat->ntime -= src_bstat->ntime;
+#ifdef CONFIG_SCHED_INFO
+	dst_bstat->run_delay -= src_bstat->run_delay;
+#endif
 }
 
 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
@@ -551,6 +557,11 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
 	case CPUTIME_FORCEIDLE:
 		rstatc->bstat.forceidle_sum += delta_exec;
 		break;
+#endif
+#ifdef CONFIG_SCHED_INFO
+	case CPUTIME_RUN_DELAY:
+		rstatc->bstat.run_delay += delta_exec;
+		break;
 #endif
 	default:
 		break;
@@ -596,6 +607,9 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
 		bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
 #endif
 		bstat->ntime += cpustat[CPUTIME_NICE];
+#ifdef CONFIG_SCHED_INFO
+		bstat->run_delay += get_cpu_run_delay(i);
+#endif
 	}
 }
 
@@ -611,6 +625,9 @@ static struct bstat_entry {
 	BSTAT_ENTRY("nice_usec", ntime),
 #ifdef CONFIG_SCHED_CORE
 	BSTAT_ENTRY("core_sched.force_idle_usec", forceidle_sum),
+#endif
+#ifdef CONFIG_SCHED_INFO
+	BSTAT_ENTRY("run_delay_usec", run_delay),
 #endif
 	{ NULL } /* must be at end */
 #undef BSTAT_ENTRY
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 5d9143dd0879..e6be57cdb54e 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -243,6 +243,18 @@ void __account_forceidle_time(struct task_struct *p, u64 delta)
 }
 #endif
 
+#ifdef CONFIG_SCHED_INFO
+void account_run_delay_time(struct task_struct *p, u64 delta)
+{
+	cgroup_account_cputime_field(p, CPUTIME_RUN_DELAY, delta);
+}
+
+u64 get_cpu_run_delay(int cpu)
+{
+	return cpu_rq(cpu)->rq_sched_info.run_delay;
+}
+#endif
+
 /*
  * When a guest is interrupted for a longer amount of time, missed clock
  * ticks are not redelivered later. Due to that, this function may on
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 6ade91bce63e..b21a2c4b9c54 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -249,6 +249,7 @@ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t)
 	t->sched_info.last_queued = 0;
 	t->sched_info.run_delay += delta;
+	account_run_delay_time(t, delta);
 
 	rq_sched_info_dequeue(rq, delta);
 }
@@ -271,6 +272,7 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
 	t->sched_info.last_arrival = now;
 	t->sched_info.pcount++;
+	account_run_delay_time(t, delta);
 
 	rq_sched_info_arrive(rq, delta);
 }
-- 
2.37.3
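
For illustration only, not part of the patch: a minimal user-space
sketch of how the new stat could be consumed once this lands. The
cgroup path below is a hypothetical placeholder, and run_delay_usec
only appears in cpu.stat when CONFIG_SCHED_INFO is enabled.

/*
 * Hypothetical example reader: scan a cgroup-v2 cpu.stat file
 * for the run_delay_usec field added by this patch.
 * "/sys/fs/cgroup/test" is a placeholder cgroup path.
 */
#include <stdio.h>
#include <string.h>
#include <inttypes.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/test/cpu.stat";
	char key[64];
	uint64_t val;
	FILE *f = fopen(path, "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* cpu.stat is flat-keyed: one "<name> <value>" pair per line */
	while (fscanf(f, "%63s %" SCNu64, key, &val) == 2) {
		if (!strcmp(key, "run_delay_usec")) {
			printf("run_delay_usec=%" PRIu64 "\n", val);
			break;
		}
	}
	fclose(f);
	return 0;
}

Like the other cpu.stat fields, the value is reported in microseconds.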