The "some" field of cpu.pressure indicator may lose the insight into how severely one cgroup is stalled on certain cpu, because PSI tracks stall time for each cpu through: tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0) which turns nr_delayed_tasks[cpu] into boolean value. So together with this cgroup level run_delay accounting, the scheduling info of cgroups will be better illustrated. Currently the task and cpu level accounting have already been tracked through the following two holders respectively: struct task_struct::sched_info if SCHED_INFO struct rq::rq_sched_info if SCHEDSTATS When extending this to cgroups, the minimal requirement would be: root: relies on rq::rq_sched_info, hence SCHEDSTATS non-root: relies on task's, hence SCHED_INFO It might be too demanding to require both, while collecting data for root cgroup from different holders according to different configs would also be confusing and error-prone. In order to keep things simple, let us rely on the cputime infrastructure to do the accounting as the other cputimes do. Only cgroup v2 is supported and CONFIG_SCHED_INFO is required. Signed-off-by: Abel Wu <wuyun.abel@xxxxxxxxxxxxx> --- include/linux/cgroup-defs.h | 3 +++ include/linux/kernel_stat.h | 7 +++++++ kernel/cgroup/rstat.c | 25 +++++++++++++++++++++++++ kernel/sched/cputime.c | 10 ++++++++++ kernel/sched/stats.h | 3 +++ 5 files changed, 48 insertions(+) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 1b20d2d8ef7c..287366e60414 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -328,6 +328,9 @@ struct cgroup_base_stat { u64 forceidle_sum; #endif u64 ntime; +#ifdef CONFIG_SCHED_INFO + u64 run_delay; +#endif }; /* diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index b97ce2df376f..ddd59fea10ad 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -29,6 +29,9 @@ enum cpu_usage_stat { CPUTIME_GUEST_NICE, #ifdef CONFIG_SCHED_CORE CPUTIME_FORCEIDLE, +#endif +#ifdef CONFIG_SCHED_INFO + CPUTIME_RUN_DELAY, #endif NR_STATS, }; @@ -141,4 +144,8 @@ extern void account_idle_ticks(unsigned long ticks); extern void __account_forceidle_time(struct task_struct *tsk, u64 delta); #endif +#ifdef CONFIG_SCHED_INFO +extern void account_run_delay_time(struct task_struct *tsk, u64 delta); +#endif + #endif /* _LINUX_KERNEL_STAT_H */ diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index c2784c317cdd..53984cdf7f9b 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -445,6 +445,9 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat, dst_bstat->forceidle_sum += src_bstat->forceidle_sum; #endif dst_bstat->ntime += src_bstat->ntime; +#ifdef CONFIG_SCHED_INFO + dst_bstat->run_delay += src_bstat->run_delay; +#endif } static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, @@ -457,6 +460,9 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, dst_bstat->forceidle_sum -= src_bstat->forceidle_sum; #endif dst_bstat->ntime -= src_bstat->ntime; +#ifdef CONFIG_SCHED_INFO + dst_bstat->run_delay -= src_bstat->run_delay; +#endif } static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) @@ -551,6 +557,11 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp, case CPUTIME_FORCEIDLE: rstatc->bstat.forceidle_sum += delta_exec; break; +#endif +#ifdef CONFIG_SCHED_INFO + case CPUTIME_RUN_DELAY: + rstatc->bstat.run_delay += delta_exec; + break; #endif default: break; @@ -596,6 +607,9 @@ static void root_cgroup_cputime(struct 
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 1b20d2d8ef7c..287366e60414 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -328,6 +328,9 @@ struct cgroup_base_stat {
 	u64 forceidle_sum;
 #endif
 	u64 ntime;
+#ifdef CONFIG_SCHED_INFO
+	u64 run_delay;
+#endif
 };
 
 /*
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index b97ce2df376f..ddd59fea10ad 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -29,6 +29,9 @@ enum cpu_usage_stat {
 	CPUTIME_GUEST_NICE,
 #ifdef CONFIG_SCHED_CORE
 	CPUTIME_FORCEIDLE,
+#endif
+#ifdef CONFIG_SCHED_INFO
+	CPUTIME_RUN_DELAY,
 #endif
 	NR_STATS,
 };
@@ -141,4 +144,8 @@ extern void account_idle_ticks(unsigned long ticks);
 extern void __account_forceidle_time(struct task_struct *tsk, u64 delta);
 #endif
 
+#ifdef CONFIG_SCHED_INFO
+extern void account_run_delay_time(struct task_struct *tsk, u64 delta);
+#endif
+
 #endif /* _LINUX_KERNEL_STAT_H */
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index c2784c317cdd..53984cdf7f9b 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -445,6 +445,9 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
 	dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
 #endif
 	dst_bstat->ntime += src_bstat->ntime;
+#ifdef CONFIG_SCHED_INFO
+	dst_bstat->run_delay += src_bstat->run_delay;
+#endif
 }
 
 static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
@@ -457,6 +460,9 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
 	dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
 #endif
 	dst_bstat->ntime -= src_bstat->ntime;
+#ifdef CONFIG_SCHED_INFO
+	dst_bstat->run_delay -= src_bstat->run_delay;
+#endif
 }
 
 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
@@ -551,6 +557,11 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
 	case CPUTIME_FORCEIDLE:
 		rstatc->bstat.forceidle_sum += delta_exec;
 		break;
+#endif
+#ifdef CONFIG_SCHED_INFO
+	case CPUTIME_RUN_DELAY:
+		rstatc->bstat.run_delay += delta_exec;
+		break;
 #endif
 	default:
 		break;
@@ -596,6 +607,9 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
 		bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
 #endif
 		bstat->ntime += cpustat[CPUTIME_NICE];
+#ifdef CONFIG_SCHED_INFO
+		bstat->run_delay += cpustat[CPUTIME_RUN_DELAY];
+#endif
 	}
 }
 
@@ -610,6 +624,16 @@ static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat
 #endif
 }
 
+static void cgroup_run_delay_show(struct seq_file *seq, struct cgroup_base_stat *bstat)
+{
+#ifdef CONFIG_SCHED_INFO
+	u64 run_delay = bstat->run_delay;
+
+	do_div(run_delay, NSEC_PER_USEC);
+	seq_printf(seq, "run_delay_usec %llu\n", run_delay);
+#endif
+}
+
 void cgroup_base_stat_cputime_show(struct seq_file *seq)
 {
 	struct cgroup *cgrp = seq_css(seq)->cgroup;
@@ -640,6 +664,7 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
 			bstat.ntime);
 
 	cgroup_force_idle_show(seq, &bstat);
+	cgroup_run_delay_show(seq, &bstat);
 }
 
 /* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 5d9143dd0879..42af602c10a6 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -243,6 +243,16 @@ void __account_forceidle_time(struct task_struct *p, u64 delta)
 }
 #endif
 
+#ifdef CONFIG_SCHED_INFO
+/*
+ * Account for run_delay time spent waiting in rq.
+ */
+void account_run_delay_time(struct task_struct *p, u64 delta)
+{
+	task_group_account_field(p, CPUTIME_RUN_DELAY, delta);
+}
+#endif
+
 /*
  * When a guest is interrupted for a longer amount of time, missed clock
  * ticks are not redelivered later. Due to that, this function may on
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 19cdbe96f93d..fdfd04a89b05 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -252,7 +252,9 @@ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t)
 		t->sched_info.max_run_delay = delta;
 	if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay))
 		t->sched_info.min_run_delay = delta;
+
 	rq_sched_info_dequeue(rq, delta);
+	account_run_delay_time(t, delta);
 }
 
 /*
@@ -279,6 +281,7 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
 		t->sched_info.min_run_delay = delta;
 
 	rq_sched_info_arrive(rq, delta);
+	account_run_delay_time(t, delta);
 }
 
 /*
-- 
2.37.3