/proc/uptime reports idle time by reading the CPUTIME_IDLE field from the per-cpu kcpustats. However, on NO_HZ systems, idle time is not continually updated on idle cpus, leading this value to appear incorrectly small. /proc/stat performs an accounting update when reading idle time; we can use the same approach for uptime. With this patch, /proc/stat and /proc/uptime now agree on idle time. Additionally, the following shows idle time tick up consistently on an idle machine: (while true; do cat /proc/uptime; sleep 1; done) | awk '{print $2-prev; prev=$2}' Reported-by: Luigi Rizzo <lrizzo@xxxxxxxxxx> Signed-off-by: Josh Don <joshdon@xxxxxxxxxx> --- fs/proc/stat.c | 26 -------------------------- fs/proc/uptime.c | 13 ++++++++----- include/linux/kernel_stat.h | 1 + kernel/sched/cputime.c | 28 ++++++++++++++++++++++++++++ 4 files changed, 37 insertions(+), 31 deletions(-) diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 6561a06ef905..99796a8a5223 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -24,16 +24,6 @@ #ifdef arch_idle_time -static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) -{ - u64 idle; - - idle = kcs->cpustat[CPUTIME_IDLE]; - if (cpu_online(cpu) && !nr_iowait_cpu(cpu)) - idle += arch_idle_time(cpu); - return idle; -} - static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) { u64 iowait; @@ -46,22 +36,6 @@ static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) #else -static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) -{ - u64 idle, idle_usecs = -1ULL; - - if (cpu_online(cpu)) - idle_usecs = get_cpu_idle_time_us(cpu, NULL); - - if (idle_usecs == -1ULL) - /* !NO_HZ or cpu offline so we can rely on cpustat.idle */ - idle = kcs->cpustat[CPUTIME_IDLE]; - else - idle = idle_usecs * NSEC_PER_USEC; - - return idle; -} - static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) { u64 iowait, iowait_usecs = -1ULL; diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 5a1b228964fb..c900f354ef93 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -12,18 +12,21 @@ static int uptime_proc_show(struct seq_file *m, void *v) { struct timespec64 uptime; struct timespec64 idle; - u64 nsec; + const struct kernel_cpustat *kcs; + u64 idle_nsec; u32 rem; int i; - nsec = 0; - for_each_possible_cpu(i) - nsec += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; + idle_nsec = 0; + for_each_possible_cpu(i) { + kcs = &kcpustat_cpu(i); + idle_nsec += get_idle_time(kcs, i); + } ktime_get_boottime_ts64(&uptime); timens_add_boottime(&uptime); - idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); + idle.tv_sec = div_u64_rem(idle_nsec, NSEC_PER_SEC, &rem); idle.tv_nsec = rem; seq_printf(m, "%lu.%02lu %lu.%02lu\n", (unsigned long) uptime.tv_sec, diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 44ae1a7eb9e3..9a5f5c6239c7 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -102,6 +102,7 @@ extern void account_system_index_time(struct task_struct *, u64, enum cpu_usage_stat); extern void account_steal_time(u64); extern void account_idle_time(u64); +extern u64 get_idle_time(const struct kernel_cpustat *kcs, int cpu); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE static inline void account_process_tick(struct task_struct *tsk, int user) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 872e481d5098..9d7629e21164 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -227,6 +227,34 @@ void account_idle_time(u64 cputime) cpustat[CPUTIME_IDLE] += cputime; } +/* + * Returns the total idle time for the given cpu. + * @kcs: The kernel_cpustat for the desired cpu. + * @cpu: The desired cpu. + */ +u64 get_idle_time(const struct kernel_cpustat *kcs, int cpu) +{ + u64 idle; + u64 __maybe_unused idle_usecs = -1ULL; + +#ifdef arch_idle_time + idle = kcs->cpustat[CPUTIME_IDLE]; + if (cpu_online(cpu) && !nr_iowait_cpu(cpu)) + idle += arch_idle_time(cpu); +#else + if (cpu_online(cpu)) + idle_usecs = get_cpu_idle_time_us(cpu, NULL); + + if (idle_usecs == -1ULL) + /* !NO_HZ or cpu offline so we can rely on cpustat.idle */ + idle = kcs->cpustat[CPUTIME_IDLE]; + else + idle = idle_usecs * NSEC_PER_USEC; +#endif + + return idle; +} + /* * When a guest is interrupted for a longer amount of time, missed clock * ticks are not redelivered later. Due to that, this function may on -- 2.33.0.rc1.237.g0d66db33f3-goog