On Tue, 31 Jan 2023 15:44:00 +0100 Thomas Gleixner <tglx@xxxxxxxxxxxxx> wrote: > > Seriously this procfs accuracy is the least of the problems and if this > would be the only issue then we could trivially fix it by declaring that > the procfs output might go backwards. It's an estimate after all. If > there would be a real reason to ensure monotonicity there then we could > easily do that in the readout code. > > But the real issue is that both get_cpu_idle_time_us() and > get_cpu_iowait_time_us() can invoke update_ts_time_stats() which is way > worse than the above procfs idle time going backwards. > > If update_ts_time_stats() is invoked concurrently for the same CPU then > ts->idle_sleeptime and ts->iowait_sleeptime are turning into random > numbers. > > This has been broken 12 years ago in commit 595aac488b54 ("sched: > Introduce a function to update the idle statistics"). [...] > > P.S.: I hate the spinlock in the idle code path, but I don't have a > better idea. Provided the rule that the stats are only updated on the local CPU is enforced, the random numbers mentioned above could be eliminated without adding another spinlock — remote readers would compute the in-flight delta themselves instead of writing to the per-CPU state, as in the patch below. 
Hillf +++ b/kernel/time/tick-sched.c @@ -640,13 +640,26 @@ static void tick_nohz_update_jiffies(kti /* * Updates the per-CPU time idle statistics counters */ -static void -update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) +static u64 update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, + int io, u64 *last_update_time) { ktime_t delta; + if (last_update_time) + *last_update_time = ktime_to_us(now); + if (ts->idle_active) { delta = ktime_sub(now, ts->idle_entrytime); + + /* update is only expected on the local CPU */ + if (cpu != smp_processor_id()) { + if (io) + delta = ktime_add(ts->iowait_sleeptime, delta); + else + delta = ktime_add(ts->idle_sleeptime, delta); + return ktime_to_us(delta); + } + if (nr_iowait_cpu(cpu) > 0) ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); else @@ -654,14 +667,12 @@ update_ts_time_stats(int cpu, struct tic ts->idle_entrytime = now; } - if (last_update_time) - *last_update_time = ktime_to_us(now); - + return 0; } static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) { - update_ts_time_stats(smp_processor_id(), ts, now, NULL); + update_ts_time_stats(smp_processor_id(), ts, now, 0, NULL); ts->idle_active = 0; sched_clock_idle_wakeup_event(); @@ -698,7 +709,9 @@ u64 get_cpu_idle_time_us(int cpu, u64 *l now = ktime_get(); if (last_update_time) { - update_ts_time_stats(cpu, ts, now, last_update_time); + u64 ret = update_ts_time_stats(cpu, ts, now, 0, last_update_time); + if (ret) + return ret; idle = ts->idle_sleeptime; } else { if (ts->idle_active && !nr_iowait_cpu(cpu)) { @@ -739,7 +752,9 @@ u64 get_cpu_iowait_time_us(int cpu, u64 now = ktime_get(); if (last_update_time) { - update_ts_time_stats(cpu, ts, now, last_update_time); + u64 ret = update_ts_time_stats(cpu, ts, now, 1, last_update_time); + if (ret) + return ret; iowait = ts->iowait_sleeptime; } else { if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {