On Thursday, February 24, 2022 9:08:30 AM CET Feng Tang wrote: > On Wed, Feb 23, 2022 at 03:23:20PM +0100, Rafael J. Wysocki wrote: > > On Wed, Feb 23, 2022 at 1:40 AM Feng Tang <feng.tang@xxxxxxxxx> wrote: > > > > > > On Tue, Feb 22, 2022 at 04:32:29PM -0800, srinivas pandruvada wrote: > > > > Hi Doug, > > > > > > > > On Tue, 2022-02-22 at 16:07 -0800, Doug Smythies wrote: > > > > > Hi All, > > > > > > > > > > I am about 1/2 way through testing Feng's "hacky debug patch", > > > > > let me know if I am wasting my time, and I'll abort. So far, it > > > > > works fine. > > > > This just proves that if you add some callback during long idle, you > > > > will reach a less aggressive p-state. I think you already proved that > > > > with your results below showing 1W less average power ("Kernel 5.17-rc3 > > > > + Feng patch (6 samples at 300 sec per"). > > > > > > > > Rafael replied with one possible option. Alternatively when planing to > > > > enter deep idle, set P-state to min with a callback like we do in > > > > offline callback. > > > > > > Yes, if the system is going to idle, it makes sense to goto a lower > > > cpufreq first (also what my debug patch will essentially lead to). > > > > > > Given cprfreq-util's normal running frequency is every 10ms, doing > > > this before entering idle is not a big extra burden. > > > > But this is not related to idle as such, but to the fact that idle > > sometimes stops the scheduler tick which otherwise would run the > > cpufreq governor callback on a regular basis. > > > > It is stopping the tick that gets us into trouble, so I would avoid > > doing it if the current performance state is too aggressive. > > I've tried to simulate Doug's environment by using his kconfig, and > offline my 36 CPUs Desktop to leave 12 CPUs online, and on it I can > still see Local timer interrupts when there is no active load, with > the longest interval between 2 timer interrupts is 4 seconds, while > idle class's task_tick_idle() will do nothing, and CFS' > task_tick_fair() will in turn call cfs_rq_util_change() > > I searched the cfs/deadline/rt code, these three classes all have > places to call cpufreq_update_util(), either in enqueue/dequeue or > changing running bandwidth. So I think entering idle also means the > system load is under a big change, and worth calling the cpufreq > callback. > > > In principle, PM QoS can be used for that from intel_pstate, but there > > is a problem with that approach, because it is not obvious what value > > to give to it and it is not always guaranteed to work (say when all of > > the C-states except for C1 are disabled). > > > > So it looks like a new mechanism is needed for that. > > If you think idle class is not the right place to solve it, I can > also help testing new patches. So I have the appended experimental patch to address this issue that's not been tested at all. Caveat emptor. I've been looking for something relatively low-overhead and taking all of the dependencies into account. --- drivers/cpufreq/intel_pstate.c | 40 ++++++++++++++++++++++++++++--------- drivers/cpuidle/governors/ladder.c | 6 +++-- drivers/cpuidle/governors/menu.c | 2 + drivers/cpuidle/governors/teo.c | 3 ++ include/linux/cpuidle.h | 4 +++ 5 files changed, 44 insertions(+), 11 deletions(-) Index: linux-pm/drivers/cpuidle/governors/menu.c =================================================================== --- linux-pm.orig/drivers/cpuidle/governors/menu.c +++ linux-pm/drivers/cpuidle/governors/menu.c @@ -284,6 +284,8 @@ static int menu_select(struct cpuidle_dr if (unlikely(delta < 0)) { delta = 0; delta_tick = 0; + } else if (dev->retain_tick) { + delta = delta_tick; } data->next_timer_ns = delta; Index: linux-pm/drivers/cpuidle/governors/teo.c =================================================================== --- linux-pm.orig/drivers/cpuidle/governors/teo.c +++ linux-pm/drivers/cpuidle/governors/teo.c @@ -308,6 +308,9 @@ static int teo_select(struct cpuidle_dri cpu_data->time_span_ns = local_clock(); duration_ns = tick_nohz_get_sleep_length(&delta_tick); + if (dev->retain_tick) + duration_ns = delta_tick; + cpu_data->sleep_length_ns = duration_ns; /* Check if there is any choice in the first place. */ Index: linux-pm/include/linux/cpuidle.h =================================================================== --- linux-pm.orig/include/linux/cpuidle.h +++ linux-pm/include/linux/cpuidle.h @@ -93,6 +93,7 @@ struct cpuidle_device { unsigned int registered:1; unsigned int enabled:1; unsigned int poll_time_limit:1; + unsigned int retain_tick:1; unsigned int cpu; ktime_t next_hrtimer; @@ -172,6 +173,8 @@ extern int cpuidle_play_dead(void); extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev); static inline struct cpuidle_device *cpuidle_get_device(void) {return __this_cpu_read(cpuidle_devices); } + +extern void cpuidle_update_retain_tick(bool val); #else static inline void disable_cpuidle(void) { } static inline bool cpuidle_not_available(struct cpuidle_driver *drv, @@ -211,6 +214,7 @@ static inline int cpuidle_play_dead(void static inline struct cpuidle_driver *cpuidle_get_cpu_driver( struct cpuidle_device *dev) {return NULL; } static inline struct cpuidle_device *cpuidle_get_device(void) {return NULL; } +static inline void cpuidle_update_retain_tick(bool val) { } #endif #ifdef CONFIG_CPU_IDLE Index: linux-pm/drivers/cpufreq/intel_pstate.c =================================================================== --- linux-pm.orig/drivers/cpufreq/intel_pstate.c +++ linux-pm/drivers/cpufreq/intel_pstate.c @@ -19,6 +19,7 @@ #include <linux/list.h> #include <linux/cpu.h> #include <linux/cpufreq.h> +#include <linux/cpuidle.h> #include <linux/sysfs.h> #include <linux/types.h> #include <linux/fs.h> @@ -1970,6 +1971,30 @@ static inline void intel_pstate_cppc_set } #endif /* CONFIG_ACPI_CPPC_LIB */ +static void intel_pstate_update_perf_ctl(struct cpudata *cpu) +{ + int pstate = cpu->pstate.current_pstate; + + /* + * Avoid stopping the scheduler tick from cpuidle on CPUs in turbo + * P-states to prevent them from getting back to the high frequency + * right away after getting out of deep idle. + */ + cpuidle_update_retain_tick(pstate > cpu->pstate.max_pstate); + wrmsrl(MSR_IA32_PERF_CTL, pstate_funcs.get_val(cpu, pstate)); +} + +static void intel_pstate_update_perf_ctl_wrapper(void *cpu_data) +{ + intel_pstate_update_perf_ctl(cpu_data); +} + +static void intel_pstate_update_perf_ctl_on_cpu(struct cpudata *cpu) +{ + smp_call_function_single(cpu->cpu, intel_pstate_update_perf_ctl_wrapper, + cpu, 1); +} + static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate) { trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu); @@ -1979,8 +2004,7 @@ static void intel_pstate_set_pstate(stru * the CPU being updated, so force the register update to run on the * right CPU. */ - wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL, - pstate_funcs.get_val(cpu, pstate)); + intel_pstate_update_perf_ctl_on_cpu(cpu); } static void intel_pstate_set_min_pstate(struct cpudata *cpu) @@ -2256,7 +2280,7 @@ static void intel_pstate_update_pstate(s return; cpu->pstate.current_pstate = pstate; - wrmsrl(MSR_IA32_PERF_CTL, pstate_funcs.get_val(cpu, pstate)); + intel_pstate_update_perf_ctl(cpu); } static void intel_pstate_adjust_pstate(struct cpudata *cpu) @@ -2843,11 +2867,9 @@ static void intel_cpufreq_perf_ctl_updat u32 target_pstate, bool fast_switch) { if (fast_switch) - wrmsrl(MSR_IA32_PERF_CTL, - pstate_funcs.get_val(cpu, target_pstate)); + intel_pstate_update_perf_ctl(cpu); else - wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL, - pstate_funcs.get_val(cpu, target_pstate)); + intel_pstate_update_perf_ctl_on_cpu(cpu); } static int intel_cpufreq_update_pstate(struct cpufreq_policy *policy, @@ -2857,6 +2879,8 @@ static int intel_cpufreq_update_pstate(s int old_pstate = cpu->pstate.current_pstate; target_pstate = intel_pstate_prepare_request(cpu, target_pstate); + cpu->pstate.current_pstate = target_pstate; + if (hwp_active) { int max_pstate = policy->strict_target ? target_pstate : cpu->max_perf_ratio; @@ -2867,8 +2891,6 @@ static int intel_cpufreq_update_pstate(s intel_cpufreq_perf_ctl_update(cpu, target_pstate, fast_switch); } - cpu->pstate.current_pstate = target_pstate; - intel_cpufreq_trace(cpu, fast_switch ? INTEL_PSTATE_TRACE_FAST_SWITCH : INTEL_PSTATE_TRACE_TARGET, old_pstate); Index: linux-pm/drivers/cpuidle/governors/ladder.c =================================================================== --- linux-pm.orig/drivers/cpuidle/governors/ladder.c +++ linux-pm/drivers/cpuidle/governors/ladder.c @@ -61,10 +61,10 @@ static inline void ladder_do_selection(s * ladder_select_state - selects the next state to enter * @drv: cpuidle driver * @dev: the CPU - * @dummy: not used + * @stop_tick: Whether or not to stop the scheduler tick */ static int ladder_select_state(struct cpuidle_driver *drv, - struct cpuidle_device *dev, bool *dummy) + struct cpuidle_device *dev, bool *stop_tick) { struct ladder_device *ldev = this_cpu_ptr(&ladder_devices); struct ladder_device_state *last_state; @@ -73,6 +73,8 @@ static int ladder_select_state(struct cp s64 latency_req = cpuidle_governor_latency_req(dev->cpu); s64 last_residency; + *stop_tick = !dev->retain_tick; + /* Special case when user has set very strict latency requirement */ if (unlikely(latency_req == 0)) { ladder_do_selection(dev, ldev, last_idx, 0);