Hi Arjan, Kevin Hilman <khilman@xxxxxx> writes: > From: Nicole Chalhoub <n-chalhoub@xxxxxx> > > While there is CPU load, program a C-state specific one-shot timer in > order to give CPUidle another opportunity to pick a deeper C-state > instead of spending potentially long idle times in a shallow C-state. Any comments on this? This is an implementation of an idea proposed by you on our previous attempt[1] to do something similar. Thanks, Kevin [1] http://lkml.org/lkml/2011/4/7/155 > > Long winded version: > When going idle with a high load average, CPUidle menu governor will > decide to pick a shallow C-state since one of the guiding principles > of the menu governor is "The busier the system, the less impact of > C-states is acceptable" (taken from cpuidle/governors/menu.c.) > That makes perfect sense. > > However, there are missed power-saving opportunities for bursty > workloads with long idle times (e.g. MP3 playback.) Given such a > workload, because of the load average, CPUidle tends to pick a shallow > C-state. Because we also go tickless, this shallow C-state is used > for the duration of the idle period. If the idle period is long, a > deeper C state would've resulted in better power savings. > This patch provides an additional opportuntity for CPUidle to pick a > deeper C-state by programming a timer (with a C-state specific timeout) > such that the CPUidle governor will have another opportunity to pick a > deeper C-state. > > Adding this timer for C-state reevaluation improved the load estimation > on our ARM/OMAP4 platform and increased the time spent in deep C-states > (~50% of idle time in C-states deeper than C1). A power saving of ~10mA > at battery level is observed during MP3 playback on OMAP4/Blaze board. > > Signed-off-by: Nicole Chalhoub <n-chalhoub@xxxxxx> > Signed-off-by: Kevin Hilman <khilman@xxxxxx> > --- > drivers/cpuidle/cpuidle.c | 28 +++++++++++++++++++++++++- > drivers/cpuidle/governors/menu.c | 39 ++++++++++++++++++++++++++++++++----- > include/linux/cpuidle.h | 4 +++ > 3 files changed, 63 insertions(+), 8 deletions(-) > > diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c > index 1994885..4b1ac0c 100644 > --- a/drivers/cpuidle/cpuidle.c > +++ b/drivers/cpuidle/cpuidle.c > @@ -92,13 +92,33 @@ static void cpuidle_idle_call(void) > target_state->time += (unsigned long long)dev->last_residency; > target_state->usage++; > > - /* give the governor an opportunity to reflect on the outcome */ > - if (cpuidle_curr_governor->reflect) > + hrtimer_cancel(&dev->cstate_timer); > + > + /* > + * Give the governor an opportunity to reflect on the outcome > + * Do not take into account the wakeups due to the hrtimer, they > + * should not impact the predicted idle time. > + */ > + if ((!dev->hrtimer_expired) && cpuidle_curr_governor->reflect) > cpuidle_curr_governor->reflect(dev); > trace_power_end(0); > } > > /** > + * cstate_reassessment_timer - interrupt handler of the cstate hrtimer > + * @handle: the expired hrtimer > + */ > +static enum hrtimer_restart cstate_reassessment_timer(struct hrtimer *handle) > +{ > + struct cpuidle_device *data = > + container_of(handle, struct cpuidle_device, cstate_timer); > + > + data->hrtimer_expired = 1; > + > + return HRTIMER_NORESTART; > +} > + > +/** > * cpuidle_install_idle_handler - installs the cpuidle idle loop handler > */ > void cpuidle_install_idle_handler(void) > @@ -185,6 +205,10 @@ int cpuidle_enable_device(struct cpuidle_device *dev) > > dev->enabled = 1; > > + dev->hrtimer_expired = 0; > + hrtimer_init(&dev->cstate_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); > + dev->cstate_timer.function = cstate_reassessment_timer; > + > enabled_devices++; > return 0; > > diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c > index 1b12870..fd54584 100644 > --- a/drivers/cpuidle/governors/menu.c > +++ b/drivers/cpuidle/governors/menu.c > @@ -125,10 +125,21 @@ struct menu_device { > #define LOAD_INT(x) ((x) >> FSHIFT) > #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) > > -static int get_loadavg(void) > +static int get_loadavg(struct cpuidle_device *dev) > { > - unsigned long this = this_cpu_load(); > + unsigned long this; > > + /* > + * this_cpu_load() returns the value of rq->load.weight > + * at the previous scheduler tick and not the current value. > + * If the timer expired, that means we are in idle,there > + * are no more runnable processes in the current queue > + * =>return the current value of rq->load.weight which is 0. > + */ > + if (dev->hrtimer_expired == 1) > + return 0; > + else > + this = this_cpu_load(); > > return LOAD_INT(this) * 10 + LOAD_FRAC(this) / 10; > } > @@ -166,13 +177,13 @@ static inline int which_bucket(unsigned int duration) > * to be, the higher this multiplier, and thus the higher > * the barrier to go to an expensive C state. > */ > -static inline int performance_multiplier(void) > +static inline int performance_multiplier(struct cpuidle_device *dev) > { > int mult = 1; > > /* for higher loadavg, we are more reluctant */ > > - mult += 2 * get_loadavg(); > + mult += 2 * get_loadavg(dev); > > /* for IO wait tasks (per cpu!) we add 5x each */ > mult += 10 * nr_iowait_cpu(smp_processor_id()); > @@ -236,6 +247,7 @@ static int menu_select(struct cpuidle_device *dev) > int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY); > int i; > int multiplier; > + ktime_t timeout; > > if (data->needs_update) { > menu_update(dev); > @@ -256,7 +268,7 @@ static int menu_select(struct cpuidle_device *dev) > > data->bucket = which_bucket(data->expected_us); > > - multiplier = performance_multiplier(); > + multiplier = performance_multiplier(dev); > > /* > * if the correction factor is 0 (eg first time init or cpu hotplug > @@ -287,12 +299,27 @@ static int menu_select(struct cpuidle_device *dev) > break; > if (s->exit_latency > latency_req) > break; > - if (s->exit_latency * multiplier > data->predicted_us) > + if (s->exit_latency * multiplier > data->predicted_us) { > + /* > + * Could not enter the next C-state because of a high > + * load. Set a timer in order to check the load again > + * after the timeout expires and re-evaluate cstate. > + */ > + if (s->hrtimer_timeout != 0 && get_loadavg(dev)) { > + timeout = > + ktime_set(0, > + s->hrtimer_timeout * NSEC_PER_USEC); > + hrtimer_start(&dev->cstate_timer, timeout, > + HRTIMER_MODE_REL); > + } > break; > + } > data->exit_us = s->exit_latency; > data->last_state_idx = i; > } > > + /* Reset hrtimer_expired which is set when the hrtimer fires */ > + dev->hrtimer_expired = 0; > return data->last_state_idx; > } > > diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h > index 55215cc..8d11b52 100644 > --- a/include/linux/cpuidle.h > +++ b/include/linux/cpuidle.h > @@ -16,6 +16,7 @@ > #include <linux/module.h> > #include <linux/kobject.h> > #include <linux/completion.h> > +#include <linux/hrtimer.h> > > #define CPUIDLE_STATE_MAX 8 > #define CPUIDLE_NAME_LEN 16 > @@ -37,6 +38,7 @@ struct cpuidle_state { > unsigned int exit_latency; /* in US */ > unsigned int power_usage; /* in mW */ > unsigned int target_residency; /* in US */ > + unsigned int hrtimer_timeout; /* in US */ > > unsigned long long usage; > unsigned long long time; /* in US */ > @@ -97,6 +99,8 @@ struct cpuidle_device { > struct completion kobj_unregister; > void *governor_data; > struct cpuidle_state *safe_state; > + struct hrtimer cstate_timer; > + unsigned int hrtimer_expired; > }; > > DECLARE_PER_CPU(struct cpuidle_device *, cpuidle_devices); -- To unsubscribe from this list: send the line "unsubscribe linux-omap" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html