As the conservative governor is based on ondemand, the two codebases occasionally need to be resynced. This patch, although ugly, does that. Signed-off-by: Alexander Clouter <alex@xxxxxxxxxxxxx> --- drivers/cpufreq/cpufreq_conservative.c | 328 ++++++++++++++++++-------------- 1 files changed, 188 insertions(+), 140 deletions(-) diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index a16a5b8..c9bd0c5 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -13,22 +13,17 @@ #include <linux/kernel.h> #include <linux/module.h> -#include <linux/smp.h> #include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/ctype.h> #include <linux/cpufreq.h> -#include <linux/sysctl.h> -#include <linux/types.h> -#include <linux/fs.h> -#include <linux/sysfs.h> #include <linux/cpu.h> -#include <linux/kmod.h> -#include <linux/workqueue.h> #include <linux/jiffies.h> #include <linux/kernel_stat.h> -#include <linux/percpu.h> #include <linux/mutex.h> +#include <linux/hrtimer.h> +#include <linux/tick.h> +#include <linux/ktime.h> +#include <linux/sched.h> + /* * dbs is used in this file as a shortform for demandbased switching * It helps to keep variable names smaller, simpler @@ -43,14 +38,14 @@ * latency of the processor. The governor will work on any processor with * transition latency <= 10mS, using appropriate sampling * rate. - * For CPUs with transition latency > 10mS (mostly drivers - * with CPUFREQ_ETERNAL), this governor will not work. + * For CPUs with transition latency > 10mS (mostly drivers with CPUFREQ_ETERNAL) + * this governor will not work. * All times here are in uS. 
*/ static unsigned int def_sampling_rate; #define MIN_SAMPLING_RATE_RATIO (2) /* for correct statistics, we need at least 10 ticks between each measure */ -#define MIN_STAT_SAMPLING_RATE \ +#define MIN_STAT_SAMPLING_RATE \ (MIN_SAMPLING_RATE_RATIO * jiffies_to_usecs(10)) #define MIN_SAMPLING_RATE \ (def_sampling_rate / MIN_SAMPLING_RATE_RATIO) @@ -75,12 +70,15 @@ static unsigned int minimum_sampling_rate(void) static void do_dbs_timer(struct work_struct *work); struct cpu_dbs_info_s { + cputime64_t prev_cpu_idle; + cputime64_t prev_cpu_wall; + cputime64_t prev_cpu_nice; struct cpufreq_policy *cur_policy; - unsigned int prev_cpu_idle_up; - unsigned int prev_cpu_idle_down; - unsigned int enable; + struct delayed_work work; unsigned int down_skip; unsigned int requested_freq; + int cpu; + unsigned int enable:1; }; static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); @@ -95,18 +93,17 @@ static unsigned int dbs_enable; /* number of CPUs using this policy */ * is recursive for the same process. 
-Venki */ static DEFINE_MUTEX(dbs_mutex); -static DECLARE_DELAYED_WORK(dbs_work, do_dbs_timer); -struct dbs_tuners { +static struct workqueue_struct *kconservative_wq; + +static struct dbs_tuners { unsigned int sampling_rate; unsigned int sampling_down_factor; unsigned int up_threshold; unsigned int down_threshold; unsigned int ignore_nice; unsigned int freq_step; -}; - -static struct dbs_tuners dbs_tuners_ins = { +} dbs_tuners_ins = { .up_threshold = DEF_FREQUENCY_UP_THRESHOLD, .down_threshold = DEF_FREQUENCY_DOWN_THRESHOLD, .sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR, @@ -114,18 +111,37 @@ static struct dbs_tuners dbs_tuners_ins = { .freq_step = 5, }; -static inline unsigned int get_cpu_idle_time(unsigned int cpu) +static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu, + cputime64_t *wall) { - unsigned int add_nice = 0, ret; + cputime64_t idle_time; + cputime64_t cur_wall_time; + cputime64_t busy_time; + + cur_wall_time = jiffies64_to_cputime64(get_jiffies_64()); + busy_time = cputime64_add(kstat_cpu(cpu).cpustat.user, + kstat_cpu(cpu).cpustat.system); - if (dbs_tuners_ins.ignore_nice) - add_nice = kstat_cpu(cpu).cpustat.nice; + busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.irq); + busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.softirq); + busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.steal); + busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.nice); - ret = kstat_cpu(cpu).cpustat.idle + - kstat_cpu(cpu).cpustat.iowait + - add_nice; + idle_time = cputime64_sub(cur_wall_time, busy_time); + if (wall) + *wall = cur_wall_time; - return ret; + return idle_time; +} + +static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall) +{ + u64 idle_time = get_cpu_idle_time_us(cpu, wall); + + if (idle_time == -1ULL) + return get_cpu_idle_time_jiffy(cpu, wall); + + return idle_time; } /* keep track of frequency transitions */ @@ -186,8 +202,8 @@ static ssize_t show_sampling_rate_min(struct 
cpufreq_policy *policy, char *buf) return sprintf(buf, "%u\n", MIN_SAMPLING_RATE); } -#define define_one_ro(_name) \ -static struct freq_attr _name = \ +#define define_one_ro(_name) \ +static struct freq_attr _name = \ __ATTR(_name, 0444, show_##_name, NULL) define_one_ro(sampling_rate_max); @@ -213,6 +229,7 @@ static ssize_t store_sampling_down_factor(struct cpufreq_policy *unused, unsigned int input; int ret; ret = sscanf(buf, "%u", &input); + if (ret != 1 || input > MAX_SAMPLING_DOWN_FACTOR || input < 1) return -EINVAL; @@ -230,11 +247,10 @@ static ssize_t store_sampling_rate(struct cpufreq_policy *unused, int ret; ret = sscanf(buf, "%u", &input); - mutex_lock(&dbs_mutex); - if (ret != 1) { - mutex_unlock(&dbs_mutex); + if (ret != 1) return -EINVAL; - } + + mutex_lock(&dbs_mutex); dbs_tuners_ins.sampling_rate = max(input, minimum_sampling_rate()); mutex_unlock(&dbs_mutex); @@ -250,7 +266,7 @@ static ssize_t store_up_threshold(struct cpufreq_policy *unused, mutex_lock(&dbs_mutex); if (ret != 1 || input > 100 || - input <= dbs_tuners_ins.down_threshold) { + input <= dbs_tuners_ins.down_threshold) { mutex_unlock(&dbs_mutex); return -EINVAL; } @@ -269,7 +285,9 @@ static ssize_t store_down_threshold(struct cpufreq_policy *unused, ret = sscanf(buf, "%u", &input); mutex_lock(&dbs_mutex); - if (ret != 1 || input > 100 || input >= dbs_tuners_ins.up_threshold) { + /* cannot be lower than 11 otherwise freq will not fall */ + if (ret != 1 || input < 11 || input > 100 || + input >= dbs_tuners_ins.up_threshold) { mutex_unlock(&dbs_mutex); return -EINVAL; } @@ -302,12 +320,14 @@ static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy, } dbs_tuners_ins.ignore_nice = input; - /* we need to re-evaluate prev_cpu_idle_up and prev_cpu_idle_down */ + /* we need to re-evaluate prev_cpu_idle */ for_each_online_cpu(j) { - struct cpu_dbs_info_s *j_dbs_info; - j_dbs_info = &per_cpu(cpu_dbs_info, j); - j_dbs_info->prev_cpu_idle_up = get_cpu_idle_time(j); - 
j_dbs_info->prev_cpu_idle_down = j_dbs_info->prev_cpu_idle_up; + struct cpu_dbs_info_s *dbs_info; + dbs_info = &per_cpu(cpu_dbs_info, j); + dbs_info->prev_cpu_idle = get_cpu_idle_time(j, + &dbs_info->prev_cpu_wall); + if (dbs_tuners_ins.ignore_nice) + dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice; } mutex_unlock(&dbs_mutex); @@ -319,7 +339,6 @@ static ssize_t store_freq_step(struct cpufreq_policy *policy, { unsigned int input; int ret; - ret = sscanf(buf, "%u", &input); if (ret != 1) @@ -367,55 +386,78 @@ static struct attribute_group dbs_attr_group = { /************************** sysfs end ************************/ -static void dbs_check_cpu(int cpu) +static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) { - unsigned int idle_ticks, up_idle_ticks, down_idle_ticks; - unsigned int tmp_idle_ticks, total_idle_ticks; + unsigned int load = 0; unsigned int freq_target; - unsigned int freq_down_sampling_rate; - struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cpu_dbs_info, cpu); - struct cpufreq_policy *policy; - if (!this_dbs_info->enable) - return; + struct cpufreq_policy *policy; + unsigned int j; policy = this_dbs_info->cur_policy; /* - * The default safe range is 20% to 80% - * Every sampling_rate, we check - * - If current idle time is less than 20%, then we try to - * increase frequency - * Every sampling_rate*sampling_down_factor, we check - * - If current idle time is more than 80%, then we try to - * decrease frequency + * Every sampling_rate, we check, if current idle time is less + * than 20% (default), then we try to increase frequency + * Every sampling_rate*sampling_down_factor, we check, if current + * idle time is more than 80%, then we try to decrease frequency * * Any frequency increase takes it to the maximum frequency. 
* Frequency reduction happens at minimum steps of - * 5% (default) of max_frequency + * 5% (default) of maximum frequency */ - /* Check for frequency increase */ - idle_ticks = UINT_MAX; + /* Get Absolute Load */ + for_each_cpu(j, policy->cpus) { + struct cpu_dbs_info_s *j_dbs_info; + cputime64_t cur_wall_time, cur_idle_time; + unsigned int idle_time, wall_time; - /* Check for frequency increase */ - total_idle_ticks = get_cpu_idle_time(cpu); - tmp_idle_ticks = total_idle_ticks - - this_dbs_info->prev_cpu_idle_up; - this_dbs_info->prev_cpu_idle_up = total_idle_ticks; + j_dbs_info = &per_cpu(cpu_dbs_info, j); + + cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); + + wall_time = (unsigned int) cputime64_sub(cur_wall_time, + j_dbs_info->prev_cpu_wall); + j_dbs_info->prev_cpu_wall = cur_wall_time; - if (tmp_idle_ticks < idle_ticks) - idle_ticks = tmp_idle_ticks; + idle_time = (unsigned int) cputime64_sub(cur_idle_time, + j_dbs_info->prev_cpu_idle); + j_dbs_info->prev_cpu_idle = cur_idle_time; - /* Scale idle ticks by 100 and compare with up and down ticks */ - idle_ticks *= 100; - up_idle_ticks = (100 - dbs_tuners_ins.up_threshold) * - usecs_to_jiffies(dbs_tuners_ins.sampling_rate); + if (dbs_tuners_ins.ignore_nice) { + cputime64_t cur_nice; + unsigned long cur_nice_jiffies; + + cur_nice = cputime64_sub(kstat_cpu(j).cpustat.nice, + j_dbs_info->prev_cpu_nice); + /* + * Assumption: nice time between sampling periods will + * be less than 2^32 jiffies for 32 bit sys + */ + cur_nice_jiffies = (unsigned long) + cputime64_to_jiffies64(cur_nice); + + j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice; + idle_time += jiffies_to_usecs(cur_nice_jiffies); + } + + if (unlikely(!wall_time || wall_time < idle_time)) + continue; + + load = 100 * (wall_time - idle_time) / wall_time; + } + + /* + * break out if we 'cannot' reduce the speed as the user might + * want freq_step to be zero + */ + if (dbs_tuners_ins.freq_step == 0) + return; - if (idle_ticks < up_idle_ticks) { + /* 
Check for frequency increase */ + if (load > dbs_tuners_ins.up_threshold) { this_dbs_info->down_skip = 0; - this_dbs_info->prev_cpu_idle_down = - this_dbs_info->prev_cpu_idle_up; /* if we are already at full speed then break out early */ if (this_dbs_info->requested_freq == policy->max) @@ -436,49 +478,24 @@ static void dbs_check_cpu(int cpu) return; } - /* Check for frequency decrease */ - this_dbs_info->down_skip++; - if (this_dbs_info->down_skip < dbs_tuners_ins.sampling_down_factor) - return; - - /* Check for frequency decrease */ - total_idle_ticks = this_dbs_info->prev_cpu_idle_up; - tmp_idle_ticks = total_idle_ticks - - this_dbs_info->prev_cpu_idle_down; - this_dbs_info->prev_cpu_idle_down = total_idle_ticks; - - if (tmp_idle_ticks < idle_ticks) - idle_ticks = tmp_idle_ticks; - - /* Scale idle ticks by 100 and compare with up and down ticks */ - idle_ticks *= 100; - this_dbs_info->down_skip = 0; - - freq_down_sampling_rate = dbs_tuners_ins.sampling_rate * - dbs_tuners_ins.sampling_down_factor; - down_idle_ticks = (100 - dbs_tuners_ins.down_threshold) * - usecs_to_jiffies(freq_down_sampling_rate); - - if (idle_ticks > down_idle_ticks) { - /* - * if we are already at the lowest speed then break out early - * or if we 'cannot' reduce the speed as the user might want - * freq_target to be zero - */ - if (this_dbs_info->requested_freq == policy->min - || dbs_tuners_ins.freq_step == 0) - return; - + /* + * The optimal frequency is the frequency that is the lowest that + * can support the current CPU usage without triggering the up + * policy. To be safe, we focus 10 points under the threshold. + */ + if (load < (dbs_tuners_ins.down_threshold - 10)) { freq_target = (dbs_tuners_ins.freq_step * policy->max) / 100; - /* max freq cannot be less than 100. But who knows.... 
*/ - if (unlikely(freq_target == 0)) - freq_target = 5; - this_dbs_info->requested_freq -= freq_target; if (this_dbs_info->requested_freq < policy->min) this_dbs_info->requested_freq = policy->min; + /* + * if we cannot reduce the frequency anymore, break out early + */ + if (policy->cur == policy->min) + return; + __cpufreq_driver_target(policy, this_dbs_info->requested_freq, CPUFREQ_RELATION_H); return; @@ -487,27 +504,45 @@ static void dbs_check_cpu(int cpu) static void do_dbs_timer(struct work_struct *work) { - int i; - mutex_lock(&dbs_mutex); - for_each_online_cpu(i) - dbs_check_cpu(i); - schedule_delayed_work(&dbs_work, - usecs_to_jiffies(dbs_tuners_ins.sampling_rate)); - mutex_unlock(&dbs_mutex); + struct cpu_dbs_info_s *dbs_info = + container_of(work, struct cpu_dbs_info_s, work.work); + unsigned int cpu = dbs_info->cpu; + + /* We want all CPUs to do sampling nearly on same jiffy */ + int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate); + + delay -= jiffies % delay; + + if (lock_policy_rwsem_write(cpu) < 0) + return; + + if (!dbs_info->enable) { + unlock_policy_rwsem_write(cpu); + return; + } + + dbs_check_cpu(dbs_info); + + queue_delayed_work_on(cpu, kconservative_wq, &dbs_info->work, delay); + unlock_policy_rwsem_write(cpu); } -static inline void dbs_timer_init(void) +static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info) { - init_timer_deferrable(&dbs_work.timer); - schedule_delayed_work(&dbs_work, - usecs_to_jiffies(dbs_tuners_ins.sampling_rate)); - return; + /* We want all CPUs to do sampling nearly on same jiffy */ + int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate); + delay -= jiffies % delay; + + dbs_info->enable = 1; + INIT_DELAYED_WORK_DEFERRABLE(&dbs_info->work, do_dbs_timer); + queue_delayed_work_on(dbs_info->cpu, kconservative_wq, &dbs_info->work, + delay); } -static inline void dbs_timer_exit(void) +static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info) { - cancel_delayed_work(&dbs_work); - return; + 
dbs_info->enable = 0; + cancel_delayed_work(&dbs_info->work); } static int cpufreq_governor_dbs(struct cpufreq_policy *policy, @@ -541,11 +576,13 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, j_dbs_info = &per_cpu(cpu_dbs_info, j); j_dbs_info->cur_policy = policy; - j_dbs_info->prev_cpu_idle_up = get_cpu_idle_time(cpu); - j_dbs_info->prev_cpu_idle_down - = j_dbs_info->prev_cpu_idle_up; + j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, + &j_dbs_info->prev_cpu_wall); + if (dbs_tuners_ins.ignore_nice) { + j_dbs_info->prev_cpu_nice = + kstat_cpu(j).cpustat.nice; + } } - this_dbs_info->enable = 1; this_dbs_info->down_skip = 0; this_dbs_info->requested_freq = policy->cur; @@ -567,30 +604,30 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, dbs_tuners_ins.sampling_rate = def_sampling_rate; - dbs_timer_init(); cpufreq_register_notifier( &dbs_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); } + dbs_timer_init(this_dbs_info); mutex_unlock(&dbs_mutex); + break; case CPUFREQ_GOV_STOP: mutex_lock(&dbs_mutex); - this_dbs_info->enable = 0; + dbs_timer_exit(this_dbs_info); sysfs_remove_group(&policy->kobj, &dbs_attr_group); dbs_enable--; + /* * Stop the timerschedule work, when this governor * is used for first time */ - if (dbs_enable == 0) { - dbs_timer_exit(); + if (dbs_enable == 0) cpufreq_unregister_notifier( &dbs_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); - } mutex_unlock(&dbs_mutex); @@ -607,6 +644,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, this_dbs_info->cur_policy, policy->min, CPUFREQ_RELATION_L); mutex_unlock(&dbs_mutex); + break; } return 0; @@ -624,15 +662,25 @@ struct cpufreq_governor cpufreq_gov_conservative = { static int __init cpufreq_gov_dbs_init(void) { - return cpufreq_register_governor(&cpufreq_gov_conservative); + int err; + + kconservative_wq = create_workqueue("kconservative"); + if (!kconservative_wq) { + printk(KERN_ERR "Creation of kconservative failed\n"); + return -EFAULT; + 
} + + err = cpufreq_register_governor(&cpufreq_gov_conservative); + if (err) + destroy_workqueue(kconservative_wq); + + return err; } static void __exit cpufreq_gov_dbs_exit(void) { - /* Make sure that the scheduled work is indeed not running */ - flush_scheduled_work(); - cpufreq_unregister_governor(&cpufreq_gov_conservative); + destroy_workqueue(kconservative_wq); } -- 1.5.6.5 -- To unsubscribe from this list: send the line "unsubscribe cpufreq" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html