On 3 February 2016 at 12:59, Juri Lelli <juri.lelli@xxxxxxx> wrote: > To get default values for CPUs capacity we profile a simple (bogus) > integer benchmark on such CPUs; then we normalize results to 1024 > (highest capacity in the system). > > Architectures that want this during boot have to define a weak function > (arch_wants_init_cpu_capacity) to return true. > > Also, kernel has to boot with init_cpu_capacity parameter if profiling > is needed, as it can be expensive and might add ~1 sec to boot time. > > Cc: Russell King <linux@xxxxxxxxxxxxxxxx> > Cc: Catalin Marinas <catalin.marinas@xxxxxxx> > Cc: Will Deacon <will.deacon@xxxxxxx> > Cc: "Rafael J. Wysocki" <rjw@xxxxxxxxxxxxx> > Cc: Viresh Kumar <viresh.kumar@xxxxxxxxxx> > Cc: Vincent Guittot <vincent.guittot@xxxxxxxxxx> > Cc: Sudeep Holla <sudeep.holla@xxxxxxx> > Cc: Mark Rutland <mark.rutland@xxxxxxx> > Signed-off-by: Juri Lelli <juri.lelli@xxxxxxx> > --- > Changes since v1: > - add kernel command line parameter to enable profiling > - add define for max trials > > Documentation/kernel-parameters.txt | 4 + > arch/arm/kernel/topology.c | 2 +- > arch/arm64/kernel/topology.c | 12 +++ > drivers/cpufreq/Makefile | 2 +- > drivers/cpufreq/cpufreq.c | 1 + > drivers/cpufreq/cpufreq_capacity.c | 174 ++++++++++++++++++++++++++++++++++++ > include/linux/cpufreq.h | 2 + > 7 files changed, 195 insertions(+), 2 deletions(-) > create mode 100644 drivers/cpufreq/cpufreq_capacity.c > > diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt > index 87d40a7..fad2b89 100644 > --- a/Documentation/kernel-parameters.txt > +++ b/Documentation/kernel-parameters.txt > @@ -1570,6 +1570,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. > > initrd= [BOOT] Specify the location of the initial ramdisk > > + init_cpu_capacity > + [KNL,ARM] Enables dynamic CPUs capacity benchmarking > + at boot. 
> + > inport.irq= [HW] Inport (ATI XL and Microsoft) busmouse driver > Format: <irq> > > diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c > index ec279d1..c9c87a5 100644 > --- a/arch/arm/kernel/topology.c > +++ b/arch/arm/kernel/topology.c > @@ -47,7 +47,7 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) > return per_cpu(cpu_scale, cpu); > } > > -static void set_capacity_scale(unsigned int cpu, unsigned long capacity) > +void set_capacity_scale(unsigned int cpu, unsigned long capacity) > { > per_cpu(cpu_scale, cpu) = capacity; > } > diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c > index 694f6de..3b75d63 100644 > --- a/arch/arm64/kernel/topology.c > +++ b/arch/arm64/kernel/topology.c > @@ -23,6 +23,18 @@ > #include <asm/cputype.h> > #include <asm/topology.h> > > +static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE; > + > +unsigned long arm_arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) > +{ > + return per_cpu(cpu_scale, cpu); > +} > + > +void set_capacity_scale(unsigned int cpu, unsigned long capacity) > +{ > + per_cpu(cpu_scale, cpu) = capacity; > +} > + > static int __init get_cpu_for_node(struct device_node *node) > { > struct device_node *cpu_node; > diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile > index 9e63fb1..c4025fd 100644 > --- a/drivers/cpufreq/Makefile > +++ b/drivers/cpufreq/Makefile > @@ -1,5 +1,5 @@ > # CPUfreq core > -obj-$(CONFIG_CPU_FREQ) += cpufreq.o freq_table.o > +obj-$(CONFIG_CPU_FREQ) += cpufreq.o freq_table.o cpufreq_capacity.o Do you really want to have the calibration of capacity dependent on cpufreq? It means that we can't use it without a cpufreq driver. IMHO, this creates an unnecessary dependency. 
I understand that you must ensure that the core runs at max frequency if a driver is present, but you should still be able to calibrate the capacity when cpufreq is not available and the CPUs have different capacities because of their microarchitecture. > > # CPUfreq stats > obj-$(CONFIG_CPU_FREQ_STAT) += cpufreq_stats.o > diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c > index e979ec7..b22afe8 100644 > --- a/drivers/cpufreq/cpufreq.c > +++ b/drivers/cpufreq/cpufreq.c > @@ -2440,6 +2440,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) > } > > register_hotcpu_notifier(&cpufreq_cpu_notifier); > + cpufreq_init_cpu_capacity(); > pr_debug("driver %s up and running\n", driver_data->name); > > out: > diff --git a/drivers/cpufreq/cpufreq_capacity.c b/drivers/cpufreq/cpufreq_capacity.c > new file mode 100644 > index 0000000..e54310b > --- /dev/null > +++ b/drivers/cpufreq/cpufreq_capacity.c > @@ -0,0 +1,174 @@ > +/* > + * Default CPU capacity calculation for u-arch invariance > + * > + * Copyright (C) 2015 ARM Ltd. > + * Juri Lelli <juri.lelli@xxxxxxx> > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + * > + * This program is distributed "as is" WITHOUT ANY WARRANTY of any > + * kind, whether express or implied; without even the implied warranty > + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + */ > +#include <linux/cpufreq.h> > +#include <linux/sched.h> > + > +#define MAX_TRIALS 10 /* how many times benchmark is executed */ > +static unsigned long long elapsed[NR_CPUS]; > + > +/* > + * Don't let compiler optimize following two functions; we want to avoid any > + * microarchitecture specific optimization that compiler would do and favour > + * one CPU vs. another. Also, my_int_sqrt is cut-and-paste from > + * lib/int_sqrt.c. 
> + */ > +static unsigned long __attribute__((optimize("O0"))) > +my_int_sqrt(unsigned long x) > +{ > + unsigned long b, m, y = 0; > + > + if (x <= 1) > + return x; > + > + m = 1UL << (BITS_PER_LONG - 2); > + while (m != 0) { > + b = y + m; > + y >>= 1; > + > + if (x >= b) { > + x -= b; > + y += m; > + } > + m >>= 2; > + } > + > + return y; > +} > + > +static unsigned long __attribute__((optimize("O0"))) > +bogus_bench(void) > +{ > + unsigned long i, res; > + > + for (i = 0; i < 100000; i++) > + res = my_int_sqrt(i); > + > + return res; > +} > + > +static int run_bogus_benchmark(int cpu) > +{ > + int ret, trials = MAX_TRIALS; > + u64 begin, end, sample, mean = 0, count = 0; > + unsigned long res; > + > + ret = set_cpus_allowed_ptr(current, cpumask_of(cpu)); > + if (ret) { > + pr_warn("%s: failed to set allowed ptr\n", __func__); > + return -EINVAL; > + } > + > + while (trials--) { > + begin = local_clock(); > + res = bogus_bench(); > + end = local_clock(); > + sample = end - begin; > + > + mean = mean * count + sample; > + mean = div64_u64(mean, ++count); > + pr_debug("%s: cpu=%d begin=%llu end=%llu" > + " sample=%llu mean=%llu count=%llu res=%lu\n", > + __func__, cpu, begin, end, sample, > + mean, count, res); > + } > + elapsed[cpu] = mean; > + > + ret = set_cpus_allowed_ptr(current, cpu_active_mask); > + if (ret) { > + pr_warn("%s: failed to set allowed ptr\n", __func__); > + return -EINVAL; > + } > + > + return 0; > +} > + > +bool __weak arch_wants_init_cpu_capacity(void) > +{ > + return false; > +} > + > +void __weak set_capacity_scale(int cpu, unsigned long capacity) { } > + > +static __read_mostly bool init_cpu_capacity_enabled; > + > +static int __init init_cpu_capacity_setup(char *str) > +{ > + init_cpu_capacity_enabled = true; > + > + return 0; > +} > +early_param("init_cpu_capacity", init_cpu_capacity_setup); > + > +void cpufreq_init_cpu_capacity(void) > +{ > + int cpu, fcpu; > + unsigned long long elapsed_min = ULLONG_MAX; > + unsigned int curr_min, 
curr_max; > + struct cpufreq_policy *policy; > + > + if (!arch_wants_init_cpu_capacity() || !init_cpu_capacity_enabled) > + return; > + > + for_each_possible_cpu(cpu) { > + policy = cpufreq_cpu_get(cpu); > + if (IS_ERR_OR_NULL(policy)) > + return; > + > + /* > + * We profile only first CPU of each frequency domain; > + * and use that value as capacity of every CPU in the domain. > + */ > + fcpu = cpumask_first(policy->related_cpus); > + if (cpu != fcpu) { > + elapsed[cpu] = elapsed[fcpu]; > + cpufreq_cpu_put(policy); > + continue; > + } > + > + down_write(&policy->rwsem); > + curr_min = policy->user_policy.min; > + curr_max = policy->user_policy.max; > + policy->user_policy.min = policy->cpuinfo.max_freq; > + policy->user_policy.max = policy->cpuinfo.max_freq; > + up_write(&policy->rwsem); > + cpufreq_cpu_put(policy); > + cpufreq_update_policy(cpu); > + > + run_bogus_benchmark(cpu); > + if (elapsed[cpu] < elapsed_min) > + elapsed_min = elapsed[cpu]; > + pr_debug("%s: cpu=%d elapsed=%llu (min=%llu)\n", > + __func__, cpu, elapsed[cpu], elapsed_min); > + > + policy = cpufreq_cpu_get(cpu); > + down_write(&policy->rwsem); > + policy->user_policy.min = curr_min; > + policy->user_policy.max = curr_max; > + up_write(&policy->rwsem); > + cpufreq_cpu_put(policy); > + cpufreq_update_policy(cpu); > + } > + > + for_each_possible_cpu(cpu) { > + unsigned long capacity; > + > + capacity = div64_u64((elapsed_min << 10), elapsed[cpu]); > + pr_debug("%s: CPU%d capacity=%lu\n", __func__, cpu, capacity); > + set_capacity_scale(cpu, capacity); > + } > + > + pr_info("dynamic CPUs capacity installed\n"); > +} > diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h > index 88a4215..9924351 100644 > --- a/include/linux/cpufreq.h > +++ b/include/linux/cpufreq.h > @@ -419,6 +419,8 @@ static inline unsigned long cpufreq_scale(unsigned long old, u_int div, > #endif > } > > +void cpufreq_init_cpu_capacity(void); > + > 
/********************************************************************* > * CPUFREQ GOVERNORS * > *********************************************************************/ > -- > 2.7.0 > -- To unsubscribe from this list: send the line "unsubscribe devicetree" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html