Re: [PATCH v3] cpufreq: ondemand: handle SW coordinated CPUs

"Rafael J. Wysocki" <rjw@xxxxxxx> · Thu, 22 Nov 2012 01:10:15 +0100

On Tuesday, November 20, 2012 01:06:16 PM Fabio Baltieri wrote:
> From: Rickard Andersson <rickard.andersson@xxxxxxxxxxxxxx>
> 
> This patch fixes a bug that occurred when we had load on a secondary CPU
> and the primary CPU was sleeping. Only one sampling timer was spawned
> and it was spawned as a deferred timer on the primary CPU, so when a
> secondary CPU had a change in load this was not detected by the ondemand
> governor.
> 
> This patch make sure that deferred timers are run on all CPUs in the
> case of software controlled CPUs that run on the same frequency.

While I basically don't have problems with the functionality of this,
I have some with the code organization.

> Signed-off-by: Rickard Andersson <rickard.andersson@xxxxxxxxxxxxxx>
> Signed-off-by: Fabio Baltieri <fabio.baltieri@xxxxxxxxxx>
> ---
>  drivers/cpufreq/cpufreq_ondemand.c | 141 ++++++++++++++++++++++++++++++++-----
>  1 file changed, 122 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
> index 396322f..430f614 100644
> --- a/drivers/cpufreq/cpufreq_ondemand.c
> +++ b/drivers/cpufreq/cpufreq_ondemand.c
> @@ -93,6 +93,7 @@ struct cpu_dbs_info_s {
>  	 * when user is changing the governor or limits.
>  	 */
>  	struct mutex timer_mutex;
> +	ktime_t time_stamp;
>  };
>  static DEFINE_PER_CPU(struct cpu_dbs_info_s, od_cpu_dbs_info);
>  
> @@ -285,7 +286,7 @@ static void update_sampling_rate(unsigned int new_rate)
>  		policy = cpufreq_cpu_get(cpu);
>  		if (!policy)
>  			continue;
> -		dbs_info = &per_cpu(od_cpu_dbs_info, policy->cpu);
> +		dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
>  		cpufreq_cpu_put(policy);
>  
>  		mutex_lock(&dbs_info->timer_mutex);
> @@ -305,7 +306,7 @@ static void update_sampling_rate(unsigned int new_rate)
>  			cancel_delayed_work_sync(&dbs_info->work);
>  			mutex_lock(&dbs_info->timer_mutex);
>  
> -			schedule_delayed_work_on(dbs_info->cpu, &dbs_info->work,
> +			schedule_delayed_work_on(cpu, &dbs_info->work,
>  						 usecs_to_jiffies(new_rate));
>  
>  		}

The above two changes don't belong to this patch.  Please send a separate patch
with them and a matching description in the changelog.

> @@ -449,6 +450,16 @@ static struct attribute_group dbs_attr_group = {
>  
>  /************************** sysfs end ************************/
>  
> +static bool dbs_sw_coordinated_cpus(struct cpu_dbs_info_s *dbs_info)
> +{
> +	struct cpufreq_policy *policy = dbs_info->cur_policy;
> +
> +	if (cpumask_weight(policy->cpus) > 1)
> +		return true;
> +	else
> +		return false;
> +}

return cpumask_weight(policy->cpus) > 1;

pretty please.

> +
>  static void dbs_freq_increase(struct cpufreq_policy *p, unsigned int freq)
>  {
>  	if (dbs_tuners_ins.powersave_bias)
> @@ -598,20 +609,41 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
>  
>  static void do_dbs_timer(struct work_struct *work)
>  {
> +	struct delayed_work *dw = to_delayed_work(work);
>  	struct cpu_dbs_info_s *dbs_info =
>  		container_of(work, struct cpu_dbs_info_s, work.work);
> -	unsigned int cpu = dbs_info->cpu;
> -	int sample_type = dbs_info->sample_type;
> -
> +	int sample_type;
>  	int delay;
> +	bool sample = true;
> +
> +	if (dbs_sw_coordinated_cpus(dbs_info)) {
> +		ktime_t time_now;
> +		s64 delta_us;
> +
> +		/* use leader CPU's dbs_info */
> +		dbs_info = &per_cpu(od_cpu_dbs_info, dbs_info->cpu);
> +		mutex_lock(&dbs_info->timer_mutex);
>  
> -	mutex_lock(&dbs_info->timer_mutex);
> +		time_now = ktime_get();
> +		delta_us = ktime_us_delta(time_now, dbs_info->time_stamp);
> +
> +		/* Do nothing if we recently have sampled */
> +		if (delta_us < (s64)(dbs_tuners_ins.sampling_rate / 2))
> +			sample = false;
> +		else
> +			dbs_info->time_stamp = time_now;
> +	} else {
> +		mutex_lock(&dbs_info->timer_mutex);
> +	}

Please don't handle locking this way.  Instead, please move the code you'll
run under the lock in both cases into a separate function (it may take "sample"
as an argument along with dbs_info) and call it between
mutex_lock() and mutex_unlock() in each block.

In addition to that, I'd move the whole block executed when
dbs_sw_coordinated_cpus(dbs_info) is true into a separate function where
dbs_info would be a local variable.  This way it wouldn't mix two distinct
cases in the same piece of code that's remarkably hard to follow.

> +
> +	sample_type = dbs_info->sample_type;
>  
>  	/* Common NORMAL_SAMPLE setup */
>  	dbs_info->sample_type = DBS_NORMAL_SAMPLE;
>  	if (!dbs_tuners_ins.powersave_bias ||
>  	    sample_type == DBS_NORMAL_SAMPLE) {
> -		dbs_check_cpu(dbs_info);
> +		if (sample)
> +			dbs_check_cpu(dbs_info);
>  		if (dbs_info->freq_lo) {
>  			/* Setup timer for SUB_SAMPLE */
>  			dbs_info->sample_type = DBS_SUB_SAMPLE;
> @@ -627,32 +659,41 @@ static void do_dbs_timer(struct work_struct *work)
>  				delay -= jiffies % delay;
>  		}
>  	} else {
> -		__cpufreq_driver_target(dbs_info->cur_policy,
> -			dbs_info->freq_lo, CPUFREQ_RELATION_H);
> +		if (sample)
> +			__cpufreq_driver_target(dbs_info->cur_policy,
> +						dbs_info->freq_lo,
> +						CPUFREQ_RELATION_H);
>  		delay = dbs_info->freq_lo_jiffies;
>  	}
> -	schedule_delayed_work_on(cpu, &dbs_info->work, delay);
> +	schedule_delayed_work_on(smp_processor_id(), dw, delay);

We're not supposed to be using smp_processor_id() any more.
get_cpu()/put_cpu() should be used instead.

>  	mutex_unlock(&dbs_info->timer_mutex);
>  }
>  
> -static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
> +static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info, int cpu)
>  {
>  	/* We want all CPUs to do sampling nearly on same jiffy */
>  	int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
> +	struct cpu_dbs_info_s *dbs_info_local = &per_cpu(od_cpu_dbs_info, cpu);
>  
>  	if (num_online_cpus() > 1)
>  		delay -= jiffies % delay;
>  
> +	cancel_delayed_work_sync(&dbs_info_local->work);
>  	dbs_info->sample_type = DBS_NORMAL_SAMPLE;
> -	INIT_DEFERRABLE_WORK(&dbs_info->work, do_dbs_timer);
> -	schedule_delayed_work_on(dbs_info->cpu, &dbs_info->work, delay);
> +	schedule_delayed_work_on(cpu, &dbs_info_local->work, delay);
>  }
>  
> -static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info)
> +static inline void dbs_timer_exit(int cpu)
>  {
> +	struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
>  	cancel_delayed_work_sync(&dbs_info->work);
>  }
>  
> +static void dbs_timer_exit_per_cpu(struct work_struct *dummy)
> +{
> +	dbs_timer_exit(smp_processor_id());
> +}
> +
>  /*
>   * Not all CPUs want IO time to be accounted as busy; this dependson how
>   * efficient idling at a higher frequency/voltage is.
> @@ -676,6 +717,43 @@ static int should_io_be_busy(void)
>  	return 0;
>  }
>  
> +static int __cpuinit cpu_callback(struct notifier_block *nfb,
> +				  unsigned long action, void *hcpu)
> +{
> +	unsigned int cpu = (unsigned long)hcpu;
> +	struct device *cpu_dev;
> +	struct cpu_dbs_info_s *dbs_info;
> +
> +	dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
> +
> +	/* use leader CPU's dbs_info */
> +	if (dbs_sw_coordinated_cpus(dbs_info))
> +		dbs_info = &per_cpu(od_cpu_dbs_info, dbs_info->cpu);
> +
> +	cpu_dev = get_cpu_device(cpu);
> +	if (cpu_dev) {
> +		switch (action) {
> +		case CPU_ONLINE:
> +		case CPU_ONLINE_FROZEN:
> +			dbs_timer_init(dbs_info, cpu);
> +			break;
> +		case CPU_DOWN_PREPARE:
> +		case CPU_DOWN_PREPARE_FROZEN:
> +			dbs_timer_exit(cpu);
> +			break;
> +		case CPU_DOWN_FAILED:
> +		case CPU_DOWN_FAILED_FROZEN:
> +			dbs_timer_init(dbs_info, cpu);

Why don't you merge this with the CPU_ONLINE* cases?

> +			break;
> +		}
> +	}
> +	return NOTIFY_OK;
> +}
> +
> +static struct notifier_block __refdata ondemand_cpu_notifier = {
> +	.notifier_call = cpu_callback,
> +};
> +
>  static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
>  				   unsigned int event)
>  {
> @@ -704,9 +782,13 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
>  			if (dbs_tuners_ins.ignore_nice)
>  				j_dbs_info->prev_cpu_nice =
>  						kcpustat_cpu(j).cpustat[CPUTIME_NICE];
> +
> +			mutex_init(&j_dbs_info->timer_mutex);
> +			INIT_DEFERRABLE_WORK(&j_dbs_info->work, do_dbs_timer);
> +
> +			j_dbs_info->rate_mult = 1;
>  		}
>  		this_dbs_info->cpu = cpu;
> -		this_dbs_info->rate_mult = 1;
>  		ondemand_powersave_bias_init_cpu(cpu);
>  		/*
>  		 * Start the timerschedule work, when this governor
> @@ -736,21 +818,42 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
>  		}
>  		mutex_unlock(&dbs_mutex);
>  
> -		mutex_init(&this_dbs_info->timer_mutex);
> -		dbs_timer_init(this_dbs_info);
> +		/* If SW coordinated CPUs then register notifier */
> +		if (dbs_sw_coordinated_cpus(this_dbs_info)) {
> +			register_hotcpu_notifier(&ondemand_cpu_notifier);
> +
> +			/* Initiate timer time stamp */
> +			this_dbs_info->time_stamp = ktime_get();
> +
> +			for_each_cpu(j, policy->cpus) {
> +				struct cpu_dbs_info_s *j_dbs_info;
> +
> +				j_dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
> +				dbs_timer_init(j_dbs_info, j);
> +			}
> +		} else {
> +			dbs_timer_init(this_dbs_info, cpu);
> +		}
>  		break;
>  
>  	case CPUFREQ_GOV_STOP:
> -		dbs_timer_exit(this_dbs_info);
> +		dbs_timer_exit(cpu);
>  
>  		mutex_lock(&dbs_mutex);
>  		mutex_destroy(&this_dbs_info->timer_mutex);
>  		dbs_enable--;
>  		mutex_unlock(&dbs_mutex);
> -		if (!dbs_enable)
> +		if (!dbs_enable) {
>  			sysfs_remove_group(cpufreq_global_kobject,
>  					   &dbs_attr_group);
>  
> +			if (dbs_sw_coordinated_cpus(this_dbs_info)) {
> +				/* Make sure all pending timers/works are
> +				 * stopped. */
> +				schedule_on_each_cpu(dbs_timer_exit_per_cpu);
> +				unregister_hotcpu_notifier(&ondemand_cpu_notifier);
> +			}
> +		}
>  		break;
>  
>  	case CPUFREQ_GOV_LIMITS:

Thanks,
Rafael

-- 
I speak only for myself.
Rafael J. Wysocki, Intel Open Source Technology Center.
--
To unsubscribe from this list: send the line "unsubscribe cpufreq" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html