Re: [v4 RFC PATCH 4/4] timers: logic to move non pinned timers

Thomas Gleixner <tglx@xxxxxxxxxxxxx> · Fri, 3 Apr 2009 23:52:51 +0200 (CEST)

Arun,

On Wed, 1 Apr 2009, Arun R Bharadwaj wrote:
> @@ -627,6 +628,12 @@ __mod_timer(struct timer_list *timer, un
>  
>  	new_base = __get_cpu_var(tvec_bases);
>  
> +	current_cpu = smp_processor_id();
> +	preferred_cpu = get_nohz_load_balancer();
> +	if (get_sysctl_timer_migration() && idle_cpu(current_cpu)
> +			&& !pinned && preferred_cpu != -1)

Can we please check the preliminaries first to avoid the atomic_read ?

    cpu = smp_processor_id();
    if (!pinned && idle_cpu() && get_sysctl_timer_migration()) {
       newcpu = get_nohz_load_balancer();
       if (newcpu >= 0)
       	  cpu = newcpu;   
    }

    new_base = per_cpu(tvec_bases, cpu);

> +		new_base = per_cpu(tvec_bases, preferred_cpu);
> +
>  	if (base != new_base) {
>  		/*
>  		 * We are trying to schedule the timer on the local CPU.
> @@ -635,7 +642,8 @@ __mod_timer(struct timer_list *timer, un
>  		 * handler yet has not finished. This also guarantees that
>  		 * the timer is serialized wrt itself.
>  		 */
> -		if (likely(base->running_timer != timer)) {
> +		if (likely(base->running_timer != timer) ||
> +				get_sysctl_timer_migration()) {

  No, that's wrong. We can not migrate a running timer ever.

>  			/* See the comment in lock_timer_base() */
>  			timer_set_base(timer, NULL);
>  			spin_unlock(&base->lock);
> @@ -1063,10 +1071,9 @@ cascade:
>   * Check, if the next hrtimer event is before the next timer wheel
>   * event:
>   */
> -static unsigned long cmp_next_hrtimer_event(unsigned long now,
> -					    unsigned long expires)
> +static unsigned long __cmp_next_hrtimer_event(unsigned long now,
> +			unsigned long expires, ktime_t hr_delta)
>  {
> -	ktime_t hr_delta = hrtimer_get_next_event();
>  	struct timespec tsdelta;
>  	unsigned long delta;
>  
> @@ -1103,24 +1110,59 @@ static unsigned long cmp_next_hrtimer_ev
>  	return expires;
>  }
>  
> +static unsigned long cmp_next_hrtimer_event(unsigned long now,
> +					unsigned long expires)
> +{
> +	 ktime_t hr_delta = hrtimer_get_next_event();
> +	 return __cmp_next_hrtimer_event(now, expires, hr_delta);
> +}
> +
> +static unsigned long cmp_next_hrtimer_event_on(unsigned long now,
> +					unsigned long expires, int cpu)
> +{
> +	 ktime_t hr_delta = hrtimer_get_next_event_on(cpu);
> +	 return __cmp_next_hrtimer_event(now, expires, hr_delta);
> +}
> +
> +unsigned long __get_next_timer_interrupt(unsigned long now, int cpu)
> +{
> +	struct tvec_base *base = per_cpu(tvec_bases, cpu);
> +	unsigned long expires;
> +
> +	spin_lock(&base->lock);
> +	expires = __next_timer_interrupt(base);
> +	spin_unlock(&base->lock);
> +	return expires;
> +}
> +
>  /**
>   * get_next_timer_interrupt - return the jiffy of the next pending timer
>   * @now: current time (in jiffies)
>   */
>  unsigned long get_next_timer_interrupt(unsigned long now)
>  {
> -	struct tvec_base *base = __get_cpu_var(tvec_bases);
>  	unsigned long expires;
> +	int cpu = smp_processor_id();
>  
> -	spin_lock(&base->lock);
> -	expires = __next_timer_interrupt(base);
> -	spin_unlock(&base->lock);
> +	expires = __get_next_timer_interrupt(now, cpu);
>  
>  	if (time_before_eq(expires, now))
>  		return now;
>  
>  	return cmp_next_hrtimer_event(now, expires);
>  }
> +
> +unsigned long get_next_timer_interrupt_on(unsigned long now, int cpu)
> +{
> +	unsigned long expires;
> +
> +	expires = __get_next_timer_interrupt(now, cpu);
> +
> +	if (time_before_eq(expires, now))
> +		return now;
> +
> +	return cmp_next_hrtimer_event_on(now, expires, cpu);
> +}
>  #endif

   What's the purpose of all those changes ? Just for the latency check ?
   See below.

>  /*
> Index: linux.trees.git/kernel/hrtimer.c
> ===================================================================
> --- linux.trees.git.orig/kernel/hrtimer.c
> +++ linux.trees.git/kernel/hrtimer.c
> @@ -43,6 +43,8 @@
>  #include <linux/seq_file.h>
>  #include <linux/err.h>
>  #include <linux/debugobjects.h>
> +#include <linux/sched.h>
> +#include <linux/timer.h>
>  
>  #include <asm/uaccess.h>
>  
> @@ -198,8 +200,16 @@ switch_hrtimer_base(struct hrtimer *time
>  {
>  	struct hrtimer_clock_base *new_base;
>  	struct hrtimer_cpu_base *new_cpu_base;
> +	int current_cpu, preferred_cpu;
> +
> +	current_cpu = smp_processor_id();
> +	preferred_cpu = get_nohz_load_balancer();
> +	if (get_sysctl_timer_migration() && !pinned && preferred_cpu != -1 &&
> +			check_hrtimer_latency(timer, preferred_cpu))

  Comments from timer.c __mod_timer() apply here as well.

  You have a cpu_idle check in timer.c, why not here ?

  This check_hrtimer_latency business is ugly and I think we should
  try to solve this different.

...
  int cpu, tocpu = -1;

  cpu = smp_processor_id();
  if (preliminaries_for_migration) {
     tocpu = get_nohz_load_balancer();
     if (tocpu >= 0)
     	cpu = tocpu;
  }

again:
  new_cpu_base = &per_cpu(hrtimer_bases, cpu);
  new_base = &new_cpu_base->clock_base[base->index];

  if (base != new_base) {

     if (unlikely(hrtimer_callback_running(timer)))
     	return base;

     timer->base = NULL;
     spin_unlock(&base->cpu_base->lock);
     spin_lock(&new_base->cpu_base->lock);
     timer->base = new_base;

     if (cpu == tocpu) {
     	/* Calc clock monotonic expiry time */  	
        ktime_t expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);

	/*
	 * Get the next event on the target cpu from the clock events layer. This
	 * covers the highres=off nohz=on case as well.
	 */
	ktime_t next = clockevents_get_next_event(cpu);

	ktime_t delta = ktime_sub(expires, next);

	/*
	 * We do not migrate the timer when it is expiring before the next
	 * event on the target cpu because we can not reprogram the target
	 * cpu timer hardware and we would cause it to fire late.
	 */
	if (delta.tv64 < 0) {
	   cpu = smp_processor_id();
	   goto again;
     	}

	/*
	 * We might add a check here which does not migrate the timer
     	 * when it's expiry is very close, but that needs to be evaluated
	 * if it's really a problem. Again we can ask the clock events layer
         * here when the next tick timer is due and compare against it to
         * avoid an extra ktime_get() call.
	 * Probably it's not a problem as a possible wakeup of some task will
	 * push that task anyway to the preferred cpu, but we'll see.
         */
     } 
   }

That way we avoid the whole poking in the timer wheel and adding /
modifying functions all over the place for no real value.

The information we are looking for is already there.

Thanks,

	tglx
_______________________________________________
linux-pm mailing list
linux-pm@xxxxxxxxxxxxxxxxxxxxxxxxxx
https://lists.linux-foundation.org/mailman/listinfo/linux-pm