Re: [REGRESSION] Needless shutting down of oneshot timer in nohz mode

Thomas Gleixner <tglx@xxxxxxxxxxxxx> · Sun, 13 Sep 2020 21:08:49 +0200

Hi!,

On Fri, Sep 11 2020 at 23:48, Steven Rostedt wrote:
> The VMware PhotonOS team is evaluating 4.19-rt compared to CentOS
> 3.10-rt (franken kernel from Red Hat). They found a regression between
> the two kernels that was found to be introduced by:
>
>  d25408756accb ("clockevents: Stop unused clockevent devices")
>
> The issue is running this on a guest, and it causes a noticeable wake
> up latency in cyclictest. The 4.19-rt kernel has two extra apic
> instructions causing for two extra VMEXITs to occur over the 3.10-rt
> kernel. I found out the reason why, and this is true for vanilla 5.9-rc
> as well.
>
> When running isocpus with NOHZ_FULL, I see the following.
>
>   tick_nohz_idle_stop_tick() {
> 	hrtimer_start_range_ns() {
> 		remove_hrtimer(timer)
> 			/* no more timers on the base */
> 			expires = KTIME_MAX;
> 			tick_program_event() {
> 				clock_switch_state(ONESHOT_STOPPED);
> 				/* call to apic to shutdown timer */
> 			}
> 		}
> 		[..]
> 		hrtimer_reprogram(timer) {
> 			tick_program_event() {
> 				clock_switch_state(ONESHOT);
> 				/* call to apic to enable timer again! */
> 		}
> 	}
>  }
>
>
> Thus, we are needlessly shutting down and restarting the apic every
> time we call tick_nohz_stop_tick() if there is a timer still on the
> queue.
>
> I'm not exactly sure how to fix this. Is there a way we can hold off
> disabling the clock here until we know that it isn't going to be
> immediately enabled again?

For the hrtimer_start_range_ns() case we can do that. Something like the
completely untested below.

Thanks,

        tglx
---

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 95b6a708b040..9931a7f66e47 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -209,6 +209,9 @@ struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
 	return base;
 }
 
+static void
+hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal);
+
 /*
  * We switch the timer base to a power-optimized selected CPU target,
  * if:
@@ -223,7 +226,7 @@ struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
  */
 static inline struct hrtimer_clock_base *
 switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
-		    int pinned)
+		    int pinned, bool *reprogram_old_base)
 {
 	struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base;
 	struct hrtimer_clock_base *new_base;
@@ -247,6 +250,23 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
 		if (unlikely(hrtimer_callback_running(timer)))
 			return base;
 
+		/*
+		 * The caller has removed the first expiring timer from
+		 * @base, but avoided reprogramming the clocksource as it
+		 * immediately enqueues a timer again. If the base stays
+		 * the same and the removed timer was the only timer on
+		 * that CPU base then reprogramming in hrtimer_remove()
+		 * would shut down the clock event device just to restart
+		 * it when the timer is enqueued.
+		 *
+		 * timer->base->lock is about to be dropped. Check whether
+		 * the current base needs an update.
+		 */
+		if (*reprogram_old_base) {
+			*reprogram_old_base = false;
+			hrtimer_force_reprogram(base->cpu_base, 1);
+		}
+
 		/* See the comment in lock_hrtimer_base() */
 		WRITE_ONCE(timer->base, &migration_base);
 		raw_spin_unlock(&base->cpu_base->lock);
@@ -288,7 +308,12 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
 	return base;
 }
 
-# define switch_hrtimer_base(t, b, p)	(b)
+static inline struct hrtimer_clock_base *
+switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
+		    int pinned, bool *reprogram_old_base)
+{
+	return base;
+}
 
 #endif	/* !CONFIG_SMP */
 
@@ -1090,9 +1115,20 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 				    struct hrtimer_clock_base *base)
 {
 	struct hrtimer_clock_base *new_base;
+	bool reprogram_old_base;
+	int ret;
+
+	/*
+	 * If this is the first expiring timer then after removing the
+	 * timer the clock event needs to be reprogrammed. But if the timer
+	 * stays on the same base then this might be a pointless exercise
+	 * because it's immediately enqueued again. Store the state and
+	 * delay reprogramming. See below.
+	 */
+	reprogram_old_base = timer == base->cpu_base->next_timer;
 
 	/* Remove an active timer from the queue: */
-	remove_hrtimer(timer, base, true);
+	remove_hrtimer(timer, base, false);
 
 	if (mode & HRTIMER_MODE_REL)
 		tim = ktime_add_safe(tim, base->get_time());
@@ -1101,10 +1137,21 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 
 	hrtimer_set_expires_range_ns(timer, tim, delta_ns);
 
-	/* Switch the timer base, if necessary: */
-	new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
+	/*
+	 * Switch the timer base, if necessary. It the timer was the first
+	 * expiring timer and the timer base is switched then the old base
+	 * is reprogrammed before dropping the cpu base lock. In that case
+	 * reprogram_old_base is then set to false.
+	 */
+	new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED,
+				       &reprogram_old_base);
 
-	return enqueue_hrtimer(timer, new_base, mode);
+	ret = enqueue_hrtimer(timer, new_base, mode);
+	if (reprogram_old_base) {
+		hrtimer_force_reprogram(base->cpu_base, 1);
+		ret = 0;
+	}
+	return ret;
 }
 
 /**