[PATCH] watchdog: Use a reference cycle counter to avoid scaling issues

The problem with using the cycle counter for the NMI watchdog is that
its frequency changes with the corresponding core's frequency. In
particular, when the core frequency scales up, the watchdog NMI arrives
more often than the user requested through watchdog_thresh, and the
probability of falsely triggering the hardlockup detector also
increases, because the corresponding hrtimer keeps firing at the same
intervals regardless of the core frequency. If the core can turbo to up
to 2.5x its base (and therefore TSC) frequency [1], the hrtimer and the
NMI counter end up firing at the same frequency; anything that disables
interrupts at an unfortunate moment can then set off the hardlockup
detector.
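
As a rough illustration with the defaults (watchdog_thresh = 10, the
watchdog hrtimer firing every 2 * watchdog_thresh / 5 = 4 seconds):

	NMI period at base frequency: cpu_khz * 1000 * 10 cycles == 10s
	NMI period at 2.5x turbo:     the same cycle count       ==  4s

That leaves exactly one hrtimer period between consecutive NMIs, so a
single hrtimer tick delayed by a stretch of disabled interrupts is
enough for is_hardlockup() to see a stale interrupt count.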

The proposed solution is to use the reference cycle counter instead,
which is guaranteed to run at a constant frequency regardless of the
cpu speed. This also ensures that the watchdog_thresh timeout value is
honored.
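
Concretely, the per-cpu NMI event period (see the watchdog_nmi_enable()
hunk below) becomes

	wd_attr->sample_period = (u64)wd_hw_ref_khz * 1000 * watchdog_thresh;

which amounts to watchdog_thresh seconds of wall time at any core
frequency.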

One issue with the reference counter is that on older cpu models its
frequency is not the same as that of the TSC, so it needs calibration.

This patch adds code that calibrates the ref-cycles counter using an
hrtimer and a perf counter; the calibration runs in parallel with the
rest of the kernel init sequence. It also removes the
architecture-specific hw_nmi_* code in favor of calculating the NMI
sample period in architecture-independent code, using the cpu_khz
variable. Since powerpc has just gained HARDLOCKUP_DETECTOR support,
this change is relevant there as well.
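
In short, the math in the calibration work boils down to this (names
not taken from the patch are illustrative):

	overflows = (ref_cal_point[N - 1] - ref_cal_point[0]) / (N - 1);
	wd_hw_ref_khz = overflows * REF_CAL_SAMPLE_CYCLES / REF_CAL_MS;

Each overflow stands for REF_CAL_SAMPLE_CYCLES (1000000) reference
cycles, and the overflow count is sampled every REF_CAL_MS (100)
milliseconds, so each overflow observed per sampling period contributes
10000 kHz to the result.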

[1] http://ark.intel.com/products/83610/Intel-Core-M-5Y10-Processor-4M-Cache-up-to-2_00-GHz

Signed-off-by: Alexander Shishkin <alexander.shishkin@xxxxxxxxxxxxxxx>
Cc: stable@xxxxxxxxxxxxxxx
---
 arch/x86/kernel/apic/hw_nmi.c |   7 ---
 include/linux/nmi.h           |   2 -
 kernel/watchdog.c             | 128 +++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 125 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 6a1e71bde3..f9191e918f 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -19,13 +19,6 @@
 #include <linux/module.h>
 #include <linux/delay.h>
 
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-u64 hw_nmi_get_sample_period(int watchdog_thresh)
-{
-	return (u64)(cpu_khz) * 1000 * watchdog_thresh;
-}
-#endif
-
 #ifdef arch_trigger_all_cpu_backtrace
 /* For reliability, we're prepared to waste bits here. */
 static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 1d2a6ab6b8..139dea7b93 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -53,8 +53,6 @@ static inline bool trigger_allbutself_cpu_backtrace(void)
 #endif
 
 #ifdef CONFIG_LOCKUP_DETECTOR
-int hw_nmi_is_cpu_stuck(struct pt_regs *);
-u64 hw_nmi_get_sample_period(int watchdog_thresh);
 extern int watchdog_user_enabled;
 extern int watchdog_thresh;
 extern int sysctl_softlockup_all_cpu_backtrace;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index df5494edf6..6b8f7a7c4d 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -219,15 +219,130 @@ static int is_softlockup(unsigned long touch_ts)
 }
 
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
-
 static struct perf_event_attr wd_hw_attr = {
 	.type		= PERF_TYPE_HARDWARE,
-	.config		= PERF_COUNT_HW_CPU_CYCLES,
+	.config		= PERF_COUNT_HW_REF_CPU_CYCLES,
 	.size		= sizeof(struct perf_event_attr),
 	.pinned		= 1,
 	.disabled	= 1,
 };
 
+/*
+ * Reference cycle counter calibration
+ *
+ * In order for the NMI watchdog to be bound to the flow of time instead
+ * of cycles, whose rate changes with cpu frequency scaling, we need to
+ * use PERF_COUNT_HW_REF_CPU_CYCLES for it. For that, we need to know
+ * the reference counter's frequency, which is not necessarily the same
+ * as that of the TSC.
+ *
+ * Thus, the reference cycle counter is calibrated using a perf counter
+ * and an hrtimer that takes samples of the former's overflow count.
+ * When the hrtimer callback has enough samples, it schedules a work
+ * item that does the math and kickstarts the NMI watchdog if needed.
+ *
+ * This method of calibration does not give stellar precision, but it
+ * should be enough for a watchdog timer, and it runs in parallel with
+ * the rest of the bootup sequence. The main source of error is the NMI
+ * handling path leading up to the overflow handler, so a greater
+ * REF_CAL_SAMPLE_CYCLES value gives better precision.
+ */
+
+/* PERF_COUNT_HW_REF_CPU_CYCLES frequency, in kHz */
+static unsigned long	wd_hw_ref_khz;
+static local_t		wd_hw_ref_cycles;
+static struct hrtimer	wd_hw_cal_timer;
+static struct perf_event *wd_hw_cal_event;
+
+#define REF_CAL_POINTS 9
+static unsigned long ref_cal_point[REF_CAL_POINTS];
+static unsigned int ref_cal_points;
+
+#define REF_CAL_SAMPLE_CYCLES	1000000
+#define REF_CAL_MS		100
+#define REF_CAL_PERIOD		(REF_CAL_MS * 1000000)
+
+static void wd_hw_cal_overflow(struct perf_event *event,
+			       struct perf_sample_data *data,
+			       struct pt_regs *regs)
+{
+	event->hw.interrupts = 0;
+	local_inc(&wd_hw_ref_cycles);
+}
+
+static void watchdog_calibration_failed(void)
+{
+	wd_hw_ref_khz = cpu_khz;
+	wd_hw_attr.config = PERF_COUNT_HW_CPU_CYCLES;
+	pr_warn("reference counter calibration failed, falling back to cycle counter for NMI watchdog\n");
+}
+
+static int watchdog_enable_all_cpus(bool sample_period_changed);
+
+static void watchdog_finish_ref_calibration(struct work_struct *work)
+{
+	unsigned int point;
+	unsigned long diff;
+
+	hrtimer_cancel(&wd_hw_cal_timer);
+	perf_event_release_kernel(wd_hw_cal_event);
+
+	for (diff = 0, point = 0; point < REF_CAL_POINTS - 1; point++)
+		diff += ref_cal_point[point + 1] - ref_cal_point[point];
+
+	diff /= REF_CAL_POINTS - 1;
+
+	wd_hw_ref_khz = diff * REF_CAL_SAMPLE_CYCLES / REF_CAL_MS;
+	if (!wd_hw_ref_khz)
+		watchdog_calibration_failed();
+	else
+		pr_info("reference clock frequency: %lu kHz\n", wd_hw_ref_khz);
+
+	if (watchdog_user_enabled)
+		watchdog_enable_all_cpus(false);
+}
+
+static DECLARE_WORK(wd_hw_ref_work, watchdog_finish_ref_calibration);
+
+static enum hrtimer_restart wd_hw_cal_timer_fn(struct hrtimer *hrtimer)
+{
+	hrtimer_forward_now(hrtimer, ns_to_ktime(REF_CAL_PERIOD));
+	ref_cal_point[ref_cal_points] = local_read(&wd_hw_ref_cycles);
+
+	if (++ref_cal_points < REF_CAL_POINTS)
+		return HRTIMER_RESTART;
+
+	schedule_work(&wd_hw_ref_work);
+
+	return HRTIMER_NORESTART;
+}
+
+/*
+ * Calibrate HW_REF_CYCLE counter for NMI watchdog
+ */
+static int watchdog_calibrate(void)
+{
+	hrtimer_init(&wd_hw_cal_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	wd_hw_cal_timer.function = wd_hw_cal_timer_fn;
+
+	wd_hw_attr.sample_period = REF_CAL_SAMPLE_CYCLES;
+	wd_hw_cal_event = perf_event_create_kernel_counter(&wd_hw_attr,
+							   smp_processor_id(),
+							   NULL,
+							   wd_hw_cal_overflow,
+							   NULL);
+	if (IS_ERR(wd_hw_cal_event)) {
+		watchdog_calibration_failed();
+		return PTR_ERR(wd_hw_cal_event);
+	}
+
+	perf_event_enable(wd_hw_cal_event);
+	hrtimer_start(&wd_hw_cal_timer, ns_to_ktime(REF_CAL_PERIOD),
+		      HRTIMER_MODE_REL_PINNED);
+
+	return 0;
+}
+
 /* Callback function for perf event subsystem */
 static void watchdog_overflow_callback(struct perf_event *event,
 		 struct perf_sample_data *data,
@@ -455,7 +570,7 @@ static int watchdog_nmi_enable(unsigned int cpu)
 		goto out_enable;
 
 	wd_attr = &wd_hw_attr;
-	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
+	wd_attr->sample_period = (u64)wd_hw_ref_khz * 1000 * watchdog_thresh;
 
 	/* Try to register using hardware perf events */
 	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
@@ -641,6 +756,13 @@ void __init lockup_detector_init(void)
 {
 	set_sample_period();
 
+	/*
+	 * The watchdog calibration work will take care of enabling the
+	 * watchdog once the calibration is done
+	 */
+	if (!watchdog_calibrate())
+		return;
+
 	if (watchdog_user_enabled)
 		watchdog_enable_all_cpus(false);
 }
-- 
2.1.4
