Re: [PATCH] watchdog: Use a reference cycle counter to avoid scaling issues

Don Zickus <dzickus@xxxxxxxxxx> · Mon, 20 Apr 2015 11:17:32 -0400

On Mon, Apr 20, 2015 at 02:32:19PM +0300, Alexander Shishkin wrote:
> The problem with using cycle counter for NMI watchdog is that its
> frequency changes with the corresponding core's frequency. This means
> that, in particular, if the core frequency scales up, watchdog NMI will
> arrive more frequently than what user requested through watchdog_thresh
> and also increasing the probability of setting off the hardlockup detector,
> because the corresponding hrtimer will keep firing at the same intervals
> regardless of the core frequency. And, if the core can turbo to up to 2.5x
> its base frequency (and therefore TSC) [1], we'll have the hrtimer and NMI
> counter firing at the same frequency; anything that disables interrupts at
> an unfortunate moment can set off the hardlockup detector then.

Yes!  Thank you!  I have been procrastinating fixing this for years because
I thought I had to hook into the cpufreq code or something.

> 
> The proposed solution is to use reference cycle counter instead, which is
> guaranteed to run at the same frequency regardless of the cpu speed. This
> will also ensure that the watchdog_thresh timeout value is honored.
> 
> One issue with the reference counter is that its frequency is not the same
> as that of tsc on older cpu models and therefore it needs calibration.
> 
> This patch adds code to calibrate ref-cycles counter using an hrtimer and
> a perf counter, which runs in parallel with the rest of kernel init
> sequence. It also removes the architecture-specific hw_nmi_* code, in favor
> of calculating the NMI sample period in the architecture independent code,
> using the cpu_khz variable. I notice that powerpc just added
> HARDLOCKUP_DETECTOR support, so this change is relevant to them as well.

Wow, added bonus!!  Thank you again!  And the extra piece of making this
arch-independent is awesome too.

Many kudos for this work.  I think it looks fine for the most part.  I have
to think about some of the subtleties (like watchdog_user inside the finish
func and the calibration in general), but I like it. :-)

Thanks again!

Cheers,
Don

> 
> [1] http://ark.intel.com/products/83610/Intel-Core-M-5Y10-Processor-4M-Cache-up-to-2_00-GHz
> 
> Signed-off-by: Alexander Shishkin <alexander.shishkin@xxxxxxxxxxxxxxx>
> Cc: stable@xxxxxxxxxxxxxxx
> ---
>  arch/x86/kernel/apic/hw_nmi.c |   7 ---
>  include/linux/nmi.h           |   2 -
>  kernel/watchdog.c             | 128 +++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 125 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
> index 6a1e71bde3..f9191e918f 100644
> --- a/arch/x86/kernel/apic/hw_nmi.c
> +++ b/arch/x86/kernel/apic/hw_nmi.c
> @@ -19,13 +19,6 @@
>  #include <linux/module.h>
>  #include <linux/delay.h>
>  
> -#ifdef CONFIG_HARDLOCKUP_DETECTOR
> -u64 hw_nmi_get_sample_period(int watchdog_thresh)
> -{
> -	return (u64)(cpu_khz) * 1000 * watchdog_thresh;
> -}
> -#endif
> -
>  #ifdef arch_trigger_all_cpu_backtrace
>  /* For reliability, we're prepared to waste bits here. */
>  static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
> diff --git a/include/linux/nmi.h b/include/linux/nmi.h
> index 1d2a6ab6b8..139dea7b93 100644
> --- a/include/linux/nmi.h
> +++ b/include/linux/nmi.h
> @@ -53,8 +53,6 @@ static inline bool trigger_allbutself_cpu_backtrace(void)
>  #endif
>  
>  #ifdef CONFIG_LOCKUP_DETECTOR
> -int hw_nmi_is_cpu_stuck(struct pt_regs *);
> -u64 hw_nmi_get_sample_period(int watchdog_thresh);
>  extern int watchdog_user_enabled;
>  extern int watchdog_thresh;
>  extern int sysctl_softlockup_all_cpu_backtrace;
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index df5494edf6..6b8f7a7c4d 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -219,15 +219,130 @@ static int is_softlockup(unsigned long touch_ts)
>  }
>  
>  #ifdef CONFIG_HARDLOCKUP_DETECTOR
> -
>  static struct perf_event_attr wd_hw_attr = {
>  	.type		= PERF_TYPE_HARDWARE,
> -	.config		= PERF_COUNT_HW_CPU_CYCLES,
> +	.config		= PERF_COUNT_HW_REF_CPU_CYCLES,
>  	.size		= sizeof(struct perf_event_attr),
>  	.pinned		= 1,
>  	.disabled	= 1,
>  };
>  
> +/*
> + * Reference cycle counter calibration
> + *
> + * In order for nmi watchdog to be bound to the flow of time instead
> + * of cycles, which changes with cpu frequency scaling, we need to use
> + * PERF_COUNT_HW_REF_CPU_CYCLES for it. And for this we need to know its
> + * resolution since it's not necessarily the same as that of TSC.
> + *
> + * Thus, reference clock counter is calibrated using a perf counter and
> + * an hrtimer taking samples of the former's overflow counter. When the
> + * hrtimer callback has enough samples, it kicks off a work, which does
> + * the "math" and kickstarts the NMI watchdog if needed.
> + *
> + * This method of calibration does not give stellar precision, but should
> + * be enough for a watchdog timer. And it runs in parallel to the rest of
> + * the bootup sequence. The main factor contributing to the error in this
> + * calibration method is the nmi handling path leading up to the overflow
> + * handler, that is a greater REF_CAL_SAMPLE_CYCLES value gives better
> + * precision.
> + */
> +
> +/* PERF_COUNT_HW_REF_CPU_CYCLES timer resolution */
> +static unsigned long	wd_hw_ref_khz;
> +static local_t		wd_hw_ref_cycles;
> +static struct hrtimer	wd_hw_cal_timer;
> +static struct perf_event *wd_hw_cal_event;
> +
> +#define REF_CAL_POINTS 9
> +static unsigned long ref_cal_point[REF_CAL_POINTS];
> +static unsigned int ref_cal_points;
> +
> +#define REF_CAL_SAMPLE_CYCLES	1000000
> +#define REF_CAL_MS		100
> +#define REF_CAL_PERIOD		(REF_CAL_MS * 1000000)
> +
> +static void wd_hw_cal_overflow(struct perf_event *event,
> +			       struct perf_sample_data *data,
> +			       struct pt_regs *regs)
> +{
> +	event->hw.interrupts = 0;
> +	local_inc(&wd_hw_ref_cycles);
> +}
> +
> +static void watchdog_calibration_failed(void)
> +{
> +	wd_hw_ref_khz = cpu_khz;
> +	wd_hw_attr.config = PERF_COUNT_HW_CPU_CYCLES;
> +	pr_warn("reference counter calibration failed, falling back to cycle counter for NMI watchdog\n");
> +}
> +
> +static int watchdog_enable_all_cpus(bool sample_period_changed);
> +
> +static void watchdog_finish_ref_calibration(struct work_struct *work)
> +{
> +	unsigned int point;
> +	unsigned long diff;
> +
> +	hrtimer_cancel(&wd_hw_cal_timer);
> +	perf_event_release_kernel(wd_hw_cal_event);
> +
> +	for (diff = 0, point = 0; point < REF_CAL_POINTS - 1; point++)
> +		diff += ref_cal_point[point + 1] - ref_cal_point[point];
> +
> +	diff /= REF_CAL_POINTS - 1;
> +
> +	wd_hw_ref_khz = diff * REF_CAL_SAMPLE_CYCLES / REF_CAL_MS;
> +	if (!wd_hw_ref_khz)
> +		watchdog_calibration_failed();
> +
> +	pr_info("reference clock frequency: %ldkHz\n", wd_hw_ref_khz);
> +
> +	if (watchdog_user_enabled)
> +		watchdog_enable_all_cpus(false);
> +}
> +
> +static DECLARE_WORK(wd_hw_ref_work, watchdog_finish_ref_calibration);
> +
> +static enum hrtimer_restart wd_hw_cal_timer_fn(struct hrtimer *hrtimer)
> +{
> +	hrtimer_forward_now(hrtimer, ns_to_ktime(REF_CAL_PERIOD));
> +	ref_cal_point[ref_cal_points] =	local_read(&wd_hw_ref_cycles);
> +
> +	if (++ref_cal_points < REF_CAL_POINTS)
> +		return HRTIMER_RESTART;
> +
> +	schedule_work(&wd_hw_ref_work);
> +
> +	return HRTIMER_NORESTART;
> +}
> +
> +/*
> + * Calibrate HW_REF_CYCLE counter for NMI watchdog
> + */
> +static int watchdog_calibrate(void)
> +{
> +	hrtimer_init(&wd_hw_cal_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> +	wd_hw_cal_timer.function = wd_hw_cal_timer_fn;
> +
> +	wd_hw_attr.sample_period = REF_CAL_SAMPLE_CYCLES;
> +	wd_hw_cal_event = perf_event_create_kernel_counter(&wd_hw_attr,
> +							   smp_processor_id(),
> +							   NULL,
> +							   wd_hw_cal_overflow,
> +							   NULL);
> +	if (IS_ERR(wd_hw_cal_event)) {
> +		watchdog_calibration_failed();
> +		return PTR_ERR(wd_hw_cal_event);
> +	}
> +
> +	perf_event_enable(wd_hw_cal_event);
> +	hrtimer_start(&wd_hw_cal_timer, ns_to_ktime(REF_CAL_PERIOD),
> +		      HRTIMER_MODE_REL_PINNED);
> +
> +	return 0;
> +}
> +
>  /* Callback function for perf event subsystem */
>  static void watchdog_overflow_callback(struct perf_event *event,
>  		 struct perf_sample_data *data,
> @@ -455,7 +570,7 @@ static int watchdog_nmi_enable(unsigned int cpu)
>  		goto out_enable;
>  
>  	wd_attr = &wd_hw_attr;
> -	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
> +	wd_attr->sample_period = wd_hw_ref_khz * 1000 * watchdog_thresh;
>  
>  	/* Try to register using hardware perf events */
>  	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
> @@ -641,6 +756,13 @@ void __init lockup_detector_init(void)
>  {
>  	set_sample_period();
>  
> +	/*
> +	 * watchdog calibration work will take care of the rest when
> +	 * the calibration is done
> +	 */
> +	if (!watchdog_calibrate())
> +		return;
> +
>  	if (watchdog_user_enabled)
>  		watchdog_enable_all_cpus(false);
>  }
> -- 
> 2.1.4
> 
--
To unsubscribe from this list: send the line "unsubscribe stable" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html