On Mon, Apr 20, 2015 at 02:32:19PM +0300, Alexander Shishkin wrote:
> The problem with using the cycle counter for the NMI watchdog is that its
> frequency changes with the corresponding core's frequency. This means
> that, in particular, if the core frequency scales up, the watchdog NMI
> will arrive more frequently than the user requested through
> watchdog_thresh, which also increases the probability of setting off the
> hardlockup detector, because the corresponding hrtimer will keep firing
> at the same intervals regardless of the core frequency. And, if the core
> can turbo up to 2.5x its base frequency (and therefore the TSC
> frequency) [1], we'll have the hrtimer and the NMI counter firing at the
> same frequency; anything that disables interrupts at an unfortunate
> moment can then set off the hardlockup detector.

Hi Alexander,

Can you respin this to be on top of Linus's latest tree? There are enough
code changes in kernel/watchdog.c that I was hoping you could double-check
to make sure it still works.

I also worked through the calibration arithmetic below the patch, as a
sanity check on the numbers.

Cheers,
Don

>
> The proposed solution is to use the reference cycle counter instead,
> which is guaranteed to run at the same frequency regardless of the cpu
> speed. This will also ensure that the watchdog_thresh timeout value is
> honored.
>
> One issue with the reference counter is that its frequency is not the
> same as that of the TSC on older cpu models, and therefore it needs
> calibration.
>
> This patch adds code to calibrate the ref-cycles counter using an hrtimer
> and a perf counter, which runs in parallel with the rest of the kernel
> init sequence. It also removes the architecture-specific hw_nmi_* code in
> favor of calculating the NMI sample period in architecture-independent
> code, using the cpu_khz variable. I notice that powerpc just added
> HARDLOCKUP_DETECTOR support, so this change is relevant to them as well.
>
> [1] http://ark.intel.com/products/83610/Intel-Core-M-5Y10-Processor-4M-Cache-up-to-2_00-GHz
>
> Signed-off-by: Alexander Shishkin <alexander.shishkin@xxxxxxxxxxxxxxx>
> Cc: stable@xxxxxxxxxxxxxxx
> ---
>  arch/x86/kernel/apic/hw_nmi.c |   7 ---
>  include/linux/nmi.h           |   2 -
>  kernel/watchdog.c             | 128 +++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 125 insertions(+), 12 deletions(-)
>
> diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
> index 6a1e71bde3..f9191e918f 100644
> --- a/arch/x86/kernel/apic/hw_nmi.c
> +++ b/arch/x86/kernel/apic/hw_nmi.c
> @@ -19,13 +19,6 @@
>  #include <linux/module.h>
>  #include <linux/delay.h>
>  
> -#ifdef CONFIG_HARDLOCKUP_DETECTOR
> -u64 hw_nmi_get_sample_period(int watchdog_thresh)
> -{
> -	return (u64)(cpu_khz) * 1000 * watchdog_thresh;
> -}
> -#endif
> -
>  #ifdef arch_trigger_all_cpu_backtrace
>  /* For reliability, we're prepared to waste bits here. */
>  static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
> diff --git a/include/linux/nmi.h b/include/linux/nmi.h
> index 1d2a6ab6b8..139dea7b93 100644
> --- a/include/linux/nmi.h
> +++ b/include/linux/nmi.h
> @@ -53,8 +53,6 @@ static inline bool trigger_allbutself_cpu_backtrace(void)
>  #endif
>  
>  #ifdef CONFIG_LOCKUP_DETECTOR
> -int hw_nmi_is_cpu_stuck(struct pt_regs *);
> -u64 hw_nmi_get_sample_period(int watchdog_thresh);
>  extern int watchdog_user_enabled;
>  extern int watchdog_thresh;
>  extern int sysctl_softlockup_all_cpu_backtrace;
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index df5494edf6..6b8f7a7c4d 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -219,15 +219,130 @@ static int is_softlockup(unsigned long touch_ts)
>  }
>  
>  #ifdef CONFIG_HARDLOCKUP_DETECTOR
> -
>  static struct perf_event_attr wd_hw_attr = {
>  	.type		= PERF_TYPE_HARDWARE,
> -	.config		= PERF_COUNT_HW_CPU_CYCLES,
> +	.config		= PERF_COUNT_HW_REF_CPU_CYCLES,
>  	.size		= sizeof(struct perf_event_attr),
>  	.pinned		= 1,
>  	.disabled	= 1,
>  };
>  
> +/*
> + * Reference cycle counter calibration
> + *
> + * In order for nmi watchdog to be bound to the flow of time instead
> + * of cycles, which changes with cpu frequency scaling, we need to use
> + * PERF_COUNT_HW_REF_CPU_CYCLES for it. And for this we need to know its
> + * resolution since it's not necessarily the same as that of TSC.
> + *
> + * Thus, reference clock counter is calibrated using a perf counter and
> + * an hrtimer taking samples of the former's overflow counter. When the
> + * hrtimer callback has enough samples, it kicks off a work, which does
> + * the "math" and kickstarts the NMI watchdog if needed.
> + *
> + * This method of calibration does not give stellar precision, but should
> + * be enough for a watchdog timer. And it runs in parallel to the rest of
> + * the bootup sequence. The main factor contributing to the error in this
> + * calibration method is the nmi handling path leading up to the overflow
> + * handler, that is a greater REF_CAL_SAMPLE_CYCLES value gives better
> + * precision.
> + */
> +
> +/* PERF_COUNT_HW_REF_CPU_CYCLES timer resolution */
> +static unsigned long wd_hw_ref_khz;
> +static local_t wd_hw_ref_cycles;
> +static struct hrtimer wd_hw_cal_timer;
> +static struct perf_event *wd_hw_cal_event;
> +
> +#define REF_CAL_POINTS		9
> +static unsigned long ref_cal_point[REF_CAL_POINTS];
> +static unsigned int ref_cal_points;
> +
> +#define REF_CAL_SAMPLE_CYCLES	1000000
> +#define REF_CAL_MS		100
> +#define REF_CAL_PERIOD		(REF_CAL_MS * 1000000)
> +
> +static void wd_hw_cal_overflow(struct perf_event *event,
> +			       struct perf_sample_data *data,
> +			       struct pt_regs *regs)
> +{
> +	event->hw.interrupts = 0;
> +	local_inc(&wd_hw_ref_cycles);
> +}
> +
> +static void watchdog_calibration_failed(void)
> +{
> +	wd_hw_ref_khz = cpu_khz;
> +	wd_hw_attr.config = PERF_COUNT_HW_CPU_CYCLES;
> +	pr_warn("reference counter calibration failed, falling back to cycle counter for NMI watchdog\n");
> +}
> +
> +static int watchdog_enable_all_cpus(bool sample_period_changed);
> +
> +static void watchdog_finish_ref_calibration(struct work_struct *work)
> +{
> +	unsigned int point;
> +	unsigned long diff;
> +
> +	hrtimer_cancel(&wd_hw_cal_timer);
> +	perf_event_release_kernel(wd_hw_cal_event);
> +
> +	for (diff = 0, point = 0; point < REF_CAL_POINTS - 1; point++)
> +		diff += ref_cal_point[point + 1] - ref_cal_point[point];
> +
> +	diff /= REF_CAL_POINTS - 1;
> +
> +	wd_hw_ref_khz = diff * REF_CAL_SAMPLE_CYCLES / REF_CAL_MS;
> +	if (!wd_hw_ref_khz)
> +		watchdog_calibration_failed();
> +
> +	pr_info("reference clock frequency: %ldkHz\n", wd_hw_ref_khz);
> +
> +	if (watchdog_user_enabled)
> +		watchdog_enable_all_cpus(false);
> +}
> +
> +static DECLARE_WORK(wd_hw_ref_work, watchdog_finish_ref_calibration);
> +
> +static enum hrtimer_restart wd_hw_cal_timer_fn(struct hrtimer *hrtimer)
> +{
> +	hrtimer_forward_now(hrtimer, ns_to_ktime(REF_CAL_PERIOD));
> +	ref_cal_point[ref_cal_points] = local_read(&wd_hw_ref_cycles);
> +
> +	if (++ref_cal_points < REF_CAL_POINTS)
> +		return HRTIMER_RESTART;
> +
> +	schedule_work(&wd_hw_ref_work);
> +
> +	return HRTIMER_NORESTART;
> +}
> +
> +/*
> + * Calibrate HW_REF_CYCLE counter for NMI watchdog
> + */
> +static int watchdog_calibrate(void)
> +{
> +	hrtimer_init(&wd_hw_cal_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> +	wd_hw_cal_timer.function = wd_hw_cal_timer_fn;
> +
> +	wd_hw_attr.sample_period = REF_CAL_SAMPLE_CYCLES;
> +	wd_hw_cal_event = perf_event_create_kernel_counter(&wd_hw_attr,
> +							   smp_processor_id(),
> +							   NULL,
> +							   wd_hw_cal_overflow,
> +							   NULL);
> +	if (IS_ERR(wd_hw_cal_event)) {
> +		watchdog_calibration_failed();
> +		return PTR_ERR(wd_hw_cal_event);
> +	}
> +
> +	perf_event_enable(wd_hw_cal_event);
> +	hrtimer_start(&wd_hw_cal_timer, ns_to_ktime(REF_CAL_PERIOD),
> +		      HRTIMER_MODE_REL_PINNED);
> +
> +	return 0;
> +}
> +
>  /* Callback function for perf event subsystem */
>  static void watchdog_overflow_callback(struct perf_event *event,
>  		 struct perf_sample_data *data,
> @@ -455,7 +570,7 @@ static int watchdog_nmi_enable(unsigned int cpu)
>  		goto out_enable;
>  
>  	wd_attr = &wd_hw_attr;
> -	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
> +	wd_attr->sample_period = wd_hw_ref_khz * 1000 * watchdog_thresh;
>  
>  	/* Try to register using hardware perf events */
>  	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
> @@ -641,6 +756,13 @@ void __init lockup_detector_init(void)
>  {
>  	set_sample_period();
>  
> +	/*
> +	 * watchdog calibration work will take care of the rest when
> +	 * the calibration is done
> +	 */
> +	if (!watchdog_calibrate())
> +		return;
> +
>  	if (watchdog_user_enabled)
>  		watchdog_enable_all_cpus(false);
>  }
> -- 
> 2.1.4
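
For reference, here is the calibration arithmetic worked through as a
standalone userspace sketch. The REF_CAL_* constants and the two formulas
(wd_hw_ref_khz in watchdog_finish_ref_calibration() and sample_period in
watchdog_nmi_enable()) are taken from the patch; the overflow counts are
made-up numbers for a hypothetical ~2.4 GHz reference clock, not
measurements:

/*
 * Sketch of the ref-cycle calibration math; REF_CAL_* values and the
 * formulas mirror the patch, the sample data below is hypothetical.
 */
#include <stdio.h>

#define REF_CAL_POINTS		9
#define REF_CAL_SAMPLE_CYCLES	1000000UL	/* ref cycles per overflow NMI */
#define REF_CAL_MS		100UL		/* hrtimer sampling interval */

int main(void)
{
	/* hypothetical wd_hw_ref_cycles snapshots, one per REF_CAL_MS */
	unsigned long ref_cal_point[REF_CAL_POINTS] = {
		240, 480, 720, 960, 1200, 1440, 1680, 1920, 2160
	};
	unsigned long diff = 0, wd_hw_ref_khz;
	unsigned long long sample_period;
	unsigned int point;
	int watchdog_thresh = 10;	/* default threshold, in seconds */

	/* average number of counter overflows per REF_CAL_MS window */
	for (point = 0; point < REF_CAL_POINTS - 1; point++)
		diff += ref_cal_point[point + 1] - ref_cal_point[point];
	diff /= REF_CAL_POINTS - 1;

	/* overflows * cycles-per-overflow / ms == ref cycles per ms == kHz */
	wd_hw_ref_khz = diff * REF_CAL_SAMPLE_CYCLES / REF_CAL_MS;

	/* what watchdog_nmi_enable() would then program as the NMI period */
	sample_period = (unsigned long long)wd_hw_ref_khz * 1000 * watchdog_thresh;

	printf("wd_hw_ref_khz = %lu kHz\n", wd_hw_ref_khz);		/* 2400000 */
	printf("sample_period = %llu ref cycles\n", sample_period);	/* 24000000000 */

	return 0;
}

With those numbers the default 10-second watchdog_thresh works out to a
24e9 ref-cycle sample period, independent of turbo, which is what I would
expect from the patch.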