On Fri May 5, 2023 at 8:13 AM AEST, Douglas Anderson wrote: > The perf hardlockup detector works by looking at interrupt counts and > seeing if they change from run to run. The interrupt counts are > managed by the common watchdog code via its watchdog_timer_fn(). > > Currently the API between the perf detector and the common code is a > function: is_hardlockup(). When the hard lockup detector sees that > function return true then it handles printing out debug info and > inducing a panic if necessary. > > Let's change the API a little bit in preparation for the buddy > hardlockup detector. The buddy hardlockup detector wants to print I think the name change is gratuitous. Especially since it's now static. watchdog_hardlockup_ is a pretty long prefix too; hardlockup_ should be enough? Seems okay otherwise though. Thanks, Nick > nearly the same debug info and have nearly the same panic > behavior. That means we want to move all that code to the common > file. For now, the code in the common file will only be there if the > perf hardlockup detector is enabled, but eventually it will be > selected by a common config. > > Right now, this _just_ moves the code from the perf detector file to > the common file and changes the names. It doesn't make the changes > that the buddy hardlockup detector will need and doesn't do any style > cleanups. A future patch will do cleanup to make it more obvious what > changed. > > With the above, we no longer have any callers of is_hardlockup() > outside of the "watchdog.c" file, so we can remove it from the header, > make it static, move it to the same "#ifdef" block as our new > watchdog_hardlockup_check(), and rename it to make it obvious it's > just for hardlockup detectors. While doing this, it can be noted that > even if no hardlockup detectors were configured the existing code used > to still have the code for counting/checking "hrtimer_interrupts" even > if the perf hardlockup detector wasn't configured. 
We didn't need to > do that, so move all the "hrtimer_interrupts" counting to only be > there if the perf hardlockup detector is configured as well. > > This change is expected to be a no-op. > > Signed-off-by: Douglas Anderson <dianders@xxxxxxxxxxxx> > --- > > Changes in v4: > - ("Move perf hardlockup checking/panic ...") new for v4. > > include/linux/nmi.h | 5 ++- > kernel/watchdog.c | 92 +++++++++++++++++++++++++++++++++--------- > kernel/watchdog_perf.c | 42 +------------------ > 3 files changed, 78 insertions(+), 61 deletions(-) > > diff --git a/include/linux/nmi.h b/include/linux/nmi.h > index 35d09d70f394..c6cb9bc5dc80 100644 > --- a/include/linux/nmi.h > +++ b/include/linux/nmi.h > @@ -15,7 +15,6 @@ > void lockup_detector_init(void); > void lockup_detector_soft_poweroff(void); > void lockup_detector_cleanup(void); > -bool is_hardlockup(void); > > extern int watchdog_user_enabled; > extern int nmi_watchdog_user_enabled; > @@ -88,6 +87,10 @@ extern unsigned int hardlockup_panic; > static inline void hardlockup_detector_disable(void) {} > #endif > > +#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF) > +void watchdog_hardlockup_check(struct pt_regs *regs); > +#endif > + > #if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) > # define NMI_WATCHDOG_SYSCTL_PERM 0644 > #else > diff --git a/kernel/watchdog.c b/kernel/watchdog.c > index c705a18b26bf..2d319cdf64b9 100644 > --- a/kernel/watchdog.c > +++ b/kernel/watchdog.c > @@ -85,6 +85,78 @@ __setup("nmi_watchdog=", hardlockup_panic_setup); > > #endif /* CONFIG_HARDLOCKUP_DETECTOR */ > > +#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF) > + > +static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); > +static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); > +static DEFINE_PER_CPU(bool, hard_watchdog_warn); > +static unsigned long hardlockup_allcpu_dumped; > + > +static bool watchdog_hardlockup_is_lockedup(void) > +{ > + unsigned long hrint = __this_cpu_read(hrtimer_interrupts); > + > + if 
(__this_cpu_read(hrtimer_interrupts_saved) == hrint) > + return true; > + > + __this_cpu_write(hrtimer_interrupts_saved, hrint); > + return false; > +} > + > +static void watchdog_hardlockup_interrupt_count(void) > +{ > + __this_cpu_inc(hrtimer_interrupts); > +} > + > +void watchdog_hardlockup_check(struct pt_regs *regs) > +{ > + /* check for a hardlockup > + * This is done by making sure our timer interrupt > + * is incrementing. The timer interrupt should have > + * fired multiple times before we overflow'd. If it hasn't > + * then this is a good indication the cpu is stuck > + */ > + if (watchdog_hardlockup_is_lockedup()) { > + int this_cpu = smp_processor_id(); > + > + /* only print hardlockups once */ > + if (__this_cpu_read(hard_watchdog_warn) == true) > + return; > + > + pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", > + this_cpu); > + print_modules(); > + print_irqtrace_events(current); > + if (regs) > + show_regs(regs); > + else > + dump_stack(); > + > + /* > + * Perform all-CPU dump only once to avoid multiple hardlockups > + * generating interleaving traces > + */ > + if (sysctl_hardlockup_all_cpu_backtrace && > + !test_and_set_bit(0, &hardlockup_allcpu_dumped)) > + trigger_allbutself_cpu_backtrace(); > + > + if (hardlockup_panic) > + nmi_panic(regs, "Hard LOCKUP"); > + > + __this_cpu_write(hard_watchdog_warn, true); > + return; > + } > + > + __this_cpu_write(hard_watchdog_warn, false); > + return; > +} > + > +#else /* CONFIG_HARDLOCKUP_DETECTOR_PERF */ > + > +static inline void watchdog_hardlockup_interrupt_count(void) { } > + > +#endif /* !CONFIG_HARDLOCKUP_DETECTOR_PERF */ > + > /* > * These functions can be overridden if an architecture implements its > * own hardlockup detector. 
> @@ -176,8 +248,6 @@ static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); > static DEFINE_PER_CPU(unsigned long, watchdog_report_ts); > static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); > static DEFINE_PER_CPU(bool, softlockup_touch_sync); > -static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); > -static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); > static unsigned long soft_lockup_nmi_warn; > > static int __init nowatchdog_setup(char *str) > @@ -312,22 +382,6 @@ static int is_softlockup(unsigned long touch_ts, > } > > /* watchdog detector functions */ > -bool is_hardlockup(void) > -{ > - unsigned long hrint = __this_cpu_read(hrtimer_interrupts); > - > - if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) > - return true; > - > - __this_cpu_write(hrtimer_interrupts_saved, hrint); > - return false; > -} > - > -static void watchdog_interrupt_count(void) > -{ > - __this_cpu_inc(hrtimer_interrupts); > -} > - > static DEFINE_PER_CPU(struct completion, softlockup_completion); > static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work); > > @@ -359,7 +413,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) > return HRTIMER_NORESTART; > > /* kick the hardlockup detector */ > - watchdog_interrupt_count(); > + watchdog_hardlockup_interrupt_count(); > > /* kick the softlockup detector */ > if (completion_done(this_cpu_ptr(&softlockup_completion))) { > diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c > index c3d8ceb149da..5f3651b87ee7 100644 > --- a/kernel/watchdog_perf.c > +++ b/kernel/watchdog_perf.c > @@ -20,13 +20,11 @@ > #include <asm/irq_regs.h> > #include <linux/perf_event.h> > > -static DEFINE_PER_CPU(bool, hard_watchdog_warn); > static DEFINE_PER_CPU(bool, watchdog_nmi_touch); > static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); > static DEFINE_PER_CPU(struct perf_event *, dead_event); > static struct cpumask dead_events_mask; > > -static unsigned long hardlockup_allcpu_dumped; > 
static atomic_t watchdog_cpus = ATOMIC_INIT(0); > > notrace void arch_touch_nmi_watchdog(void) > @@ -122,45 +120,7 @@ static void watchdog_overflow_callback(struct perf_event *event, > if (!watchdog_check_timestamp()) > return; > > - /* check for a hardlockup > - * This is done by making sure our timer interrupt > - * is incrementing. The timer interrupt should have > - * fired multiple times before we overflow'd. If it hasn't > - * then this is a good indication the cpu is stuck > - */ > - if (is_hardlockup()) { > - int this_cpu = smp_processor_id(); > - > - /* only print hardlockups once */ > - if (__this_cpu_read(hard_watchdog_warn) == true) > - return; > - > - pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", > - this_cpu); > - print_modules(); > - print_irqtrace_events(current); > - if (regs) > - show_regs(regs); > - else > - dump_stack(); > - > - /* > - * Perform all-CPU dump only once to avoid multiple hardlockups > - * generating interleaving traces > - */ > - if (sysctl_hardlockup_all_cpu_backtrace && > - !test_and_set_bit(0, &hardlockup_allcpu_dumped)) > - trigger_allbutself_cpu_backtrace(); > - > - if (hardlockup_panic) > - nmi_panic(regs, "Hard LOCKUP"); > - > - __this_cpu_write(hard_watchdog_warn, true); > - return; > - } > - > - __this_cpu_write(hard_watchdog_warn, false); > - return; > + watchdog_hardlockup_check(regs); > } > > static int hardlockup_detector_event_create(void) > -- > 2.40.1.521.gf1e218fcd8-goog