nmi_shootdown_cpus(), a subroutine of crash_kexec(), sends NMI IPI to non-panic cpus to stop them while saving their register information and doing some cleanups for crash dumping. So if a non-panic cpus is infinitely looping in NMI context, we fail to save its register information and lose the information from the crash dump. `Infinite loop in NMI context' can happen: a. when a cpu panics on NMI while another cpu is processing panic b. when a cpu received an external or unknown NMI while another cpu is processing panic on NMI In the case of a, it loops in panic_smp_self_stop(). In the case of b, it loops in raw_spin_lock() of nmi_reason_lock. This can happen on some servers which broadcasts NMIs to all CPUs when a dump button is pushed. To save registers in these case too, this patch does following things: 1. Move the timing of `infinite loop in NMI context' (actually done by panic_smp_self_stop()) outside of panic() to enable us to refer pt_regs 2. call a callback of nmi_shootdown_cpus() directly to save registers and do some cleanups after setting waiting_for_crash_ipi which is used for counting down the number of cpus which handled the callback V5: - Use WRITE_ONCE() when setting crash_ipi_done to 1 so that the compiler doesn't change the instruction order - Support the case of b in the above description - Add poll_crash_ipi_and_callback() V4: - Rewrite the patch description V3: - Newly introduced Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@xxxxxxxxxxx> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx> Cc: Ingo Molnar <mingo@xxxxxxxxxx> Cc: "H. Peter Anvin" <hpa@xxxxxxxxx> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx> Cc: Eric Biederman <ebiederm@xxxxxxxxxxxx> Cc: Vivek Goyal <vgoyal@xxxxxxxxxx> Cc: Michal Hocko <mhocko@xxxxxxxxxx> --- arch/x86/include/asm/reboot.h | 1 + arch/x86/kernel/nmi.c | 17 +++++++++++++---- arch/x86/kernel/reboot.c | 28 ++++++++++++++++++++++++++++ include/linux/kernel.h | 12 ++++++++++-- kernel/panic.c | 10 ++++++++++ kernel/watchdog.c | 2 +- 6 files changed, 63 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index a82c4f1..964e82f 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -25,5 +25,6 @@ void __noreturn machine_real_restart(unsigned int type); typedef void (*nmi_shootdown_cb)(int, struct pt_regs*); void nmi_shootdown_cpus(nmi_shootdown_cb callback); +void poll_crash_ipi_and_callback(struct pt_regs *regs); #endif /* _ASM_X86_REBOOT_H */ diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 5131714..74a1434 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -29,6 +29,7 @@ #include <asm/mach_traps.h> #include <asm/nmi.h> #include <asm/x86_init.h> +#include <asm/reboot.h> #define CREATE_TRACE_POINTS #include <trace/events/nmi.h> @@ -231,7 +232,7 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs) #endif if (panic_on_unrecovered_nmi) - nmi_panic("NMI: Not continuing"); + nmi_panic(regs, "NMI: Not continuing"); pr_emerg("Dazed and confused, but trying to continue\n"); @@ -256,7 +257,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs) show_regs(regs); if (panic_on_io_nmi) { - nmi_panic("NMI IOCK error: Not continuing"); + nmi_panic(regs, "NMI IOCK error: Not continuing"); /* * If we return from nmi_panic(), it means we have received @@ -305,7 +306,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) pr_emerg("Do you have a strange power saving mode enabled?\n"); if (unknown_nmi_panic || panic_on_unrecovered_nmi) - nmi_panic("NMI: Not continuing"); + nmi_panic(regs, "NMI: Not continuing"); pr_emerg("Dazed and confused, but trying to continue\n"); } @@ -357,7 +358,15 @@ static void default_do_nmi(struct pt_regs *regs) } /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */ - raw_spin_lock(&nmi_reason_lock); + + /* + * Another CPU may be processing panic routines with holding + * nmi_reason_lock. Check IPI issuance from the panicking CPU + * and call the callback directly. + */ + while (!raw_spin_trylock(&nmi_reason_lock)) + poll_crash_ipi_and_callback(regs); + reason = x86_platform.get_nmi_reason(); if (reason & NMI_REASON_MASK) { diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 02693dd..44c5f5b 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -718,6 +718,7 @@ static int crashing_cpu; static nmi_shootdown_cb shootdown_callback; static atomic_t waiting_for_crash_ipi; +static int crash_ipi_done; static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) { @@ -780,6 +781,9 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) smp_send_nmi_allbutself(); + /* Kick cpus looping in nmi context. */ + WRITE_ONCE(crash_ipi_done, 1); + msecs = 1000; /* Wait at most a second for the other cpus to stop */ while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { mdelay(1); @@ -788,9 +792,33 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) /* Leave the nmi callback set */ } + +/* + * Wait for the timing of IPI for crash dumping, and then call its callback + * directly. This function is used when we have already been in NMI handler. + */ +void poll_crash_ipi_and_callback(struct pt_regs *regs) +{ + if (crash_ipi_done) + crash_nmi_callback(0, regs); /* Shouldn't return */ +} + +/* Override the weak function in kernel/panic.c */ +void nmi_panic_self_stop(struct pt_regs *regs) +{ + while (1) { + poll_crash_ipi_and_callback(regs); + cpu_relax(); + } +} + #else /* !CONFIG_SMP */ void nmi_shootdown_cpus(nmi_shootdown_cb callback) { /* No other CPUs to shoot down */ } + +void poll_crash_ipi_and_callback(struct pt_regs *regs) +{ +} #endif diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 480a4fd..728a31b 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -255,6 +255,7 @@ extern long (*panic_blink)(int state); __printf(1, 2) void panic(const char *fmt, ...) __noreturn __cold; +void nmi_panic_self_stop(struct pt_regs *); extern void oops_enter(void); extern void oops_exit(void); void print_oops_end_marker(void); @@ -450,12 +451,19 @@ extern atomic_t panic_cpu; /* * A variant of panic() called from NMI context. * If we've already panicked on this cpu, return from here. + * If another cpu already panicked, loop in nmi_panic_self_stop() which + * can provide architecture dependent code such as saving register states + * for crash dump. */ -#define nmi_panic(fmt, ...) \ +#define nmi_panic(regs, fmt, ...) \ do { \ + int old_cpu; \ int this_cpu = raw_smp_processor_id(); \ - if (atomic_cmpxchg(&panic_cpu, -1, this_cpu) != this_cpu) \ + old_cpu = atomic_cmpxchg(&panic_cpu, -1, this_cpu); \ + if (old_cpu == -1) \ panic(fmt, ##__VA_ARGS__); \ + else if (old_cpu != this_cpu) \ + nmi_panic_self_stop(regs); \ } while (0) /* diff --git a/kernel/panic.c b/kernel/panic.c index 24ee2ea..4fce2be 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -61,6 +61,16 @@ void __weak panic_smp_self_stop(void) cpu_relax(); } +/* + * Stop ourself in NMI context if another cpu has already panicked. + * Architecture code may override this to prepare for crash dumping + * (e.g. save register information). + */ +void __weak nmi_panic_self_stop(struct pt_regs *regs) +{ + panic_smp_self_stop(); +} + atomic_t panic_cpu = ATOMIC_INIT(-1); /** diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b9be18f..84b5035 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -351,7 +351,7 @@ static void watchdog_overflow_callback(struct perf_event *event, trigger_allbutself_cpu_backtrace(); if (hardlockup_panic) - nmi_panic("Hard LOCKUP"); + nmi_panic(regs, "Hard LOCKUP"); __this_cpu_write(hard_watchdog_warn, true); return; -- To unsubscribe from this list: send the line "unsubscribe linux-doc" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html