Fernando Luis Vázquez Cao <fernando at oss.ntt.co.jp> writes: > Subject: [PATCH] boot: ignore early NMIs > > From: Fernando Luis Vazquez Cao <fernando at oss.ntt.co.jp> > > NMIs very early in the boot process are rarely critical (usually > it just means that there was a spurious bit flip somewhere in the > hardware, or that this is a kdump kernel and we received an NMI > generated in the previous context), so the current behavior of > halting the system when one occurs is probably a bit over the top. > > This patch changes the early IDT so that NMIs are ignored and the > kernel can, hopefully, continue executing other code. Harsher > measures (panic, etc) are deferred to the final NMI handler, which > can actually make an informed decision. > > This issue presented itself in our environment as seemingly > random hangs in kdump. > > Signed-off-by: Fernando Luis Vazquez Cao <fernando at oss.ntt.co.jp> > --- > > diff -urNp linux-3.3-rc6-orig/arch/x86/kernel/head64.c linux-3.3-rc6/arch/x86/kernel/head64.c > --- linux-3.3-rc6-orig/arch/x86/kernel/head64.c 2012-03-07 15:49:01.834241787 +0900 > +++ linux-3.3-rc6/arch/x86/kernel/head64.c 2012-03-07 18:39:03.173732875 +0900 > @@ -71,7 +71,7 @@ void __init x86_64_start_kernel(char * r > (__START_KERNEL & PGDIR_MASK))); > BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); > > - /* clear bss before set_intr_gate with early_idt_handler */ > + /* clear bss before set_intr_gate with early_idt_handlers */ > clear_bss(); > > /* Make NULL pointers segfault */ > @@ -79,13 +79,8 @@ void __init x86_64_start_kernel(char * r > > max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; > > - for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { > -#ifdef CONFIG_EARLY_PRINTK > + for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) > set_intr_gate(i, &early_idt_handlers[i]); > -#else > - set_intr_gate(i, early_idt_handler); > -#endif > - } > load_idt((const struct desc_ptr *)&idt_descr); > > if (console_loglevel == 10) > diff -urNp 
linux-3.3-rc6-orig/arch/x86/kernel/head_64.S linux-3.3-rc6/arch/x86/kernel/head_64.S > --- linux-3.3-rc6-orig/arch/x86/kernel/head_64.S 2012-03-07 15:49:01.838241839 +0900 > +++ linux-3.3-rc6/arch/x86/kernel/head_64.S 2012-03-07 18:41:21.811516876 +0900 > @@ -270,18 +270,29 @@ bad_address: > jmp bad_address > > .section ".init.text","ax" > -#ifdef CONFIG_EARLY_PRINTK > .globl early_idt_handlers > early_idt_handlers: > - i = 0 > + vector = 0 > .rept NUM_EXCEPTION_VECTORS > - movl $i, %esi > - jmp early_idt_handler > - i = i + 1 > + /* > + * NMIs (vector 2) this early in the boot process are rarely critical > + * (usually it just means that there was a spurious bit flip somewhere > + * in the hardware, or that this is a kdump kernel and we received an > + * NMI generated in the previous context), so we ignore them here and > + * try to continue (see early_nmi_handler implementation below). > + * Harsher measures (panic, etc) are defered to the final NMI handler, > + * which can actually make an informed decision. > + */ > + .if vector == 2 > + jmp early_nmi_handler Is just a jump, and not a move followed by a jump, still 10 bytes? I hate to say it, but I think this fails miserably for any exception after an NMI: the C code indexes early_idt_handlers assuming every stub is exactly 10 bytes, so a shorter vector-2 stub shifts every later entry. I expect the simplest solution is to modify early_idt_handler to test for vector == 2. Doing something less brittle than: > extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][10]; in segment.h might be a good idea as well. Eric > + .else > + movl $vector, %esi > + jmp early_exception_handler > + .endif > + vector = vector + 1 > .endr > -#endif > > -ENTRY(early_idt_handler) > +early_exception_handler: > #ifdef CONFIG_EARLY_PRINTK > cmpl $2,early_recursion_flag(%rip) > jz 1f > @@ -315,6 +326,9 @@ ENTRY(early_idt_handler) > 1: hlt > jmp 1b > > +early_nmi_handler: > + iretq > + > #ifdef CONFIG_EARLY_PRINTK > early_recursion_flag: > .long 0