[RFC][PATCH] Add a sysctl option controlling kexec when MCE occurred

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi,

[Purpose]
Kexec may trigger additional hardware errors and multiply the damage 
if it runs after an MCE has occurred, because kexec performs 
hardware-related operations such as the following:
  - Sending NMIs to the other CPUs.
  - Initializing hardware during the boot process of the second kernel.
  - Accessing memory and dumping it to disk.

So, I propose adding a new option that controls kexec behaviour after an 
MCE has occurred.
With this patch, kexec can be disabled after an MCE, which prevents 
unnecessary hardware errors and avoids expanding the damage.

[Patch Description]
I added a sysctl option, kernel.kexec_on_mce, which controls kexec 
behaviour after an MCE has occurred.

 - Permission
   - 0644
 - Value (default is "1")
   - non-zero: Kexec is enabled regardless of whether an MCE has occurred.
   - 0: Kexec is disabled once an MCE has occurred.

Matrix of kernel.kexec_on_mce value, MCE and kexec behaviour

---------------------------------------------------
kernel.kexec_on_mce | MCE          | kexec behaviour
---------------------------------------------------
non-zero            | occurred     | enabled
                    |------------------------------
                    | not occurred | enabled
---------------------------------------------------
0                   | occurred     | disabled
                    |------------------------------
                    | not occurred | enabled
---------------------------------------------------

Any comments and suggestions are welcome.

Signed-off-by: Seiji Aguchi <seiji.aguchi@xxxxxxx>

---
 Documentation/sysctl/kernel.txt  |   12 ++++++++++++
 arch/x86/include/asm/mce.h       |    2 ++
 arch/x86/kernel/cpu/mcheck/mce.c |    4 ++++
 include/linux/sysctl.h           |    1 +
 kernel/kexec.c                   |    7 +++++++
 kernel/sysctl.c                  |   12 ++++++++++++
 kernel/sysctl_binary.c           |    1 +
 mm/memory-failure.c              |    9 +++++++++
 8 files changed, 48 insertions(+), 0 deletions(-)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 209e158..ce3240e 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -34,6 +34,7 @@ show up in /proc/sys/kernel:
 - hotplug
 - java-appletviewer           [ binfmt_java, obsolete ]
 - java-interpreter            [ binfmt_java, obsolete ]
+- kexec_on_mce                [ X86 only ]
 - kstack_depth_to_print       [ X86 only ]
 - l2cr                        [ PPC only ]
 - modprobe                    ==> Documentation/debugging-modules.txt
@@ -261,6 +262,17 @@ This flag controls the L2 cache of G3 processor boards. If
 
 ==============================================================
 
+kexec_on_mce: (X86 only)
+
+Controls whether kexec is allowed after an MCE has occurred.
+Default value is 1.
+
+0: Kexec is disabled once an MCE has occurred.
+non-zero: Kexec is enabled regardless of whether an MCE has occurred.
+
+
+==============================================================
+
 kstack_depth_to_print: (X86 only)
 
 Controls the number of words to print when dumping the raw
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index c62c13c..062dabd 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -123,6 +123,8 @@ extern struct atomic_notifier_head x86_mce_decoder_chain;
 
 extern int mce_disabled;
 extern int mce_p5_enabled;
+extern int kexec_on_mce;
+extern int mce_flag;
 
 #ifdef CONFIG_X86_MCE
 int mcheck_init(void);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 7a35b72..edbaf77 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -85,6 +85,8 @@ static int			mce_dont_log_ce		__read_mostly;
 int				mce_cmci_disabled	__read_mostly;
 int				mce_ignore_ce		__read_mostly;
 int				mce_ser			__read_mostly;
+int				kexec_on_mce = 1;
+int				mce_flag;
 
 struct mce_bank                *mce_banks		__read_mostly;
 
@@ -944,6 +946,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 
 	percpu_inc(mce_exception_count);
 
+	mce_flag = 1;
+
 	if (notify_die(DIE_NMI, "machine check", regs, error_code,
 			   18, SIGKILL) == NOTIFY_STOP)
 		goto out;
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 7bb5cb6..0ebe708 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -153,6 +153,7 @@ enum
 	KERN_MAX_LOCK_DEPTH=74, /* int: rtmutex's maximum lock depth */
 	KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */
 	KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */
+	KERN_KEXEC_ON_MCE=77, /* int: whether we will dump memory on mce */
 };
 
 
diff --git a/kernel/kexec.c b/kernel/kexec.c
index b55045b..3e5c41a 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -39,6 +39,7 @@
 #include <asm/io.h>
 #include <asm/system.h>
 #include <asm/sections.h>
+#include <asm/mce.h>
 
 /* Per cpu memory for storing cpu states in case of system crash. */
 note_buf_t __percpu *crash_notes;
@@ -1074,6 +1075,12 @@ void crash_kexec(struct pt_regs *regs)
 	 * of memory the xchg(&kexec_crash_image) would be
 	 * sufficient.  But since I reuse the memory...
 	 */
+#ifdef CONFIG_X86_MCE
+	if (!kexec_on_mce && mce_flag) {
+		printk(KERN_WARNING "Kexec is disabled because MCE occurred\n");
+		return;
+	}
+#endif
 	if (mutex_trylock(&kexec_mutex)) {
 		if (kexec_crash_image) {
 			struct pt_regs fixed_regs;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5abfa15..3a64cd6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -81,6 +81,9 @@
 #include <linux/nmi.h>
 #endif
 
+#ifdef CONFIG_X86_MCE
+#include <asm/mce.h>
+#endif
 
 #if defined(CONFIG_SYSCTL)
 
@@ -963,6 +966,15 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
+#if defined(CONFIG_X86_MCE)
+	{
+		.procname	= "kexec_on_mce",
+		.data		= &kexec_on_mce,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 1357c57..a25f971 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -138,6 +138,7 @@ static const struct bin_table bin_kern_table[] = {
 	{ CTL_INT,	KERN_MAX_LOCK_DEPTH,		"max_lock_depth" },
 	{ CTL_INT,	KERN_NMI_WATCHDOG,		"nmi_watchdog" },
 	{ CTL_INT,	KERN_PANIC_ON_NMI,		"panic_on_unrecovered_nmi" },
+	{ CTL_INT,	KERN_KEXEC_ON_MCE,		"kexec_on_mce" },
 	{}
 };
 
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 46ab2c0..3ec075a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -52,6 +52,11 @@
 #include <linux/swapops.h>
 #include <linux/hugetlb.h>
 #include <linux/memory_hotplug.h>
+
+#ifdef CONFIG_X86_MCE
+#include <asm/mce.h>
+#endif
+
 #include "internal.h"
 
 int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -949,6 +954,10 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	int res;
 	unsigned int nr_pages;
 
+#ifdef CONFIG_X86_MCE
+	mce_flag = 1;
+#endif
+
 	if (!sysctl_memory_failure_recovery)
 		panic("Memory failure from trap %d on page %lx", trapno, pfn);
 
-- 
1.7.1

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxxx  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom policy in Canada: sign http://dissolvethecrtc.ca/
Don't email: <a href


[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]