[tip:ras/urgent] x86/mce: Make the MCE notifier a blocking one

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Commit-ID:  0dc9c639e6553e39c13b2c0d54c8a1b098cb95e2
Gitweb:     http://git.kernel.org/tip/0dc9c639e6553e39c13b2c0d54c8a1b098cb95e2
Author:     Vishal Verma <vishal.l.verma@xxxxxxxxx>
AuthorDate: Tue, 18 Apr 2017 20:42:35 +0200
Committer:  Thomas Gleixner <tglx@xxxxxxxxxxxxx>
CommitDate: Tue, 18 Apr 2017 22:23:48 +0200

x86/mce: Make the MCE notifier a blocking one

The NFIT MCE handler callback (for handling media errors on NVDIMMs)
takes a mutex to add the location of a memory error to a list. But since
the notifier call chain for machine checks (x86_mce_decoder_chain) is
atomic, we get a lockdep splat like:

  BUG: sleeping function called from invalid context at kernel/locking/mutex.c:620
  in_atomic(): 1, irqs_disabled(): 0, pid: 4, name: kworker/0:0
  [..]
  Call Trace:
   dump_stack
   ___might_sleep
   __might_sleep
   mutex_lock_nested
   ? __lock_acquire
   nfit_handle_mce
   notifier_call_chain
   atomic_notifier_call_chain
   ? atomic_notifier_call_chain
   mce_gen_pool_process

Convert the notifier to a blocking one which gets to run only in process
context.

Boris: remove the notifier call in atomic context in print_mce(). For
now, let's print the MCE on the atomic path so that we can make sure
they go out and get logged at least.

Fixes: 6839a6d96f4e ("nfit: do an ARS scrub on hitting a latent media error")
Reported-by: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx>
Signed-off-by: Vishal Verma <vishal.l.verma@xxxxxxxxx>
Acked-by: Tony Luck <tony.luck@xxxxxxxxx>
Cc: Dan Williams <dan.j.williams@xxxxxxxxx>
Cc: linux-edac <linux-edac@xxxxxxxxxxxxxxx>
Cc: x86-ml <x86@xxxxxxxxxx>
Cc: <stable@xxxxxxxxxxxxxxx>
Link: http://lkml.kernel.org/r/20170411224457.24777-1-vishal.l.verma@xxxxxxxxx
Signed-off-by: Borislav Petkov <bp@xxxxxxx>
Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
---
 arch/x86/kernel/cpu/mcheck/mce-genpool.c  |  2 +-
 arch/x86/kernel/cpu/mcheck/mce-internal.h |  2 +-
 arch/x86/kernel/cpu/mcheck/mce.c          | 17 +++--------------
 3 files changed, 5 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c
index 1e5a50c..217cd44 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-genpool.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c
@@ -85,7 +85,7 @@ void mce_gen_pool_process(struct work_struct *__unused)
 	head = llist_reverse_order(head);
 	llist_for_each_entry_safe(node, tmp, head, llnode) {
 		mce = &node->mce;
-		atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
+		blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
 		gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node));
 	}
 }
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 903043e..19592ba 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -13,7 +13,7 @@ enum severity_level {
 	MCE_PANIC_SEVERITY,
 };
 
-extern struct atomic_notifier_head x86_mce_decoder_chain;
+extern struct blocking_notifier_head x86_mce_decoder_chain;
 
 #define ATTR_LEN		16
 #define INITIAL_CHECK_INTERVAL	5 * 60 /* 5 minutes */
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5accfbd..af44ebe 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -123,7 +123,7 @@ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
  * CPU/chipset specific EDAC code can register a notifier call here to print
  * MCE errors in a human-readable form.
  */
-ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
+BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
 
 /* Do initial initialization of a struct mce */
 void mce_setup(struct mce *m)
@@ -220,7 +220,7 @@ void mce_register_decode_chain(struct notifier_block *nb)
 
 	WARN_ON(nb->priority > MCE_PRIO_LOWEST && nb->priority < MCE_PRIO_EDAC);
 
-	atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
+	blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
 }
 EXPORT_SYMBOL_GPL(mce_register_decode_chain);
 
@@ -228,7 +228,7 @@ void mce_unregister_decode_chain(struct notifier_block *nb)
 {
 	atomic_dec(&num_notifiers);
 
-	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
+	blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
 }
 EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
 
@@ -321,18 +321,7 @@ static void __print_mce(struct mce *m)
 
 static void print_mce(struct mce *m)
 {
-	int ret = 0;
-
 	__print_mce(m);
-
-	/*
-	 * Print out human-readable details about the MCE error,
-	 * (if the CPU has an implementation for that)
-	 */
-	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
-	if (ret == NOTIFY_STOP)
-		return;
-
 	pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
 }
 



[Index of Archives]     [Linux Kernel]     [Kernel Development Newbies]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite Hiking]     [Linux Kernel]     [Linux SCSI]