[PATCH 1/2] x86/mce: Avoid infinite loop for copy from user recovery

Tony Luck <tony.luck@xxxxxxxxx> · Fri, 8 Jan 2021 14:22:50 -0800

Recovery action when get_user() triggers a machine check uses the fixup
path to make get_user() return -EFAULT.  Also queue_task_work() sets up
so that kill_me_maybe() will be called on return to user mode to send a
SIGBUS to the current process.

But there are places in the kernel where the code assumes that this
EFAULT return was simply because of a page fault. The code takes some
action to fix that, and then retries the access. This results in a second
machine check.

While processing this second machine check queue_task_work() is called
again. But since this uses the same callback_head structure that
was used in the first call, the net result is an entry on the
current->task_works list that points to itself. When task_work_run()
is called it loops forever in this code:

		do {
			next = work->next;
			work->func(work);
			work = next;
			cond_resched();
		} while (work);

Add a "mce_busy" flag bit to detect this situation and panic
when it happens.

Signed-off-by: Tony Luck <tony.luck@xxxxxxxxx>
---
 arch/x86/kernel/cpu/mce/core.c | 7 ++++++-
 include/linux/sched.h          | 3 ++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 13d3f1cbda17..1bf11213e093 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1246,6 +1246,7 @@ static void kill_me_maybe(struct callback_head *cb)
 	struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
 	int flags = MF_ACTION_REQUIRED;
 
+	p->mce_busy = 0;
 	pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
 
 	if (!p->mce_ripv)
@@ -1268,6 +1269,7 @@ static void kill_me_maybe(struct callback_head *cb)
 
 static void queue_task_work(struct mce *m, int kill_current_task)
 {
+	current->mce_busy = 1;
 	current->mce_addr = m->addr;
 	current->mce_kflags = m->kflags;
 	current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
@@ -1431,8 +1433,11 @@ noinstr void do_machine_check(struct pt_regs *regs)
 				mce_panic("Failed kernel mode recovery", &m, msg);
 		}
 
-		if (m.kflags & MCE_IN_KERNEL_COPYIN)
+		if (m.kflags & MCE_IN_KERNEL_COPYIN) {
+			if (current->mce_busy)
+				mce_panic("Multiple copyin", &m, msg);
 			queue_task_work(&m, kill_current_task);
+		}
 	}
 out:
 	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6e3a5eeec509..a763a76eac57 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1360,7 +1360,8 @@ struct task_struct {
 	u64				mce_addr;
 	__u64				mce_ripv : 1,
 					mce_whole_page : 1,
-					__mce_reserved : 62;
+					mce_busy : 1,
+					__mce_reserved : 61;
 	struct callback_head		mce_kill_me;
 #endif
 
-- 
2.21.1