[RFC PATCH 1/1] sched/rseq: Consider rseq abort in page fault handler

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Consider rseq abort before emitting the SIGSEGV or SIGBUS signals from
the page fault handler.

This allows using membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ
to abort rseq critical sections which include memory accesses to
memory which mapping can be munmap'd or mprotect'd after the
membarrier "rseq fence" without causing SIGSEGV or SIGBUS when the page
fault handler triggered by a faulting memory access within a rseq
critical section is preempted before handling the page fault.

The problematic scenario is:

CPU 0                          CPU 1
------------------------------------------------------------------
                               old_p = P
                               P = NULL
- rseq c.s. begins
- x = P
- if (x != NULL)
  - v = *x
    - page fault
    - preempted
                               membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ)
                               munmap(old_p) (or mprotect(old_p))
    - handle page fault
    - force_sig_fault(SIGSEGV)
    - rseq resume notifier
      - move IP to abort IP
  -> SIGSEGV handler runs.

This is solved by postponing the force_sig_fault() to return to
user-space when the page fault handler detects that rseq events will
cause the thread to call the rseq resume notifier before going back to
user-space. This allows the rseq resume notifier to load the userspace
memory pointed by rseq->rseq_cs to compare the IP with the rseq c.s.
range before either moving the IP to the abort handler or calling
force_sig_fault() with the parameters previously saved by the page fault
handler.

Add a new AT_RSEQ_FEATURE_FLAGS getauxval(3) to allow user-space to
query whether the kernel implements this behavior (flag:
RSEQ_FEATURE_PAGE_FAULT_ABORT).

Untested implementation submitted for early feedback.

Only x86 is implemented in this PoC.

Link: https://lore.kernel.org/lkml/CACT4Y+bXfekygoyhO7pCctjnL15=E=Zs31BUGXU0dk8d4rc1Cw@xxxxxxxxxxxxxx/
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx>
Cc: Dmitry Vyukov <dvyukov@xxxxxxxxxx>
Cc: Peter Oskolkov <posk@xxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: "Paul E. McKenney" <paulmck@xxxxxxxxxx>
Cc: Boqun Feng <boqun.feng@xxxxxxxxx>
Cc: Chris Kennelly <ckennelly@xxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Andy Lutomirski <luto@xxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Borislav Petkov <bp@xxxxxxxxx>
Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
Cc: "H. Peter Anvin" <hpa@xxxxxxxxx>
Cc: linux-mm@xxxxxxxxx
---
 arch/x86/mm/fault.c          |  4 ++--
 fs/binfmt_elf.c              |  1 +
 include/linux/sched.h        | 16 ++++++++++++++++
 include/linux/sched/signal.h | 24 ++++++++++++++++++++++++
 include/uapi/linux/auxvec.h  |  1 +
 include/uapi/linux/rseq.h    |  7 +++++++
 kernel/rseq.c                | 36 +++++++++++++++++++++++++++++++-----
 7 files changed, 82 insertions(+), 7 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 679b09cfe241..42ac39680cb6 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -854,7 +854,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 	if (si_code == SEGV_PKUERR)
 		force_sig_pkuerr((void __user *)address, pkey);
 	else
-		force_sig_fault(SIGSEGV, si_code, (void __user *)address);
+		rseq_lazy_force_sig_fault(SIGSEGV, si_code, (void __user *)address);
 
 	local_irq_disable();
 }
@@ -973,7 +973,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 		return;
 	}
 #endif
-	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
+	rseq_lazy_force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
 }
 
 static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 5397b552fbeb..8fece0911c7d 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -273,6 +273,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
 #ifdef CONFIG_RSEQ
 	NEW_AUX_ENT(AT_RSEQ_FEATURE_SIZE, offsetof(struct rseq, end));
 	NEW_AUX_ENT(AT_RSEQ_ALIGN, __alignof__(struct rseq));
+	NEW_AUX_ENT(AT_RSEQ_FEATURE_FLAGS, RSEQ_FEATURE_FLAGS);
 #endif
 #undef NEW_AUX_ENT
 	/* AT_NULL is zero; clear the rest too */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 292c31697248..39aa585ba2a3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -743,6 +743,15 @@ struct kmap_ctrl {
 #endif
 };
 
+#ifdef CONFIG_RSEQ
+struct rseq_lazy_sig {
+	bool pending;
+	int sig;
+	int code;
+	void __user *addr;
+};
+#endif
+
 struct task_struct {
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 	/*
@@ -1317,6 +1326,7 @@ struct task_struct {
 	 * with respect to preemption.
 	 */
 	unsigned long rseq_event_mask;
+	struct rseq_lazy_sig rseq_lazy_sig;
 #endif
 
 #ifdef CONFIG_SCHED_MM_CID
@@ -2330,6 +2340,8 @@ unsigned long sched_cpu_util(int cpu);
 
 #ifdef CONFIG_RSEQ
 
+#define RSEQ_FEATURE_FLAGS	RSEQ_FEATURE_PAGE_FAULT_ABORT
+
 /*
  * Map the event mask on the user-space ABI enum rseq_cs_flags
  * for direct mask checks.
@@ -2390,6 +2402,8 @@ static inline void rseq_migrate(struct task_struct *t)
  */
 static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
 {
+	WARN_ON_ONCE(current->rseq_lazy_sig.pending);
+
 	if (clone_flags & CLONE_VM) {
 		t->rseq = NULL;
 		t->rseq_len = 0;
@@ -2405,6 +2419,8 @@ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
 
 static inline void rseq_execve(struct task_struct *t)
 {
+	WARN_ON_ONCE(current->rseq_lazy_sig.pending);
+
 	t->rseq = NULL;
 	t->rseq_len = 0;
 	t->rseq_sig = 0;
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 3499c1a8b929..0d75dfde2f9b 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -781,4 +781,28 @@ static inline unsigned long rlimit_max(unsigned int limit)
 	return task_rlimit_max(current, limit);
 }
 
+#ifdef CONFIG_RSEQ
+
+static inline int rseq_lazy_force_sig_fault(int sig, int code, void __user *addr)
+{
+	struct task_struct *t = current;
+
+	if (!t->rseq_event_mask)
+		return force_sig_fault(sig, code, addr);
+	t->rseq_lazy_sig.pending = true;
+	t->rseq_lazy_sig.sig = sig;
+	t->rseq_lazy_sig.code = code;
+	t->rseq_lazy_sig.addr = addr;
+	return 0;
+}
+
+#else
+
+static inline int rseq_lazy_force_sig_fault(int sig, int code, void __user *addr)
+{
+	return force_sig_fault(sig, code, addr);
+}
+
+#endif
+
 #endif /* _LINUX_SCHED_SIGNAL_H */
diff --git a/include/uapi/linux/auxvec.h b/include/uapi/linux/auxvec.h
index 6991c4b8ab18..5044f367a219 100644
--- a/include/uapi/linux/auxvec.h
+++ b/include/uapi/linux/auxvec.h
@@ -32,6 +32,7 @@
 #define AT_HWCAP2 26	/* extension of AT_HWCAP */
 #define AT_RSEQ_FEATURE_SIZE	27	/* rseq supported feature size */
 #define AT_RSEQ_ALIGN		28	/* rseq allocation alignment */
+#define AT_RSEQ_FEATURE_FLAGS	29	/* rseq feature flags */
 
 #define AT_EXECFN  31	/* filename of program */
 
diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
index c233aae5eac9..0fdb192e3cd3 100644
--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -37,6 +37,13 @@ enum rseq_cs_flags {
 		(1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT),
 };
 
+/*
+ * rseq feature flags. Query with getauxval(AT_RSEQ_FEATURE_FLAGS).
+ */
+enum rseq_feature_flags {
+	RSEQ_FEATURE_PAGE_FAULT_ABORT = (1U << 0),
+};
+
 /*
  * struct rseq_cs is aligned on 4 * 8 bytes to ensure it is always
  * contained within a single cache-line. It is usually declared as
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 9de6e35fe679..f686a97abb45 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -271,6 +271,25 @@ static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs)
 	return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset;
 }
 
+static void rseq_clear_lazy_sig_fault(struct task_struct *t)
+{
+	if (!t->rseq_lazy_sig.pending)
+		return;
+	t->rseq_lazy_sig.pending = false;
+	t->rseq_lazy_sig.sig = 0;
+	t->rseq_lazy_sig.code = 0;
+	t->rseq_lazy_sig.addr = NULL;
+}
+
+static void rseq_force_lazy_sig_fault(struct task_struct *t)
+{
+	if (!t->rseq_lazy_sig.pending)
+		return;
+	force_sig_fault(t->rseq_lazy_sig.sig, t->rseq_lazy_sig.code,
+			t->rseq_lazy_sig.addr);
+	rseq_clear_lazy_sig_fault(t);
+}
+
 static int rseq_ip_fixup(struct pt_regs *regs)
 {
 	unsigned long ip = instruction_pointer(regs);
@@ -280,25 +299,32 @@ static int rseq_ip_fixup(struct pt_regs *regs)
 
 	ret = rseq_get_rseq_cs(t, &rseq_cs);
 	if (ret)
-		return ret;
+		goto nofixup;
 
 	/*
 	 * Handle potentially not being within a critical section.
 	 * If not nested over a rseq critical section, restart is useless.
 	 * Clear the rseq_cs pointer and return.
 	 */
-	if (!in_rseq_cs(ip, &rseq_cs))
-		return clear_rseq_cs(t);
+	if (!in_rseq_cs(ip, &rseq_cs)) {
+		ret = clear_rseq_cs(t);
+		goto nofixup;
+	}
 	ret = rseq_need_restart(t, rseq_cs.flags);
 	if (ret <= 0)
-		return ret;
+		goto nofixup;
 	ret = clear_rseq_cs(t);
 	if (ret)
-		return ret;
+		goto nofixup;
+	rseq_clear_lazy_sig_fault(t);
 	trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
 			    rseq_cs.abort_ip);
 	instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip);
 	return 0;
+
+nofixup:
+	rseq_force_lazy_sig_fault(t);
+	return ret;
 }
 
 /*
-- 
2.39.2





[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux