Consider rseq abort before emitting the SIGSEGV or SIGBUS signals from the page fault handler. This allows using membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ to abort rseq critical sections which include memory accesses to memory which mapping can be munmap'd or mprotect'd after the membarrier "rseq fence" without causing SIGSEGV or SIGBUS when the page fault handler triggered by a faulting memory access within a rseq critical section is preempted before handling the page fault. The problematic scenario is: CPU 0 CPU 1 ------------------------------------------------------------------ old_p = P P = NULL - rseq c.s. begins - x = P - if (x != NULL) - v = *x - page fault - preempted membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ) munmap(old_p) (or mprotect(old_p)) - handle page fault - force_sig_fault(SIGSEGV) - rseq resume notifier - move IP to abort IP -> SIGSEGV handler runs. This is solved by postponing the force_sig_fault() to return to user-space when the page fault handler detects that rseq events will cause the thread to call the rseq resume notifier before going back to user-space. This allows the rseq resume notifier to load the userspace memory pointed by rseq->rseq_cs to compare the IP with the rseq c.s. range before either moving the IP to the abort handler or calling force_sig_fault() with the parameters previously saved by the page fault handler. Add a new AT_RSEQ_FEATURE_FLAGS getauxval(3) to allow user-space to query whether the kernel implements this behavior (flag: RSEQ_FEATURE_PAGE_FAULT_ABORT). Untested implementation submitted for early feedback. Only x86 is implemented in this PoC. Link: https://lore.kernel.org/lkml/CACT4Y+bXfekygoyhO7pCctjnL15=E=Zs31BUGXU0dk8d4rc1Cw@xxxxxxxxxxxxxx/ Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx> Cc: Dmitry Vyukov <dvyukov@xxxxxxxxxx> Cc: Peter Oskolkov <posk@xxxxxxxxxx> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx> Cc: "Paul E. McKenney" <paulmck@xxxxxxxxxx> Cc: Boqun Feng <boqun.feng@xxxxxxxxx> Cc: Chris Kennelly <ckennelly@xxxxxxxxxx> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> Cc: Andy Lutomirski <luto@xxxxxxxxxx> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx> Cc: Ingo Molnar <mingo@xxxxxxxxxx> Cc: Borislav Petkov <bp@xxxxxxxxx> Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx> Cc: "H. Peter Anvin" <hpa@xxxxxxxxx> Cc: linux-mm@xxxxxxxxx --- arch/x86/mm/fault.c | 4 ++-- fs/binfmt_elf.c | 1 + include/linux/sched.h | 16 ++++++++++++++++ include/linux/sched/signal.h | 24 ++++++++++++++++++++++++ include/uapi/linux/auxvec.h | 1 + include/uapi/linux/rseq.h | 7 +++++++ kernel/rseq.c | 36 +++++++++++++++++++++++++++++++----- 7 files changed, 82 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 679b09cfe241..42ac39680cb6 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -854,7 +854,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, if (si_code == SEGV_PKUERR) force_sig_pkuerr((void __user *)address, pkey); else - force_sig_fault(SIGSEGV, si_code, (void __user *)address); + rseq_lazy_force_sig_fault(SIGSEGV, si_code, (void __user *)address); local_irq_disable(); } @@ -973,7 +973,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, return; } #endif - force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); + rseq_lazy_force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); } static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 5397b552fbeb..8fece0911c7d 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -273,6 +273,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, #ifdef CONFIG_RSEQ NEW_AUX_ENT(AT_RSEQ_FEATURE_SIZE, offsetof(struct rseq, end)); NEW_AUX_ENT(AT_RSEQ_ALIGN, __alignof__(struct rseq)); + NEW_AUX_ENT(AT_RSEQ_FEATURE_FLAGS, RSEQ_FEATURE_FLAGS); #endif #undef NEW_AUX_ENT /* AT_NULL is zero; clear the rest too */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 292c31697248..39aa585ba2a3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -743,6 +743,15 @@ struct kmap_ctrl { #endif }; +#ifdef CONFIG_RSEQ +struct rseq_lazy_sig { + bool pending; + int sig; + int code; + void __user *addr; +}; +#endif + struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* @@ -1317,6 +1326,7 @@ struct task_struct { * with respect to preemption. */ unsigned long rseq_event_mask; + struct rseq_lazy_sig rseq_lazy_sig; #endif #ifdef CONFIG_SCHED_MM_CID @@ -2330,6 +2340,8 @@ unsigned long sched_cpu_util(int cpu); #ifdef CONFIG_RSEQ +#define RSEQ_FEATURE_FLAGS RSEQ_FEATURE_PAGE_FAULT_ABORT + /* * Map the event mask on the user-space ABI enum rseq_cs_flags * for direct mask checks. @@ -2390,6 +2402,8 @@ static inline void rseq_migrate(struct task_struct *t) */ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { + WARN_ON_ONCE(current->rseq_lazy_sig.pending); + if (clone_flags & CLONE_VM) { t->rseq = NULL; t->rseq_len = 0; @@ -2405,6 +2419,8 @@ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) static inline void rseq_execve(struct task_struct *t) { + WARN_ON_ONCE(current->rseq_lazy_sig.pending); + t->rseq = NULL; t->rseq_len = 0; t->rseq_sig = 0; diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 3499c1a8b929..0d75dfde2f9b 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -781,4 +781,28 @@ static inline unsigned long rlimit_max(unsigned int limit) return task_rlimit_max(current, limit); } +#ifdef CONFIG_RSEQ + +static inline int rseq_lazy_force_sig_fault(int sig, int code, void __user *addr) +{ + struct task_struct *t = current; + + if (!t->rseq_event_mask) + return force_sig_fault(sig, code, addr); + t->rseq_lazy_sig.pending = true; + t->rseq_lazy_sig.sig = sig; + t->rseq_lazy_sig.code = code; + t->rseq_lazy_sig.addr = addr; + return 0; +} + +#else + +static inline int rseq_lazy_force_sig_fault(int sig, int code, void __user *addr) +{ + return force_sig_fault(sig, code, addr); +} + +#endif + #endif /* _LINUX_SCHED_SIGNAL_H */ diff --git a/include/uapi/linux/auxvec.h b/include/uapi/linux/auxvec.h index 6991c4b8ab18..5044f367a219 100644 --- a/include/uapi/linux/auxvec.h +++ b/include/uapi/linux/auxvec.h @@ -32,6 +32,7 @@ #define AT_HWCAP2 26 /* extension of AT_HWCAP */ #define AT_RSEQ_FEATURE_SIZE 27 /* rseq supported feature size */ #define AT_RSEQ_ALIGN 28 /* rseq allocation alignment */ +#define AT_RSEQ_FEATURE_FLAGS 29 /* rseq feature flags */ #define AT_EXECFN 31 /* filename of program */ diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index c233aae5eac9..0fdb192e3cd3 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -37,6 +37,13 @@ enum rseq_cs_flags { (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT), }; +/* + * rseq feature flags. Query with getauxval(AT_RSEQ_FEATURE_FLAGS). + */ +enum rseq_feature_flags { + RSEQ_FEATURE_PAGE_FAULT_ABORT = (1U << 0), +}; + /* * struct rseq_cs is aligned on 4 * 8 bytes to ensure it is always * contained within a single cache-line. It is usually declared as diff --git a/kernel/rseq.c b/kernel/rseq.c index 9de6e35fe679..f686a97abb45 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -271,6 +271,25 @@ static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; } +static void rseq_clear_lazy_sig_fault(struct task_struct *t) +{ + if (!t->rseq_lazy_sig.pending) + return; + t->rseq_lazy_sig.pending = false; + t->rseq_lazy_sig.sig = 0; + t->rseq_lazy_sig.code = 0; + t->rseq_lazy_sig.addr = NULL; +} + +static void rseq_force_lazy_sig_fault(struct task_struct *t) +{ + if (!t->rseq_lazy_sig.pending) + return; + force_sig_fault(t->rseq_lazy_sig.sig, t->rseq_lazy_sig.code, + t->rseq_lazy_sig.addr); + rseq_clear_lazy_sig_fault(t); +} + static int rseq_ip_fixup(struct pt_regs *regs) { unsigned long ip = instruction_pointer(regs); @@ -280,25 +299,32 @@ static int rseq_ip_fixup(struct pt_regs *regs) ret = rseq_get_rseq_cs(t, &rseq_cs); if (ret) - return ret; + goto nofixup; /* * Handle potentially not being within a critical section. * If not nested over a rseq critical section, restart is useless. * Clear the rseq_cs pointer and return. */ - if (!in_rseq_cs(ip, &rseq_cs)) - return clear_rseq_cs(t); + if (!in_rseq_cs(ip, &rseq_cs)) { + ret = clear_rseq_cs(t); + goto nofixup; + } ret = rseq_need_restart(t, rseq_cs.flags); if (ret <= 0) - return ret; + goto nofixup; ret = clear_rseq_cs(t); if (ret) - return ret; + goto nofixup; + rseq_clear_lazy_sig_fault(t); trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, rseq_cs.abort_ip); instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip); return 0; + +nofixup: + rseq_force_lazy_sig_fault(t); + return ret; } /* -- 2.39.2