Thank you for cc'ing me. You probably want to hold off on back-porting this patch. The appropriate fix requires some more conversation. At a minimum, this patch should not be using TIF_NOTIFY_RESUME. Eric Sasha Levin <sashal@xxxxxxxxxx> writes: > From: Oleg Nesterov <oleg@xxxxxxxxxx> > > [ Upstream commit bf9ad37dc8a30cce22ae95d6c2ca6abf8731d305 ] > > On x86_64 we must disable preemption before we enable interrupts > for stack faults, int3 and debugging, because the current task is using > a per CPU debug stack defined by the IST. If we schedule out, another task > can come in and use the same stack and cause the stack to be corrupted > and crash the kernel on return. > > When CONFIG_PREEMPT_RT is enabled, spinlock_t locks become sleeping, and > one of these is the spin lock used in signal handling. > > Some of the debug code (int3) causes do_trap() to send a signal. > This function calls a spinlock_t lock that has been converted to a > sleeping lock. If this happens, the above issues with the corrupted > stack is possible. > > Instead of calling the signal right away, for PREEMPT_RT and x86, > the signal information is stored on the stacks task_struct and > TIF_NOTIFY_RESUME is set. Then on exit of the trap, the signal resume > code will send the signal when preemption is enabled. > > [ rostedt: Switched from #ifdef CONFIG_PREEMPT_RT to > ARCH_RT_DELAYS_SIGNAL_SEND and added comments to the code. ] > [bigeasy: Add on 32bit as per Yang Shi, minor rewording. 
] > [ tglx: Use a config option ] > > Signed-off-by: Oleg Nesterov <oleg@xxxxxxxxxx> > Signed-off-by: Steven Rostedt <rostedt@xxxxxxxxxxx> > Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx> > Signed-off-by: Sebastian Andrzej Siewior <bigeasy@xxxxxxxxxxxxx> > Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx> > Link: https://lore.kernel.org/r/Ygq5aBB/qMQw6aP5@xxxxxxxxxxxxx > Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx> > --- > arch/x86/Kconfig | 1 + > include/linux/sched.h | 3 +++ > kernel/Kconfig.preempt | 12 +++++++++++- > kernel/entry/common.c | 14 ++++++++++++++ > kernel/signal.c | 40 ++++++++++++++++++++++++++++++++++++++++ > 5 files changed, 69 insertions(+), 1 deletion(-) > > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig > index 9f5bd41bf660..d557ac29b6cd 100644 > --- a/arch/x86/Kconfig > +++ b/arch/x86/Kconfig > @@ -120,6 +120,7 @@ config X86 > select ARCH_WANTS_NO_INSTR > select ARCH_WANT_HUGE_PMD_SHARE > select ARCH_WANT_LD_ORPHAN_WARN > + select ARCH_WANTS_RT_DELAYED_SIGNALS > select ARCH_WANTS_THP_SWAP if X86_64 > select ARCH_HAS_PARANOID_L1D_FLUSH > select BUILDTIME_TABLE_SORT > diff --git a/include/linux/sched.h b/include/linux/sched.h > index 75ba8aa60248..098e37fd770a 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -1087,6 +1087,9 @@ struct task_struct { > /* Restored if set_restore_sigmask() was used: */ > sigset_t saved_sigmask; > struct sigpending pending; > +#ifdef CONFIG_RT_DELAYED_SIGNALS > + struct kernel_siginfo forced_info; > +#endif > unsigned long sas_ss_sp; > size_t sas_ss_size; > unsigned int sas_ss_flags; > diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt > index ce77f0265660..5644abd5f8a8 100644 > --- a/kernel/Kconfig.preempt > +++ b/kernel/Kconfig.preempt > @@ -132,4 +132,14 @@ config SCHED_CORE > which is the likely usage by Linux distributions, there should > be no measurable impact on performance. 
> > - > +config ARCH_WANTS_RT_DELAYED_SIGNALS > + bool > + help > + This option is selected by architectures where raising signals > + can happen in atomic contexts on PREEMPT_RT enabled kernels. This > + option delays raising the signal until the return to user space > + loop where it is also delivered. X86 requires this to deliver > + signals from trap handlers which run on IST stacks. > + > +config RT_DELAYED_SIGNALS > + def_bool PREEMPT_RT && ARCH_WANTS_RT_DELAYED_SIGNALS > diff --git a/kernel/entry/common.c b/kernel/entry/common.c > index bad713684c2e..0543a2c92f20 100644 > --- a/kernel/entry/common.c > +++ b/kernel/entry/common.c > @@ -148,6 +148,18 @@ static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work) > arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING); > } > > +#ifdef CONFIG_RT_DELAYED_SIGNALS > +static inline void raise_delayed_signal(void) > +{ > + if (unlikely(current->forced_info.si_signo)) { > + force_sig_info(&current->forced_info); > + current->forced_info.si_signo = 0; > + } > +} > +#else > +static inline void raise_delayed_signal(void) { } > +#endif > + > static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, > unsigned long ti_work) > { > @@ -162,6 +174,8 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, > if (ti_work & _TIF_NEED_RESCHED) > schedule(); > > + raise_delayed_signal(); > + > if (ti_work & _TIF_UPROBE) > uprobe_notify_resume(regs); > > diff --git a/kernel/signal.c b/kernel/signal.c > index 9b04631acde8..e93de6daa188 100644 > --- a/kernel/signal.c > +++ b/kernel/signal.c > @@ -1307,6 +1307,43 @@ enum sig_handler { > HANDLER_EXIT, /* Only visible as the process exit code */ > }; > > +/* > + * On some archictectures, PREEMPT_RT has to delay sending a signal from a > + * trap since it cannot enable preemption, and the signal code's > + * spin_locks turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME > + * which will send the signal on exit of the trap. 
> + */ > +#ifdef CONFIG_RT_DELAYED_SIGNALS > +static inline bool force_sig_delayed(struct kernel_siginfo *info, > + struct task_struct *t) > +{ > + if (!in_atomic()) > + return false; > + > + if (WARN_ON_ONCE(t->forced_info.si_signo)) > + return true; > + > + if (is_si_special(info)) { > + WARN_ON_ONCE(info != SEND_SIG_PRIV); > + t->forced_info.si_signo = info->si_signo; > + t->forced_info.si_errno = 0; > + t->forced_info.si_code = SI_KERNEL; > + t->forced_info.si_pid = 0; > + t->forced_info.si_uid = 0; > + } else { > + t->forced_info = *info; > + } > + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); > + return true; > +} > +#else > +static inline bool force_sig_delayed(struct kernel_siginfo *info, > + struct task_struct *t) > +{ > + return false; > +} > +#endif > + > /* > * Force a signal that the process can't ignore: if necessary > * we unblock the signal and change any SIG_IGN to SIG_DFL. > @@ -1327,6 +1364,9 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, > struct k_sigaction *action; > int sig = info->si_signo; > > + if (force_sig_delayed(info, t)) > + return 0; > + > spin_lock_irqsave(&t->sighand->siglock, flags); > action = &t->sighand->action[sig-1]; > ignored = action->sa.sa_handler == SIG_IGN;