A number of exceptions can re-enable IRQs and schedule (provided the context taking the exception has IRQs enabled etc.). Currently this isn't standardized and each architecture does it slightly different. (Notably, ARM64 restores/inherits more than just the IRQ state, suggesting at least some form of arch_ hook might be appropriate on top of this). Create a single set of functions for this and convert x86 to use these. The purpose is to have a single (common) place to hook in order to cover all the exceptions that can schedule(). Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx> --- arch/x86/kernel/traps.c | 48 ++++++++++++++++--------------------------- arch/x86/mm/fault.c | 28 ++++++++++++------------- include/linux/entry-common.h | 24 +++++++++++++++++++++ 3 files changed, 56 insertions(+), 44 deletions(-) --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -73,18 +73,6 @@ DECLARE_BITMAP(system_vectors, NR_VECTORS); -static inline void cond_local_irq_enable(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); -} - -static inline void cond_local_irq_disable(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_disable(); -} - __always_inline int is_valid_bugaddr(unsigned long addr) { if (addr < TASK_SIZE_MAX) @@ -177,9 +165,9 @@ static void do_error_trap(struct pt_regs if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != NOTIFY_STOP) { - cond_local_irq_enable(regs); + irqentry_irq_enable(regs); do_trap(trapnr, signr, str, regs, error_code, sicode, addr); - cond_local_irq_disable(regs); + irqentry_irq_disable(regs); } } @@ -300,7 +288,7 @@ DEFINE_IDTENTRY_ERRORCODE(exc_alignment_ if (!user_mode(regs)) die("Split lock detected\n", regs, error_code); - local_irq_enable(); + irqentry_irq_enable(regs); if (handle_user_split_lock(regs, error_code)) goto out; @@ -309,7 +297,7 @@ DEFINE_IDTENTRY_ERRORCODE(exc_alignment_ error_code, BUS_ADRALN, NULL); out: - local_irq_disable(); + irqentry_irq_disable(regs); } #ifdef CONFIG_VMAP_STACK @@ -473,14 +461,14 @@ DEFINE_IDTENTRY(exc_bounds) if (notify_die(DIE_TRAP, "bounds", regs, 0, X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) return; - cond_local_irq_enable(regs); + irqentry_irq_enable(regs); if (!user_mode(regs)) die("bounds", regs, 0); do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, 0, 0, NULL); - cond_local_irq_disable(regs); + irqentry_irq_disable(regs); } enum kernel_gp_hint { @@ -567,7 +555,7 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_pr unsigned long gp_addr; int ret; - cond_local_irq_enable(regs); + irqentry_irq_enable(regs); if (static_cpu_has(X86_FEATURE_UMIP)) { if (user_mode(regs) && fixup_umip_exception(regs)) @@ -638,7 +626,7 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_pr die_addr(desc, regs, error_code, gp_addr); exit: - cond_local_irq_disable(regs); + irqentry_irq_disable(regs); } static bool do_int3(struct pt_regs *regs) @@ -665,9 +653,9 @@ static void do_int3_user(struct pt_regs if (do_int3(regs)) return; - cond_local_irq_enable(regs); + irqentry_irq_enable(regs); do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, 0, 0, NULL); - cond_local_irq_disable(regs); + irqentry_irq_disable(regs); } DEFINE_IDTENTRY_RAW(exc_int3) @@ -1003,7 +991,7 @@ static __always_inline void exc_debug_us goto out; /* It's safe to allow irq's after DR6 has been saved */ - local_irq_enable(); + irqentry_irq_enable(regs); if (v8086_mode(regs)) { handle_vm86_trap((struct kernel_vm86_regs *)regs, 0, X86_TRAP_DB); @@ -1020,7 +1008,7 @@ static __always_inline void exc_debug_us send_sigtrap(regs, 0, get_si_code(dr6)); out_irq: - local_irq_disable(); + irqentry_irq_disable(regs); out: instrumentation_end(); irqentry_exit_to_user_mode(regs); @@ -1064,7 +1052,7 @@ static void math_error(struct pt_regs *r char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" : "simd exception"; - cond_local_irq_enable(regs); + irqentry_irq_enable(regs); if (!user_mode(regs)) { if (fixup_exception(regs, trapnr, 0, 0)) @@ -1099,7 +1087,7 @@ static void math_error(struct pt_regs *r force_sig_fault(SIGFPE, si_code, (void __user *)uprobe_get_trap_addr(regs)); exit: - cond_local_irq_disable(regs); + irqentry_irq_disable(regs); } DEFINE_IDTENTRY(exc_coprocessor_error) @@ -1160,7 +1148,7 @@ static bool handle_xfd_event(struct pt_r if (WARN_ON(!user_mode(regs))) return false; - local_irq_enable(); + irqentry_irq_enable(regs); err = xfd_enable_feature(xfd_err); @@ -1173,7 +1161,7 @@ static bool handle_xfd_event(struct pt_r break; } - local_irq_disable(); + irqentry_irq_disable(regs); return true; } @@ -1188,12 +1176,12 @@ DEFINE_IDTENTRY(exc_device_not_available if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) { struct math_emu_info info = { }; - cond_local_irq_enable(regs); + irqentry_irq_enable(regs); info.regs = regs; math_emulate(&info); - cond_local_irq_disable(regs); + irqentry_irq_disable(regs); return; } #endif --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1209,6 +1209,12 @@ do_kern_addr_fault(struct pt_regs *regs, NOKPROBE_SYMBOL(do_kern_addr_fault); /* + * EFLAGS[3] is unused and ABI defined to be 0, use it to store IRQ state, + * because do_user_addr_fault() is too convoluted to track things. + */ +#define X86_EFLAGS_MISC (1UL << 3) + +/* * Handle faults in the user portion of the address space. Nothing in here * should check X86_PF_USER without a specific justification: for almost * all purposes, we should treat a normal kernel access to user memory @@ -1290,13 +1296,11 @@ void do_user_addr_fault(struct pt_regs * * User-mode registers count as a user access even for any * potential system fault or CPU buglet: */ - if (user_mode(regs)) { - local_irq_enable(); + if (user_mode(regs)) flags |= FAULT_FLAG_USER; - } else { - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); - } + + irqentry_irq_enable(regs); + regs->flags |= X86_EFLAGS_MISC; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); @@ -1483,14 +1487,10 @@ handle_page_fault(struct pt_regs *regs, do_kern_addr_fault(regs, error_code, address); } else { do_user_addr_fault(regs, error_code, address); - /* - * User address page fault handling might have reenabled - * interrupts. Fixing up all potential exit points of - * do_user_addr_fault() and its leaf functions is just not - * doable w/o creating an unholy mess or turning the code - * upside down. - */ - local_irq_disable(); + if (regs->flags & X86_EFLAGS_MISC) { + regs->flags &= ~X86_EFLAGS_MISC; + irqentry_irq_disable(regs); + } } } --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -7,6 +7,7 @@ #include <linux/syscalls.h> #include <linux/seccomp.h> #include <linux/sched.h> +#include <asm/ptrace.h> #include <asm/entry-common.h> @@ -213,6 +214,29 @@ static inline void local_irq_disable_exi #endif /** + * irqentry_irq_enable - Conditionally enable IRQs from exceptions + * + * Common code for exceptions to (re)enable IRQs, typically done to allow + * from-user exceptions to schedule (since they run on the task stack). + */ +static inline void irqentry_irq_enable(struct pt_regs *regs) +{ + if (!regs_irqs_disabled(regs)) + local_irq_enable(); +} + +/** + * irqentry_irq_disable - Conditionally disable IRQs from exceptions + * + * Counterpart of irqentry_irq_enable(). + */ +static inline void irqentry_irq_disable(struct pt_regs *regs) +{ + if (!regs_irqs_disabled(regs)) + local_irq_disable(); +} + +/** * arch_exit_to_user_mode_work - Architecture specific TIF work for exit * to user mode. * @regs: Pointer to currents pt_regs