On Fri, 3 May 2019 16:07:59 -0700 Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> wrote: > static struct pt_regs *emulate_call(struct pt_regs *regs, unsigned > long return, unsigned long target) > { > #ifdef CONFIG_X86_32 > /* BIG comment about how we need to move pt_regs to make > room and to update the return 'sp' */ > struct pt_regs *new = (void *)regs - 4; > unsigned long *sp = (unsigned long *)(new + 1); I tried this, and it crashed. After a bit of debugging, I found that the issue is that we can't use sizeof(*regs), because we don't have a full pt_regs. We are missing the first entry (which is why we have kernel_stack_pointer() in the first place!) > memmove(new, regs, sizeof(*regs)); This would need to be something like: #define SAVED_REGS_SIZE (sizeof(struct pt_regs) - 4) struct pt_regs *new = (void *)regs - 4; unsigned long *sp = (unsigned long *) ((void *)new + SAVED_REGS_SIZE); memmove(new, regs, SAVED_REGS_SIZE); > regs = new; > #else > unsigned long *sp = regs->sp; > regs->sp -= 4; should be: unsigned long *sp; regs->sp -= sizeof(long); sp = (unsigned long *)regs->sp; > #endif > *sp = value; > regs->ip = target; > return regs; > } I got it sorta working. And it appears that printk() no longer flushes partial lines, and I couldn't figure out where my self tests broke as it does: pr_info("Testing ....: "); ret = do_test(); if (ret) pr_info("PASSED\n"); else pr_info("FAILED\n"); but because it doesn't flush anymore, I don't see what test it is :-p But that's another issue. With some "early_printk" in the code, I do see it doing the calls, but for some reason, the start up test never moves forward. Below is the patch that I tried. I started with Peter's changes (this works on x86_64) and tried to work your ideas in for x86_32. This still gets stuck (but doesn't crash). Also note that this method prevents the die notifiers from doing this, which means I had to disable Peter's selftest because that no longer will work the way it is written. -- Steve diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index d309f30cf7af..f0fd3e17bedd 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -67,9 +67,20 @@ # define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else # define preempt_stop(clobbers) -# define resume_kernel restore_all_kernel #endif +.macro RETINT_PREEMPT +#ifdef CONFIG_PREEMPT + DISABLE_INTERRUPTS(CLBR_ANY) + cmpl $0, PER_CPU_VAR(__preempt_count) + jnz .Lend_\@ + testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? + jz .Lend_\@ + call preempt_schedule_irq +.Lend_\@: +#endif +.endm + .macro TRACE_IRQS_IRET #ifdef CONFIG_TRACE_IRQFLAGS testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off? @@ -377,6 +388,7 @@ #define CS_FROM_ENTRY_STACK (1 << 31) #define CS_FROM_USER_CR3 (1 << 30) +#define CS_FROM_INT3 (1 << 29) .macro SWITCH_TO_KERNEL_STACK @@ -753,7 +765,7 @@ ret_from_intr: andl $SEGMENT_RPL_MASK, %eax #endif cmpl $USER_RPL, %eax - jb resume_kernel # not returning to v8086 or userspace + jb restore_all_kernel # not returning to v8086 or userspace ENTRY(resume_userspace) DISABLE_INTERRUPTS(CLBR_ANY) @@ -763,19 +775,6 @@ ENTRY(resume_userspace) jmp restore_all END(ret_from_exception) -#ifdef CONFIG_PREEMPT -ENTRY(resume_kernel) - DISABLE_INTERRUPTS(CLBR_ANY) -.Lneed_resched: - cmpl $0, PER_CPU_VAR(__preempt_count) - jnz restore_all_kernel - testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? - jz restore_all_kernel - call preempt_schedule_irq - jmp .Lneed_resched -END(resume_kernel) -#endif - GLOBAL(__begin_SYSENTER_singlestep_region) /* * All code from here through __end_SYSENTER_singlestep_region is subject @@ -1026,6 +1025,7 @@ restore_all: INTERRUPT_RETURN restore_all_kernel: + RETINT_PREEMPT TRACE_IRQS_IRET PARANOID_EXIT_TO_KERNEL_MODE BUG_IF_WRONG_CR3 @@ -1476,6 +1476,27 @@ END(nmi) ENTRY(int3) ASM_CLAC + +#ifdef CONFIG_VM86 + testl $X86_EFLAGS_VM, 8(%esp) + jnz .Lfrom_usermode_no_gap +#endif + testl $SEGMENT_RPL_MASK, 4(%esp) + jnz .Lfrom_usermode_no_gap + + pushl $-1 # mark this as an int + + SAVE_ALL switch_stacks=1 + ENCODE_FRAME_POINTER + TRACE_IRQS_OFF + movl %esp, %eax # pt_regs pointer + subl $8, %esp + call do_kernel_int3 + movl %eax, %esp + jmp ret_from_exception + +.Lfrom_usermode_no_gap: + pushl $-1 # mark this as an int SAVE_ALL switch_stacks=1 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 1f0efdb7b629..834ec1397dab 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -879,7 +879,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt * @paranoid == 2 is special: the stub will never switch stacks. This is for * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS. */ -.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 +.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 create_gap=0 ENTRY(\sym) UNWIND_HINT_IRET_REGS offset=\has_error_code*8 @@ -899,6 +899,16 @@ ENTRY(\sym) jnz .Lfrom_usermode_switch_stack_\@ .endif + .if \create_gap == 1 + testb $3, CS-ORIG_RAX(%rsp) + jnz .Lfrom_usermode_no_gap_\@ + .rept 6 + pushq 5*8(%rsp) + .endr + UNWIND_HINT_IRET_REGS offset=8 +.Lfrom_usermode_no_gap_\@: + .endif + .if \paranoid call paranoid_entry .else @@ -1130,7 +1140,7 @@ apicinterrupt3 HYPERV_STIMER0_VECTOR \ #endif /* CONFIG_HYPERV */ idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK -idtentry int3 do_int3 has_error_code=0 +idtentry int3 do_int3 has_error_code=0 create_gap=1 idtentry stack_segment do_stack_segment has_error_code=1 #ifdef CONFIG_XEN_PV diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index cf350639e76d..4b335ac5afcc 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -37,7 +37,7 @@ struct dyn_arch_ftrace { /* No extra data needed for x86 */ }; -int ftrace_int3_handler(struct pt_regs *regs); +int ftrace_int3_handler(struct pt_regs **regs); #define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index e85ff65c43c3..8e638231cf52 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -39,4 +39,33 @@ extern int poke_int3_handler(struct pt_regs *regs); extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); extern int after_bootmem; +static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip) +{ + regs->ip = ip; +} + +#define INT3_INSN_SIZE 1 +#define CALL_INSN_SIZE 5 + +static inline struct pt_regs * int3_emulate_call(struct pt_regs *regs, unsigned long func) +{ +#ifdef CONFIG_X86_32 +#define SAVED_REGS_SIZE (sizeof(struct pt_regs) - 8) + /* BIG comment about how we need to move pt_regs to make + room and to update the return 'sp' */ + struct pt_regs *new = (void *)regs - sizeof(long); + unsigned long *sp = (unsigned long *) + ((void *)new + SAVED_REGS_SIZE); + memmove(new, regs, SAVED_REGS_SIZE); + regs = new; +#else + unsigned long *sp; + regs->sp -= sizeof(long); + sp = (unsigned long *)regs->sp; +#endif + *sp = regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE; + regs->ip = func; + return regs; +} + #endif /* _ASM_X86_TEXT_PATCHING_H */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 9a79c7808f9c..c0e9002e2708 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -613,11 +613,84 @@ extern struct paravirt_patch_site __start_parainstructions[], __stop_parainstructions[]; #endif /* CONFIG_PARAVIRT */ +/* + * Self-test for the INT3 based CALL emulation code. + * + * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up + * properly and that there is a stack gap between the INT3 frame and the + * previous context. Without this gap doing a virtual PUSH on the interrupted + * stack would corrupt the INT3 IRET frame. + * + * See entry_{32,64}.S for more details. + */ +static void __init int3_magic(unsigned int *ptr) +{ + *ptr = 1; +} + +extern __initdata unsigned long int3_selftest_ip; /* defined in asm below */ + +static int __init +int3_exception_notify(struct notifier_block *self, unsigned long val, void *data) +{ + struct die_args *args = data; + struct pt_regs *regs = args->regs; + + if (!regs || user_mode(regs)) + return NOTIFY_DONE; + + if (val != DIE_INT3) + return NOTIFY_DONE; + + if (regs->ip - INT3_INSN_SIZE != int3_selftest_ip) + return NOTIFY_DONE; + + int3_emulate_call(regs, (unsigned long)&int3_magic); + return NOTIFY_STOP; +} + +static void __init int3_selftest(void) +{ + static __initdata struct notifier_block int3_exception_nb = { + .notifier_call = int3_exception_notify, + .priority = INT_MAX-1, /* last */ + }; + unsigned int val = 0; + + return; + BUG_ON(register_die_notifier(&int3_exception_nb)); + + /* + * Basically: int3_magic(&val); but really complicated :-) + * + * Stick the address of the INT3 instruction into int3_selftest_ip, + * then trigger the INT3, padded with NOPs to match a CALL instruction + * length. + */ + asm volatile ("1: int3; nop; nop; nop; nop\n\t" + ".pushsection .init.data,\"aw\"\n\t" + ".align " __ASM_SEL(4, 8) "\n\t" + ".type int3_selftest_ip, @object\n\t" + ".size int3_selftest_ip, " __ASM_SEL(4, 8) "\n\t" + "int3_selftest_ip:\n\t" + __ASM_SEL(.long, .quad) " 1b\n\t" + ".popsection\n\t" + : : __ASM_SEL_RAW(a, D) (&val) : "memory"); + + BUG_ON(val != 1); + + unregister_die_notifier(&int3_exception_nb); +} + void __init alternative_instructions(void) { - /* The patching is not fully atomic, so try to avoid local interruptions - that might execute the to be patched code. - Other CPUs are not running. */ + int3_selftest(); + + /* + * The patching is not fully atomic, so try to avoid local + * interruptions that might execute the to be patched code. + * Other CPUs are not running. + */ stop_nmi(); /* @@ -642,10 +715,11 @@ void __init alternative_instructions(void) _text, _etext); } - if (!uniproc_patched || num_possible_cpus() == 1) + if (!uniproc_patched || num_possible_cpus() == 1) { free_init_pages("SMP alternatives", (unsigned long)__smp_locks, (unsigned long)__smp_locks_end); + } #endif apply_paravirt(__parainstructions, __parainstructions_end); diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index ef49517f6bb2..c5e2ae1efbd8 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -29,6 +29,7 @@ #include <asm/kprobes.h> #include <asm/ftrace.h> #include <asm/nops.h> +#include <asm/text-patching.h> #ifdef CONFIG_DYNAMIC_FTRACE @@ -231,6 +232,7 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, } static unsigned long ftrace_update_func; +static unsigned long ftrace_update_func_call; static int update_ftrace_func(unsigned long ip, void *new) { @@ -259,6 +261,8 @@ int ftrace_update_ftrace_func(ftrace_func_t func) unsigned char *new; int ret; + ftrace_update_func_call = (unsigned long)func; + new = ftrace_call_replace(ip, (unsigned long)func); ret = update_ftrace_func(ip, new); @@ -287,20 +291,30 @@ static nokprobe_inline int is_ftrace_caller(unsigned long ip) * call to a nop. While the change is taking place, we treat * it just like it was a nop. */ -int ftrace_int3_handler(struct pt_regs *regs) +int ftrace_int3_handler(struct pt_regs **pregs) { + struct pt_regs *regs = *pregs; unsigned long ip; if (WARN_ON_ONCE(!regs)) return 0; - ip = regs->ip - 1; - if (!ftrace_location(ip) && !is_ftrace_caller(ip)) - return 0; + ip = regs->ip - INT3_INSN_SIZE; - regs->ip += MCOUNT_INSN_SIZE - 1; + if (ftrace_location(ip)) { + *pregs = int3_emulate_call(regs, (unsigned long)ftrace_regs_caller); +// early_printk("ip=%pS was=%px is=%px\n", (void *)ip, regs, *pregs); + return 1; + } else if (is_ftrace_caller(ip)) { + if (!ftrace_update_func_call) { + int3_emulate_jmp(regs, ip + CALL_INSN_SIZE); + return 1; + } + *pregs = int3_emulate_call(regs, ftrace_update_func_call); + return 1; + } - return 1; + return 0; } NOKPROBE_SYMBOL(ftrace_int3_handler); @@ -859,6 +873,8 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops) func = ftrace_ops_get_func(ops); + ftrace_update_func_call = (unsigned long)func; + /* Do a safe modify in case the trampoline is executing */ new = ftrace_call_replace(ip, (unsigned long)func); ret = update_ftrace_func(ip, new); @@ -960,6 +976,7 @@ static int ftrace_mod_jmp(unsigned long ip, void *func) { unsigned char *new; + ftrace_update_func_call = 0UL; new = ftrace_jmp_replace(ip, (unsigned long)func); return update_ftrace_func(ip, new); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 4b8ee05dd6ad..03ba69a6e594 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -163,6 +163,9 @@ static inline bool invalid_selector(u16 value) * stack pointer we fall back to regs as stack if no previous stack * exists. * + * There is a special case for INT3, there we construct a full pt_regs + * environment. We can detect this case by a high bit in regs->cs + * * This is valid only for kernel mode traps. */ unsigned long kernel_stack_pointer(struct pt_regs *regs) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d26f9e9c3d83..17e0c56326c9 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -570,7 +570,7 @@ do_general_protection(struct pt_regs *regs, long error_code) } NOKPROBE_SYMBOL(do_general_protection); -dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) +dotraplinkage struct pt_regs * notrace do_kernel_int3(struct pt_regs *regs) { #ifdef CONFIG_DYNAMIC_FTRACE /* @@ -578,11 +578,11 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) * See note by declaration of modifying_ftrace_code in ftrace.c */ if (unlikely(atomic_read(&modifying_ftrace_code)) && - ftrace_int3_handler(regs)) - return; + ftrace_int3_handler(®s)) + return regs; #endif if (poke_int3_handler(regs)) - return; + return regs; /* * Use ist_enter despite the fact that we don't use an IST stack. @@ -594,7 +594,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) ist_enter(regs); RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP - if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, + if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) goto exit; #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ @@ -604,16 +604,26 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) goto exit; #endif - if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, + if (notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) goto exit; cond_local_irq_enable(regs); - do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, 0, NULL); + do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, 0, 0, NULL); cond_local_irq_disable(regs); exit: ist_exit(regs); + return regs; +} +NOKPROBE_SYMBOL(do_kernel_int3); + +dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) +{ + struct pt_regs *ret_regs; + + ret_regs = do_kernel_int3(regs); + WARN_ON_ONCE(ret_regs != regs); } NOKPROBE_SYMBOL(do_int3);