Re: [RFC][PATCH 1/2] x86: Allow breakpoints to emulate call functions

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, 2 May 2019 22:21:46 +0200
Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:

> On Thu, May 02, 2019 at 11:43:37AM -0700, Linus Torvalds wrote:
> > What would it look like with the "int3-from-kernel is special" modification?  
> 
> Something like so; it boots; but I could've made some horrible mistake
> (again).
> 
> ---
> diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
> index 7b23431be5cb..4de51cff5b8a 100644
> --- a/arch/x86/entry/entry_32.S
> +++ b/arch/x86/entry/entry_32.S

Oh, and so close!

I was running this on my i386 tests and for test 8 of 9 (passed 1-7) I
hit this:

Testing all events: OK
Testing ftrace filter: OK
trace_kprobe: Testing kprobe tracing: 
BUG: unable to handle kernel paging request at 10ec839c
#PF error: [INSTR]
*pdpt = 0000000000000000 *pde = 0000000000000000 
Oops: 0010 [#1] SMP PTI
CPU: 1 PID: 1 Comm: swapper/0 Not tainted 5.1.0-rc3-test+ #1
Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014
EIP: 0x10ec839c
Code: Bad RIP value.
EAX: 54e24bbc EBX: 00000000 ECX: 00000003 EDX: 00000002
ESI: ebcae400 EDI: c1513a88 EBP: ee641f30 ESP: c0439a07
DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00210202
CR0: 80050033 CR2: 10ec8392 CR3: 0167e000 CR4: 001406f0
Call Trace:
Modules linked in:
CR2: 0000000010ec839c
---[ end trace 8aa996061578b437 ]---
EIP: 0x10ec839c
Code: Bad RIP value.
EAX: 54e24bbc EBX: 00000000 ECX: 00000003 EDX: 00000002
ESI: ebcae400 EDI: c1513a88 EBP: ee641f30 ESP: c16834bc
DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00210202
CR0: 80050033 CR2: 10ec8392 CR3: 0167e000 CR4: 001406f0
Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000009
Kernel Offset: disabled
---[ end Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000009 ]---
------------[ cut here ]------------
sched: Unexpected reschedule of offline CPU#3!
WARNING: CPU: 1 PID: 1 at arch/x86/kernel/smp.c:128 native_smp_send_reschedule+0x1c/0x37
Modules linked in:
CPU: 1 PID: 1 Comm: swapper/0 Tainted: G      D           5.1.0-rc3-test+ #1
Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014
EIP: native_smp_send_reschedule+0x1c/0x37
Code: 4a 18 ba fb 00 00 00 e8 71 d6 9c 00 5d c3 55 89 e5 e8 37 6f 00 00 0f a3 05 e0 20 44 c1 72 11 50 68 d2 75 15 c1 e8 c7 a4 01 00 <0f> 0b 58 5a eb 13 8b 15 c0 36 28 c1 8b 4a 18 ba fd 00 00 00 e8 3a
EAX: 0000002e EBX: 00000003 ECX: c049977e EDX: ee638000
ESI: 00000003 EDI: 00000003 EBP: ee641e04 ESP: ee641dfc
DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00210092
CR0: 80050033 CR2: 10ec8392 CR3: 0167e000 CR4: 001406f0
Call Trace:
 kick_ilb+0x77/0x7c
 trigger_load_balance+0x279/0x280
 ? __pick_first_entity+0x18/0x18
 scheduler_tick+0x90/0xa9
 ? __pick_first_entity+0x18/0x18
 update_process_times+0x3f/0x4a
 tick_sched_handle+0x4c/0x5a
 tick_sched_timer+0x3b/0x79
 ? tick_sched_do_timer+0x44/0x44
 __hrtimer_run_queues+0x180/0x26a
 ? tick_sched_do_timer+0x44/0x44
 hrtimer_interrupt+0xa6/0x18e
 ? rcu_irq_enter+0x60/0x7e
 smp_apic_timer_interrupt+0xdf/0x179
 apic_timer_interrupt+0xda/0xe0
EIP: panic+0x208/0x24a
Code: 83 c3 64 eb ad 83 3d a8 d1 68 c1 00 74 05 e8 01 bf 01 00 68 e0 d1 68 c1 68 cc e6 15 c1 e8 bd e7 04 00 e8 02 ff 0a 00 fb 31 db <58> 5a e8 c5 a9 09 00 39 fb 7c 18 83 f6 01 8b 15 a0 d1 68 c1 89 f0
EAX: c044c764 EBX: 00000000 ECX: c049977e EDX: 318e4000
ESI: 00000000 EDI: 00000000 EBP: ee641f6c ESP: ee641f54
DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00200246
 ? console_unlock+0x466/0x4b8
 ? panic+0x205/0x24a
 ? do_exit+0x4bd/0x8d1
 do_exit+0x4e3/0x8d1
 ? copy_oldmem_page+0x63/0x9b
 rewind_stack_do_exit+0x11/0x13
irq event stamp: 1400780
hardirqs last  enabled at (1400779): [<c0401568>] trace_hardirqs_on_thunk+0xc/0x10
hardirqs last disabled at (1400780): [<c0401578>] trace_hardirqs_off_thunk+0xc/0x10
softirqs last  enabled at (1400760): [<c0dfefb2>] __do_softirq+0x2a2/0x2d2
softirqs last disabled at (1400753): [<c0416fb7>] call_on_stack+0x45/0x4b
---[ end trace 8aa996061578b438 ]---
------------[ cut here ]------------

That's one of the startup tests that happens on boot up.

Config attached.

Good news is, it appears very reproducible.

-- Steve

Here's the patch I applied (both folded together):

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index d309f30cf7af..2885acd691ac 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -67,9 +67,20 @@
 # define preempt_stop(clobbers)	DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
 #else
 # define preempt_stop(clobbers)
-# define resume_kernel		restore_all_kernel
 #endif
 
+.macro RETINT_PREEMPT
+#ifdef CONFIG_PREEMPT
+	DISABLE_INTERRUPTS(CLBR_ANY)
+	cmpl	$0, PER_CPU_VAR(__preempt_count)
+	jnz	.Lend_\@
+	testl	$X86_EFLAGS_IF, PT_EFLAGS(%esp)	# interrupts off (exception path) ?
+	jz	.Lend_\@
+	call	preempt_schedule_irq
+.Lend_\@:
+#endif
+.endm
+
 .macro TRACE_IRQS_IRET
 #ifdef CONFIG_TRACE_IRQFLAGS
 	testl	$X86_EFLAGS_IF, PT_EFLAGS(%esp)     # interrupts off?
@@ -753,7 +764,7 @@ ret_from_intr:
 	andl	$SEGMENT_RPL_MASK, %eax
 #endif
 	cmpl	$USER_RPL, %eax
-	jb	resume_kernel			# not returning to v8086 or userspace
+	jb	restore_all_kernel		# not returning to v8086 or userspace
 
 ENTRY(resume_userspace)
 	DISABLE_INTERRUPTS(CLBR_ANY)
@@ -763,19 +774,6 @@ ENTRY(resume_userspace)
 	jmp	restore_all
 END(ret_from_exception)
 
-#ifdef CONFIG_PREEMPT
-ENTRY(resume_kernel)
-	DISABLE_INTERRUPTS(CLBR_ANY)
-.Lneed_resched:
-	cmpl	$0, PER_CPU_VAR(__preempt_count)
-	jnz	restore_all_kernel
-	testl	$X86_EFLAGS_IF, PT_EFLAGS(%esp)	# interrupts off (exception path) ?
-	jz	restore_all_kernel
-	call	preempt_schedule_irq
-	jmp	.Lneed_resched
-END(resume_kernel)
-#endif
-
 GLOBAL(__begin_SYSENTER_singlestep_region)
 /*
  * All code from here through __end_SYSENTER_singlestep_region is subject
@@ -1026,6 +1024,7 @@ restore_all:
 	INTERRUPT_RETURN
 
 restore_all_kernel:
+	RETINT_PREEMPT
 	TRACE_IRQS_IRET
 	PARANOID_EXIT_TO_KERNEL_MODE
 	BUG_IF_WRONG_CR3
@@ -1476,6 +1475,94 @@ END(nmi)
 
 ENTRY(int3)
 	ASM_CLAC
+
+#ifdef CONFIG_VM86
+	testl	$X86_EFLAGS_VM, 8(%esp)
+	jnz	.Lfrom_usermode_no_gap
+#endif
+	testl	$SEGMENT_RPL_MASK, 4(%esp)
+	jnz	.Lfrom_usermode_no_gap
+
+	/*
+	 * Here from kernel mode; so the (exception) stack looks like:
+	 *
+	 * 12(esp) - <previous context>
+	 *  8(esp) - flags
+	 *  4(esp) - cs
+	 *  0(esp) - ip
+	 *
+	 * Lets build a 5 entry IRET frame after that, such that struct pt_regs
+	 * is complete and in particular regs->sp is correct. This gives us
+	 * the original 3 enties as gap:
+	 *
+	 * 32(esp) - <previous context>
+	 * 28(esp) - orig_flags / gap
+	 * 24(esp) - orig_cs	/ gap
+	 * 20(esp) - orig_ip	/ gap
+	 * 16(esp) - ss
+	 * 12(esp) - sp
+	 *  8(esp) - flags
+	 *  4(esp) - cs
+	 *  0(esp) - ip
+	 */
+	pushl	%ss	  # ss
+	pushl	%esp      # sp (points at ss)
+	pushl	4*4(%esp) # flags
+	pushl	4*4(%esp) # cs
+	pushl	4*4(%esp) # ip
+
+	add	$16, 12(%esp) # point sp back at the previous context
+
+	pushl	$-1				# orig_eax; mark as interrupt
+
+	SAVE_ALL
+	ENCODE_FRAME_POINTER
+	TRACE_IRQS_OFF
+	xorl	%edx, %edx			# zero error code
+	movl	%esp, %eax			# pt_regs pointer
+	call	do_int3
+
+	RETINT_PREEMPT
+	TRACE_IRQS_IRET
+	/*
+	 * If we really never INT3 from entry code, it looks like
+	 * we can skip this one.
+	PARANOID_EXIT_TO_KERNEL_MODE
+	 */
+	BUG_IF_WRONG_CR3
+	RESTORE_REGS 4				# consume orig_eax
+
+	/*
+	 * Reconstruct the 3 entry IRET frame right after the (modified)
+	 * regs->sp without lowering %esp in between, such that an NMI in the
+	 * middle doesn't scribble our stack.
+	 */
+
+	pushl	%eax
+	pushl	%ecx
+	movl	5*4(%esp), %eax		# (modified) regs->sp
+
+	movl	4*4(%esp), %ecx		# flags
+	movl	%ecx, -4(%eax)
+
+	movl	3*4(%esp), %ecx		# cs
+	andl	$0x0000ffff, %ecx
+	movl	%ecx, -8(%eax)
+
+	movl	2*4(%esp), %ecx		# ip
+	movl	%ecx, -12(%eax)
+
+	movl	1*4(%esp), %ecx		# eax
+	movl	%ecx, -16(%eax)
+
+	popl	%ecx
+	lea	-16(%eax), %esp
+	popl	%eax
+
+	jmp	.Lirq_return
+
+.Lfrom_usermode_no_gap:
+
 	pushl	$-1				# mark this as an int
 
 	SAVE_ALL switch_stacks=1
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 1f0efdb7b629..834ec1397dab 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -879,7 +879,7 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work_interrupt		smp_irq_work_interrupt
  * @paranoid == 2 is special: the stub will never switch stacks.  This is for
  * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
  */
-.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
+.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 create_gap=0
 ENTRY(\sym)
 	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
 
@@ -899,6 +899,16 @@ ENTRY(\sym)
 	jnz	.Lfrom_usermode_switch_stack_\@
 	.endif
 
+	.if \create_gap == 1
+	testb	$3, CS-ORIG_RAX(%rsp)
+	jnz	.Lfrom_usermode_no_gap_\@
+	.rept 6
+	pushq	5*8(%rsp)
+	.endr
+	UNWIND_HINT_IRET_REGS offset=8
+.Lfrom_usermode_no_gap_\@:
+	.endif
+
 	.if \paranoid
 	call	paranoid_entry
 	.else
@@ -1130,7 +1140,7 @@ apicinterrupt3 HYPERV_STIMER0_VECTOR \
 #endif /* CONFIG_HYPERV */
 
 idtentry debug			do_debug		has_error_code=0	paranoid=1 shift_ist=DEBUG_STACK
-idtentry int3			do_int3			has_error_code=0
+idtentry int3			do_int3			has_error_code=0	create_gap=1
 idtentry stack_segment		do_stack_segment	has_error_code=1
 
 #ifdef CONFIG_XEN_PV
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index e85ff65c43c3..ba275b6292db 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -39,4 +39,24 @@ extern int poke_int3_handler(struct pt_regs *regs);
 extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
 extern int after_bootmem;
 
+static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val)
+{
+	regs->sp -= sizeof(unsigned long);
+	*(unsigned long *)regs->sp = val;
+}
+
+static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip)
+{
+	regs->ip = ip;
+}
+
+#define INT3_INSN_SIZE 1
+#define CALL_INSN_SIZE 5
+
+static inline void int3_emulate_call(struct pt_regs *regs, unsigned long func)
+{
+	int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE);
+	int3_emulate_jmp(regs, func);
+}
+
 #endif /* _ASM_X86_TEXT_PATCHING_H */
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index ef49517f6bb2..fd152f5a937b 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -29,6 +29,7 @@
 #include <asm/kprobes.h>
 #include <asm/ftrace.h>
 #include <asm/nops.h>
+#include <asm/text-patching.h>
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 
@@ -231,6 +232,7 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
 }
 
 static unsigned long ftrace_update_func;
+static unsigned long ftrace_update_func_call;
 
 static int update_ftrace_func(unsigned long ip, void *new)
 {
@@ -259,6 +261,8 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
 	unsigned char *new;
 	int ret;
 
+	ftrace_update_func_call = (unsigned long)func;
+
 	new = ftrace_call_replace(ip, (unsigned long)func);
 	ret = update_ftrace_func(ip, new);
 
@@ -294,13 +298,21 @@ int ftrace_int3_handler(struct pt_regs *regs)
 	if (WARN_ON_ONCE(!regs))
 		return 0;
 
-	ip = regs->ip - 1;
-	if (!ftrace_location(ip) && !is_ftrace_caller(ip))
-		return 0;
+	ip = regs->ip - INT3_INSN_SIZE;
 
-	regs->ip += MCOUNT_INSN_SIZE - 1;
+	if (ftrace_location(ip)) {
+		int3_emulate_call(regs, (unsigned long)ftrace_regs_caller);
+		return 1;
+	} else if (is_ftrace_caller(ip)) {
+		if (!ftrace_update_func_call) {
+			int3_emulate_jmp(regs, ip + CALL_INSN_SIZE);
+			return 1;
+		}
+		int3_emulate_call(regs, ftrace_update_func_call);
+		return 1;
+	}
 
-	return 1;
+	return 0;
 }
 NOKPROBE_SYMBOL(ftrace_int3_handler);
 
@@ -859,6 +871,8 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
 
 	func = ftrace_ops_get_func(ops);
 
+	ftrace_update_func_call = (unsigned long)func;
+
 	/* Do a safe modify in case the trampoline is executing */
 	new = ftrace_call_replace(ip, (unsigned long)func);
 	ret = update_ftrace_func(ip, new);
@@ -960,6 +974,7 @@ static int ftrace_mod_jmp(unsigned long ip, void *func)
 {
 	unsigned char *new;
 
+	ftrace_update_func_call = 0UL;
 	new = ftrace_jmp_replace(ip, (unsigned long)func);
 
 	return update_ftrace_func(ip, new);


Attachment: config.xz
Description: application/xz


[Index of Archives]     [Linux Wireless]     [Linux Kernel]     [ATH6KL]     [Linux Bluetooth]     [Linux Netdev]     [Kernel Newbies]     [Share Photos]     [IDE]     [Security]     [Git]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux ATA RAID]     [Samba]     [Device Mapper]

  Powered by Linux