[PATCH] Improve o32 syscall handling

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hello All,

this is a major cleanup for the o32 syscall handling.
For the 32bit kernel, it
 - uses a more efficient syscall table layout, and reduces its size
 - handles stack arguments also more efficiently, and allows for up
   to 8 arguments. This gives an indirect fadvise64_64 syscall a
   chance to work.
 - Fixes several flaws in the indirect syscall path, like duplicated
   user stack handling, and incomplete argument handling.

For the 64bit Kernel, it
 - checks for unaligned user stack
 - also allows now up to 8 arguments
 - removes unused stackhandling cruft from the indirect syscall path
   and does complete argument handling there.


Thiemo


Index: arch/mips/kernel/scall32-o32.S
===================================================================
RCS file: /home/cvs/linux/arch/mips/kernel/scall32-o32.S,v
retrieving revision 1.15
diff -u -p -r1.15 scall32-o32.S
--- arch/mips/kernel/scall32-o32.S	15 Nov 2004 11:49:19 -0000	1.15
+++ arch/mips/kernel/scall32-o32.S	20 Nov 2004 16:46:39 -0000
@@ -5,6 +5,7 @@
  *
  * Copyright (C) 1995, 96, 97, 98, 99, 2000, 01, 02 by Ralf Baechle
  * Copyright (C) 2001 MIPS Technologies, Inc.
+ * Copyright (C) 2004 Thiemo Seufer
  */
 #include <linux/config.h>
 #include <linux/errno.h>
@@ -32,26 +33,30 @@ NESTED(handle_sys, PT_SIZE, sp)
 
 	lw	t1, PT_EPC(sp)		# skip syscall on return
 
+#if defined(CONFIG_BINFMT_IRIX)
 	sltiu	t0, v0, MAX_SYSCALL_NO + 1 # check syscall number
+#else
+	subu	v0, v0, __NR_O32_Linux	# check syscall number
+	sltiu	t0, v0, __NR_O32_Linux_syscalls + 1
+#endif
 	addiu	t1, 4			# skip to next instruction
 	sw	t1, PT_EPC(sp)
 	beqz	t0, illegal_syscall
 
-	/* XXX Put both in one cacheline, should save a bit. */
-	sll	t0, v0, 2
-	lw	t2, sys_call_table(t0)	# syscall routine
-	lbu	t3, sys_narg_table(v0)	# number of arguments
-	beqz	t2, illegal_syscall;
+	sll	t0, v0, 3
+	la	t1, sys_call_table
+	addu	t1, t0
+	lw	t2, (t1)		# syscall routine
+	lw	t3, 4(t1)		# >= 0 if we need stack arguments
+	beqz	t2, illegal_syscall
 
-	subu	t0, t3, 5		# 5 or more arguments?
 	sw	a3, PT_R26(sp)		# save a3 for syscall restarting
-	bgez	t0, stackargs
+	bgez	t3, stackargs
 
 stack_done:
-	sw	a3, PT_R26(sp)          # save for syscall restart
-	LONG_L	t0, TI_FLAGS($28)	# syscall tracing enabled?
+	lw	t0, TI_FLAGS($28)	# syscall tracing enabled?
 	li	t1, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT
-	and	t0, t1, t0
+	and	t0, t1
 	bnez	t0, syscall_trace_entry	# -> yes
 
 	jalr	t2			# Do The Real Thing (TM)
@@ -70,9 +75,9 @@ o32_syscall_exit:
 	local_irq_disable		# make sure need_resched and
 					# signals dont change between
 					# sampling and return
-	LONG_L	a2, TI_FLAGS($28)	# current->work
+	lw	a2, TI_FLAGS($28)	# current->work
 	li	t0, _TIF_ALLWORK_MASK
-	and	t0, a2, t0
+	and	t0, a2
 	bnez	t0, o32_syscall_exit_work
 
 	j	restore_partial
@@ -117,49 +122,50 @@ syscall_trace_entry:
 	 */
 stackargs:
 	lw	t0, PT_R29(sp)		# get old user stack pointer
-	subu	t3, 4
-	sll	t1, t3, 2		# stack valid?
-
-	addu	t1, t0			# end address
-	or	t0, t1
-	bltz	t0, bad_stack		# -> sp is bad
-
-	lw	t0, PT_R29(sp)		# get old user stack pointer
-	PTR_LA	t1, 4f			# copy 1 to 3 arguments
-	sll	t3, t3, 4
-	subu	t1, t3
-	jr	t1
 
-	/* Ok, copy the args from the luser stack to the kernel stack */
 	/*
-	 * I know Ralf doesn't like nops but this avoids code
-	 * duplication for R3000 targets (and this is the
-	 * only place where ".set reorder" doesn't help).
-	 * Harald.
+	 * We intentionally keep the kernel stack a little below the top of
+	 * userspace so we don't have to do a slower byte accurate check here.
 	 */
+	andi	t1, t0, 7
+	lw	t5, TI_ADDR_LIMIT($28)
+	bnez	t1, bad_stack
+	addu	t4, t0, 32
+	and	t5, t4
+	bltz	t5, bad_stack		# -> sp is bad
+
+	/* Ok, copy the args from the luser stack to the kernel stack.
+	 * t3 is the precomputed number of instruction bytes needed to
+	 * load or store arguments 6-8.
+	 */
+
+	la	t1, 5f			# load up to 3 arguments
+	subu	t1, t3
+1:	lw	t5, 16(t0)		# argument #5 from usp
 	.set    push
 	.set    noreorder
 	.set	nomacro
-1:	lw	t1, 24(t0)		# argument #7 from usp
-	nop
-	sw	t1, 24(sp)
-	nop
-2:	lw	t1, 20(t0)		# argument #5 from usp
-	nop
-	sw	t1, 20(sp)
-	nop
-3:	lw	t1, 16(t0)		# argument #5 from usp
-	nop
-	sw	t1, 16(sp)
-	nop
-4:	.set	pop
+	jr	t1
+	 addiu	t1, 6f - 5f
 
-	j	stack_done		# go back
+2:	lw	t8, 28(t0)		# argument #8 from usp
+3:	lw	t7, 24(t0)		# argument #7 from usp
+4:	lw	t6, 20(t0)		# argument #6 from usp
+5:	jr	t1
+	 sw	t5, 16(sp)		# argument #5 to ksp
+
+	sw	t8, 28(sp)		# argument #8 to ksp
+	sw	t7, 24(sp)		# argument #7 to ksp
+	sw	t6, 20(sp)		# argument #6 to ksp
+6:	j	stack_done		# go back
+	 nop
+	.set	pop
 
 	.section __ex_table,"a"
 	PTR	1b,bad_stack
 	PTR	2b,bad_stack
 	PTR	3b,bad_stack
+	PTR	4b,bad_stack
 	.previous
 
 	/*
@@ -239,12 +245,12 @@ illegal_syscall:
 	sw	v0, PT_R2(sp)		# result
 
 	/* Success, so skip usual error handling garbage.  */
-	LONG_L	a2, TI_FLAGS($28)	# syscall tracing enabled?
+	lw	a2, TI_FLAGS($28)	# syscall tracing enabled?
 	li	t0, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT
 	and	t0, a2, t0
 	bnez	t0, 1f
 
-	b	o32_syscall_exit
+	j	o32_syscall_exit
 
 1:	SAVE_STATIC
 	move	a0, sp
@@ -270,67 +276,47 @@ bad_alignment:
 	END(sys_sysmips)
 
 	LEAF(sys_syscall)
-	lw	t0, PT_R29(sp)			# user sp
-
-	sltu	v0, a0, __NR_O32_Linux + __NR_O32_Linux_syscalls + 1
+#if defined(CONFIG_BINFMT_IRIX)
+	sltiu	v0, a0, MAX_SYSCALL_NO + 1 # check syscall number
+#else
+	subu	v0, a0, __NR_O32_Linux	# check syscall number
+	sltiu	v0, v0, __NR_O32_Linux_syscalls + 1
+#endif
 	beqz	v0, enosys
 
-	sll	v0, a0, 2
-	la	v1, sys_syscall
-	lw	t2, sys_call_table(v0)		# function pointer
-	lbu	t4, sys_narg_table(a0)		# number of arguments
-
-	li	v0, -EINVAL
-	beq	t2, v1, out			# do not recurse
+	sll	t0, v0, 3
+	lw	t2, sys_call_table(t0)		# syscall routine
 
+	li	v1, 4000			# nr of sys_syscall
 	beqz	t2, enosys			# null function pointer?
 
-	andi	v0, t0, 0x3			# unaligned stack pointer?
-	bnez	v0, sigsegv
+	li	v0, -EINVAL
+	beq	a0, v1, out			# do not recurse
 
-	addu	v0, t0, 16			# v0 = usp + 16
-	addu	t1, v0, 12			# 3 32-bit arguments
-	lw	v1, TI_ADDR_LIMIT($28)
-	or	v0, v0, t1
-	and	v1, v1, v0
-	bltz	v1, efault
+	/* Some syscalls like execve get their arguments from struct pt_regs
+	   and claim zero arguments in the syscall table. Thus we have to
+	   assume the worst case and shuffle around all potential arguments.
+	   If you want performance, don't use indirect syscalls. */
 
 	move	a0, a1				# shift argument registers
 	move	a1, a2
 	move	a2, a3
-
-1:	lw	a3, 16(t0)
-2:	lw	t3, 20(t0)
-3:	lw	t4, 24(t0)
-
-	.section	__ex_table, "a"
-	.word	1b, efault
-	.word	2b, efault
-	.word	3b, efault
-	.previous
-
-	sw	t3, 16(sp)			# put into new stackframe
-	sw	t4, 20(sp)
-
-	bnez	t4, 1f				# zero arguments?
-	addu	a0, sp, 32			# then pass sp in a0
-1:
-
-	sw	t3, 16(sp)
-	sw	v1, 20(sp)
+	lw	a3, 16(sp)
+	lw	t4, 20(sp)
+	lw	t5, 24(sp)
+	lw	t6, 28(sp)
+	sw	t4, 16(sp)
+	sw	t5, 20(sp)
+	sw	t6, 24(sp)
+	sw	a0, PT_R4(sp)			# .. and push back a0 - a3, some
+	sw	a1, PT_R5(sp)			# syscalls expect them there
+	sw	a2, PT_R6(sp)
+	sw	a3, PT_R7(sp)
+	sw	a3, PT_R26(sp)			# update a3 for syscall restarting
 	jr	t2
 	/* Unreached */
 
 enosys:	li	v0, -ENOSYS
-	b	out
-
-sigsegv:
-	li	a0, _SIGSEGV
-	move	a1, $28
-	jal	force_sig
-	/* Fall through */
-
-efault:	li	v0, -EFAULT
 
 out:	jr	ra
 	END(sys_syscall)
@@ -350,12 +336,14 @@ out:	jr	ra
 	.endm
 
 	.macro	syscalltable
+#if defined(CONFIG_BINFMT_IRIX)
 	mille	sys_ni_syscall		0	/*    0 -  999 SVR4 flavour */
-	#include "irix5sys.h"			/* 1000 - 1999 32-bit IRIX */
+# include "irix5sys.h"				/* 1000 - 1999 32-bit IRIX */
 	mille	sys_ni_syscall		0	/* 2000 - 2999 BSD43 flavour */
 	mille	sys_ni_syscall		0	/* 3000 - 3999 POSIX flavour */
+#endif
 
-	sys	sys_syscall		0	/* 4000 */
+	sys	sys_syscall		8	/* 4000 */
 	sys	sys_exit		1
 	sys	sys_fork		0
 	sys	sys_read		3
@@ -641,19 +629,16 @@ out:	jr	ra
 
 	.endm
 
+	/* We pre-compute the number of _instruction_ bytes needed to
+	   load or store the arguments 6-8. Negative values are ignored. */
+
 	.macro  sys function, nargs
 	PTR	\function
+	LONG	(\nargs << 2) - (5 << 2)
 	.endm
 
 	.align	3
+	.type	sys_call_table,@object
 sys_call_table:
 	syscalltable
 	.size	sys_call_table, . - sys_call_table
-
-	.macro	sys function, nargs
-	.byte	\nargs
-	.endm
-
-sys_narg_table:
-	syscalltable
-	.size	sys_narg_table, . - sys_narg_table
Index: arch/mips/kernel/scall64-o32.S
===================================================================
RCS file: /home/cvs/linux/arch/mips/kernel/scall64-o32.S,v
retrieving revision 1.22
diff -u -p -r1.22 scall64-o32.S
--- arch/mips/kernel/scall64-o32.S	15 Nov 2004 11:49:19 -0000	1.22
+++ arch/mips/kernel/scall64-o32.S	20 Nov 2004 16:46:39 -0000
@@ -6,6 +6,7 @@
  * Copyright (C) 1995 - 2000, 2001 by Ralf Baechle
  * Copyright (C) 1999, 2000 Silicon Graphics, Inc.
  * Copyright (C) 2001 MIPS Technologies, Inc.
+ * Copyright (C) 2004 Thiemo Seufer
  *
  * Hairy, the userspace application uses a different argument passing
  * convention than the kernel, so we have to translate things from o32
@@ -43,6 +44,8 @@ NESTED(handle_sys, PT_SIZE, sp)
  RESTORE_ALL
 #endif
 
+	/* We don't want to stumble over broken sign extensions from
+	   userland. O32 does never use the upper half. */
 	sll	a0, a0, 0
 	sll	a1, a1, 0
 	sll	a2, a2, 0
@@ -62,17 +65,21 @@ NESTED(handle_sys, PT_SIZE, sp)
 	 * userspace so we don't have to do a slower byte accurate check here.
 	 */
 	ld	t0, PT_R29(sp)		# get old user stack pointer
+	andi	t3, t0, 7
+	bnez	t3, bad_stack
 	daddu	t1, t0, 32
 	bltz	t1, bad_stack
 
 1:	lw	a4, 16(t0)		# argument #5 from usp
 2:	lw	a5, 20(t0)		# argument #6 from usp
 3:	lw	a6, 24(t0)		# argument #7 from usp
+4:	lw	a7, 28(t0)		# argument #8 from usp (for indirect syscalls)
 
 	.section __ex_table,"a"
 	PTR	1b, bad_stack
 	PTR	2b, bad_stack
 	PTR	3b, bad_stack
+	PTR	4b, bad_stack
 	.previous
 
 	li	t1, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT
@@ -91,7 +98,7 @@ NESTED(handle_sys, PT_SIZE, sp)
 	sd	v0, PT_R0(sp)		# flag for syscall restarting
 1:	sd	v0, PT_R2(sp)		# result
 
-FEXPORT(o32_syscall_exit)
+o32_syscall_exit:
 	local_irq_disable		# make need_resched and
 					# signals dont change between
 					# sampling and return
@@ -109,12 +116,11 @@ o32_syscall_exit_work:
 
 trace_a_syscall:
 	SAVE_STATIC
-	sd	a4, PT_R8(sp)
+	sd	t2, PT_R1(sp)
+	sd	a4, PT_R8(sp)		# Save argument registers
 	sd	a5, PT_R9(sp)
 	sd	a6, PT_R10(sp)
-	sd	a7, PT_R11(sp)
-
-	sd	t2,PT_R1(sp)
+	sd	a7, PT_R11(sp)		# For indirect syscalls
 	move	a0, sp
 	li	a1, 0
 	jal	do_syscall_trace
@@ -126,7 +132,8 @@ trace_a_syscall:
 	ld	a3, PT_R7(sp)
 	ld	a4, PT_R8(sp)
 	ld	a5, PT_R9(sp)
-	ld	a6, PT_R10(sp)		# For indirect syscalls
+	ld	a6, PT_R10(sp)
+	ld	a7, PT_R11(sp)		# For indirect syscalls
 	jalr	t2
 
 	li	t0, -EMAXERRNO - 1	# error?
@@ -174,55 +181,40 @@ illegal_syscall:
 	END(handle_sys)
 
 LEAF(sys32_syscall)
-	ld	t0, PT_R29(sp)		# user sp
-
 	sltu	v0, a0, __NR_O32_Linux + __NR_O32_Linux_syscalls + 1
 	beqz	v0, enosys
 
 	dsll	v0, a0, 3
-	dla	v1, sys32_syscall
 	ld	t2, (sys_call_table - (__NR_O32_Linux * 8))(v0)
 
+	li	v1, 4000		# indirect syscall number
 	li	v0, -EINVAL
-	beq	t2, v1, out		# do not recurse
+	beq	a0, v1, out		# do not recurse
 
 	beqz	t2, enosys		# null function pointer?
 
-	andi	v0, t0, 0x3		# unaligned stack pointer?
-	bnez	v0, sigsegv
-
-	daddiu	v0, t0, 16		# v0 = usp + 16
-	daddu	t1, v0, 12		# 3 32-bit arguments
-	ld	v1, TI_ADDR_LIMIT($28)
-	or	v0, v0, t1
-	and	v1, v1, v0
-	bnez	v1, efault
-
 	move	a0, a1			# shift argument registers
 	move	a1, a2
 	move	a2, a3
 	move	a3, a4
 	move	a4, a5
 	move	a5, a6
+	move	a6, a7
+	sd	a0, PT_R4(sp)		# ... and push back a0 - a3, some
+	sd	a1, PT_R5(sp)		# syscalls expect them there
+	sd	a2, PT_R6(sp)
+	sd	a3, PT_R7(sp)
+	sd	a3, PT_R26(sp)		# update a3 for syscall restarting
 	jr	t2
 	/* Unreached */
 
 enosys:	li	v0, -ENOSYS
-	b	out
-
-sigsegv:
-	li	a0, _SIGSEGV
-	move	a1, $28
-	jal	force_sig
-	/* Fall through */
-
-efault:	li	v0, -EFAULT
 
 out:	jr	ra
 	END(sys32_syscall)
 
 	.align	3
-	.type	sys_call_table,@object;
+	.type	sys_call_table,@object
 sys_call_table:
 	PTR	sys32_syscall			/* 4000 */
 	PTR	sys_exit	


[Index of Archives]     [Linux MIPS Home]     [LKML Archive]     [Linux ARM Kernel]     [Linux ARM]     [Linux]     [Git]     [Yosemite News]     [Linux SCSI]     [Linux Hams]

  Powered by Linux