[PATCH 1/3] Fix csum_partial_copy_from_user (take 2)

I found that the asm version of csum_partial_copy_from_user() introduced
in commit e9e016815f264227b6260f77ca84f1c43cf8b9bd performs worse than it
could.

In the csum_partial_copy_from_user() case, the "both_aligned" 8-word
copy/sum loop is skipped so that a LOAD failure can be handled properly,
and the 4-word copy/sum block is not a loop, so we end up iterating in
the inefficient "less_than_4units" block instead.

This patch rearranges register usage so that t0-t7 can be used in the
"both_aligned" loop, which lets the copy_from_user case use that loop
too.  It also cleans up the code around the entry points.
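
The key to freeing registers is the new loop shape: len is biased
downward before the loop so the end-of-loop test is a plain sign check
(bgez) instead of a compare against a live "rem" register, letting odd
and errptr move to t8/t9 and leaving t0-t7 for the loads.  A rough C
analogue (assuming len >= 8*NBYTES on entry, which the beqz guard
ensures):

	len -= 8 * NBYTES;	/* bias: loop test becomes "len >= 0" */
	do {
		/* load, checksum and store UNIT(0)..UNIT(7),
		 * then advance src and dst by 8*NBYTES */
		len -= 8 * NBYTES;
	} while (len >= 0);
	len += 8 * NBYTES;	/* undo bias: 0 <= len < 8*NBYTES remain */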

Signed-off-by: Atsushi Nemoto <anemo@xxxxxxxxxxxxx>
---
 arch/mips/lib/csum_partial.S |   74 ++++++++++++++++---------------------------
 include/asm-mips/checksum.h  |   12 +++---
 2 files changed, 35 insertions(+), 51 deletions(-)

diff --git a/arch/mips/lib/csum_partial.S b/arch/mips/lib/csum_partial.S
index ec0744d..26ac0f8 100644
--- a/arch/mips/lib/csum_partial.S
+++ b/arch/mips/lib/csum_partial.S
@@ -291,8 +291,8 @@ #define dst a1
 #define len a2
 #define psum a3
 #define sum v0
-#define odd t5
-#define errptr t6
+#define odd t8
+#define errptr t9
 
 /*
  * The exception handler for loads requires that:
@@ -376,30 +376,20 @@ #define ADDRMASK (NBYTES-1)
 
 	.set	noat
 
-LEAF(csum_partial_copy_nocheck)
-	move	AT, zero
-	b	__csum_partial_copy
-	 move	errptr, zero
-FEXPORT(__csum_partial_copy_from_user)
-	b	__csum_partial_copy_user
-	 PTR_ADDU	AT, src, len	/* See (1) above. */
-FEXPORT(__csum_and_copy_to_user)
-	move	AT, zero
-__csum_partial_copy_user:
+LEAF(__csum_partial_copy_user)
+	PTR_ADDU	AT, src, len	/* See (1) above. */
 #ifdef CONFIG_64BIT
 	move	errptr, a4
 #else
 	lw	errptr, 16(sp)
 #endif
-__csum_partial_copy:
+FEXPORT(csum_partial_copy_nocheck)
 	move	sum, zero
 	move	odd, zero
 	/*
 	 * Note: dst & src may be unaligned, len may be 0
 	 * Temps
 	 */
-#define rem t8
-
 	/*
 	 * The "issue break"s below are very approximate.
 	 * Issue delays for dcache fills will perturb the schedule, as will
@@ -422,51 +412,45 @@ #define rem t8
 both_aligned:
 	 SRL	t0, len, LOG_NBYTES+3    # +3 for 8 units/iter
 	beqz	t0, cleanup_both_aligned # len < 8*NBYTES
-	 and	rem, len, (8*NBYTES-1)	 # rem = len % (8*NBYTES)
-	/*
-	 * We can not do this loop if LOAD might fail, otherwize
-	 * l_exc_copy can not calclate sum correctly.
-	 * AT==0 means LOAD should not fail.
-	 */
-	bnez	AT, cleanup_both_aligned
 	 nop
+	SUB	len, 8*NBYTES		# subtract here for bgez loop
 	.align	4
 1:
-	LOAD	t0, UNIT(0)(src)
-	LOAD	t1, UNIT(1)(src)
-	LOAD	t2, UNIT(2)(src)
-	LOAD	t3, UNIT(3)(src)
+EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
+EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
+EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
+EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
+EXC(	LOAD	t4, UNIT(4)(src),	l_exc_copy)
+EXC(	LOAD	t5, UNIT(5)(src),	l_exc_copy)
+EXC(	LOAD	t6, UNIT(6)(src),	l_exc_copy)
+EXC(	LOAD	t7, UNIT(7)(src),	l_exc_copy)
 	SUB	len, len, 8*NBYTES
-	LOAD	t4, UNIT(4)(src)
-	LOAD	t7, UNIT(5)(src)
+	ADD	src, src, 8*NBYTES
 EXC(	STORE	t0, UNIT(0)(dst),	s_exc)
 	ADDC(sum, t0)
 EXC(	STORE	t1, UNIT(1)(dst),	s_exc)
 	ADDC(sum, t1)
-	LOAD	t0, UNIT(6)(src)
-	LOAD	t1, UNIT(7)(src)
-	ADD	src, src, 8*NBYTES
-	ADD	dst, dst, 8*NBYTES
-EXC(	STORE	t2, UNIT(-6)(dst),	s_exc)
+EXC(	STORE	t2, UNIT(2)(dst),	s_exc)
 	ADDC(sum, t2)
-EXC(	STORE	t3, UNIT(-5)(dst),	s_exc)
+EXC(	STORE	t3, UNIT(3)(dst),	s_exc)
 	ADDC(sum, t3)
-EXC(	STORE	t4, UNIT(-4)(dst),	s_exc)
+EXC(	STORE	t4, UNIT(4)(dst),	s_exc)
 	ADDC(sum, t4)
-EXC(	STORE	t7, UNIT(-3)(dst),	s_exc)
+EXC(	STORE	t5, UNIT(5)(dst),	s_exc)
+	ADDC(sum, t5)
+EXC(	STORE	t6, UNIT(6)(dst),	s_exc)
+	ADDC(sum, t6)
+EXC(	STORE	t7, UNIT(7)(dst),	s_exc)
 	ADDC(sum, t7)
-EXC(	STORE	t0, UNIT(-2)(dst),	s_exc)
-	ADDC(sum, t0)
-EXC(	STORE	t1, UNIT(-1)(dst),	s_exc)
-	.set reorder
-	ADDC(sum, t1)
-	bne	len, rem, 1b
-	.set noreorder
+	bgez	len, 1b
+	 ADD	dst, dst, 8*NBYTES
+	ADD	len, 8*NBYTES		# revert len (see above)
 
 	/*
-	 * len == rem == the number of bytes left to copy < 8*NBYTES
+	 * len == the number of bytes left to copy < 8*NBYTES
 	 */
 cleanup_both_aligned:
+#define rem t7
 	beqz	len, done
 	 sltu	t0, len, 4*NBYTES
 	bnez	t0, less_than_4units
@@ -729,4 +713,4 @@ s_exc:
 	li	v1, -EFAULT
 	jr	ra
 	 sw	v1, (errptr)
-	END(csum_partial_copy_nocheck)
+	END(__csum_partial_copy_user)
diff --git a/include/asm-mips/checksum.h b/include/asm-mips/checksum.h
index 6596fe6..84b0ace 100644
--- a/include/asm-mips/checksum.h
+++ b/include/asm-mips/checksum.h
@@ -29,10 +29,8 @@ #include <asm/uaccess.h>
  */
 __wsum csum_partial(const void *buff, int len, __wsum sum);
 
-__wsum __csum_partial_copy_from_user(const void __user *src, void *dst,
-				     int len, __wsum sum, int *err_ptr);
-__wsum __csum_and_copy_to_user(const void *src, void __user *dst,
-			       int len, __wsum sum, int *err_ptr);
+__wsum __csum_partial_copy_user(const void __user *src, void __user *dst,
+				int len, __wsum sum, int *err_ptr);
 
 /*
  * this is a new version of the above that records errors it finds in *errp,
@@ -43,7 +41,8 @@ __wsum csum_partial_copy_from_user(const
 				   __wsum sum, int *err_ptr)
 {
 	might_sleep();
-	return __csum_partial_copy_from_user(src, dst, len, sum, err_ptr);
+	return __csum_partial_copy_user(src, (void __user *)dst,
+					len, sum, err_ptr);
 }
 
 /*
@@ -56,7 +55,8 @@ __wsum csum_and_copy_to_user(const void 
 {
 	might_sleep();
 	if (access_ok(VERIFY_WRITE, dst, len))
-		return __csum_and_copy_to_user(src, dst, len, sum, err_ptr);
+		return __csum_partial_copy_user((const void __user *)src, dst,
+						len, sum, err_ptr);
 	if (len)
 		*err_ptr = -EFAULT;
 
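Usage note: the error protocol is unchanged.  On a faulting user access
the asm stores -EFAULT through errptr (see the s_exc/l_exc handlers),
and the returned checksum is then meaningless.  A hypothetical caller
(names invented for illustration):

	int err = 0;
	__wsum csum;

	csum = csum_partial_copy_from_user(usrc, kbuf, len, 0, &err);
	if (err)	/* -EFAULT was stored via err_ptr */
		return err;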

