I found that the asm version of csum_partial_copy_from_user() introduced in e9e016815f264227b6260f77ca84f1c43cf8b9bd was less efficient. In the csum_partial_copy_from_user() case, the "both_aligned" 8-word copy/sum loop block is skipped in order to handle LOAD failures properly, and the 4-word copy/sum block is not a loop, so we end up looping in the inefficient "less_than_4units" block. This patch rearranges register usage so that t0-t7 can be used in the "both_aligned" loop, which allows the "both_aligned" loop to be used for the copy_from_user case too. This patch also cleans up the code around the entry point. Signed-off-by: Atsushi Nemoto <anemo@xxxxxxxxxxxxx> --- arch/mips/lib/csum_partial.S | 74 ++++++++++++++++--------------------------- include/asm-mips/checksum.h | 12 +++--- 2 files changed, 35 insertions(+), 51 deletions(-) diff --git a/arch/mips/lib/csum_partial.S b/arch/mips/lib/csum_partial.S index ec0744d..26ac0f8 100644 --- a/arch/mips/lib/csum_partial.S +++ b/arch/mips/lib/csum_partial.S @@ -291,8 +291,8 @@ #define dst a1 #define len a2 #define psum a3 #define sum v0 -#define odd t5 -#define errptr t6 +#define odd t8 +#define errptr t9 /* * The exception handler for loads requires that: @@ -376,30 +376,20 @@ #define ADDRMASK (NBYTES-1) .set noat -LEAF(csum_partial_copy_nocheck) - move AT, zero - b __csum_partial_copy - move errptr, zero -FEXPORT(__csum_partial_copy_from_user) - b __csum_partial_copy_user - PTR_ADDU AT, src, len /* See (1) above. */ -FEXPORT(__csum_and_copy_to_user) - move AT, zero -__csum_partial_copy_user: +LEAF(__csum_partial_copy_user) + PTR_ADDU AT, src, len /* See (1) above. */ #ifdef CONFIG_64BIT move errptr, a4 #else lw errptr, 16(sp) #endif -__csum_partial_copy: +FEXPORT(csum_partial_copy_nocheck) move sum, zero move odd, zero /* * Note: dst & src may be unaligned, len may be 0 * Temps */ -#define rem t8 - /* * The "issue break"s below are very approximate. 
* Issue delays for dcache fills will perturb the schedule, as will @@ -422,51 +412,45 @@ #define rem t8 both_aligned: SRL t0, len, LOG_NBYTES+3 # +3 for 8 units/iter beqz t0, cleanup_both_aligned # len < 8*NBYTES - and rem, len, (8*NBYTES-1) # rem = len % (8*NBYTES) - /* - * We can not do this loop if LOAD might fail, otherwize - * l_exc_copy can not calclate sum correctly. - * AT==0 means LOAD should not fail. - */ - bnez AT, cleanup_both_aligned nop + SUB len, 8*NBYTES # subtract here for bgez loop .align 4 1: - LOAD t0, UNIT(0)(src) - LOAD t1, UNIT(1)(src) - LOAD t2, UNIT(2)(src) - LOAD t3, UNIT(3)(src) +EXC( LOAD t0, UNIT(0)(src), l_exc) +EXC( LOAD t1, UNIT(1)(src), l_exc_copy) +EXC( LOAD t2, UNIT(2)(src), l_exc_copy) +EXC( LOAD t3, UNIT(3)(src), l_exc_copy) +EXC( LOAD t4, UNIT(4)(src), l_exc_copy) +EXC( LOAD t5, UNIT(5)(src), l_exc_copy) +EXC( LOAD t6, UNIT(6)(src), l_exc_copy) +EXC( LOAD t7, UNIT(7)(src), l_exc_copy) SUB len, len, 8*NBYTES - LOAD t4, UNIT(4)(src) - LOAD t7, UNIT(5)(src) + ADD src, src, 8*NBYTES EXC( STORE t0, UNIT(0)(dst), s_exc) ADDC(sum, t0) EXC( STORE t1, UNIT(1)(dst), s_exc) ADDC(sum, t1) - LOAD t0, UNIT(6)(src) - LOAD t1, UNIT(7)(src) - ADD src, src, 8*NBYTES - ADD dst, dst, 8*NBYTES -EXC( STORE t2, UNIT(-6)(dst), s_exc) +EXC( STORE t2, UNIT(2)(dst), s_exc) ADDC(sum, t2) -EXC( STORE t3, UNIT(-5)(dst), s_exc) +EXC( STORE t3, UNIT(3)(dst), s_exc) ADDC(sum, t3) -EXC( STORE t4, UNIT(-4)(dst), s_exc) +EXC( STORE t4, UNIT(4)(dst), s_exc) ADDC(sum, t4) -EXC( STORE t7, UNIT(-3)(dst), s_exc) +EXC( STORE t5, UNIT(5)(dst), s_exc) + ADDC(sum, t5) +EXC( STORE t6, UNIT(6)(dst), s_exc) + ADDC(sum, t6) +EXC( STORE t7, UNIT(7)(dst), s_exc) ADDC(sum, t7) -EXC( STORE t0, UNIT(-2)(dst), s_exc) - ADDC(sum, t0) -EXC( STORE t1, UNIT(-1)(dst), s_exc) - .set reorder - ADDC(sum, t1) - bne len, rem, 1b - .set noreorder + bgez len, 1b + ADD dst, dst, 8*NBYTES + ADD len, 8*NBYTES # revert len (see above) /* - * len == rem == the number of bytes left to copy < 
8*NBYTES + * len == the number of bytes left to copy < 8*NBYTES */ cleanup_both_aligned: +#define rem t7 beqz len, done sltu t0, len, 4*NBYTES bnez t0, less_than_4units @@ -729,4 +713,4 @@ s_exc: li v1, -EFAULT jr ra sw v1, (errptr) - END(csum_partial_copy_nocheck) + END(__csum_partial_copy_user) diff --git a/include/asm-mips/checksum.h b/include/asm-mips/checksum.h index 6596fe6..84b0ace 100644 --- a/include/asm-mips/checksum.h +++ b/include/asm-mips/checksum.h @@ -29,10 +29,8 @@ #include <asm/uaccess.h> */ __wsum csum_partial(const void *buff, int len, __wsum sum); -__wsum __csum_partial_copy_from_user(const void __user *src, void *dst, - int len, __wsum sum, int *err_ptr); -__wsum __csum_and_copy_to_user(const void *src, void __user *dst, - int len, __wsum sum, int *err_ptr); +__wsum __csum_partial_copy_user(const void __user *src, void __user *dst, + int len, __wsum sum, int *err_ptr); /* * this is a new version of the above that records errors it finds in *errp, @@ -43,7 +41,8 @@ __wsum csum_partial_copy_from_user(const __wsum sum, int *err_ptr) { might_sleep(); - return __csum_partial_copy_from_user(src, dst, len, sum, err_ptr); + return __csum_partial_copy_user(src, (void __user *)dst, + len, sum, err_ptr); } /* @@ -56,7 +55,8 @@ __wsum csum_and_copy_to_user(const void { might_sleep(); if (access_ok(VERIFY_WRITE, dst, len)) - return __csum_and_copy_to_user(src, dst, len, sum, err_ptr); + return __csum_partial_copy_user((const void __user *)src, dst, + len, sum, err_ptr); if (len) *err_ptr = -EFAULT;