On 01/21/2014 08:18 AM, Steven J. Hill wrote:
> From: Leonid Yegoshin <Leonid.Yegoshin@xxxxxxxxxx>
>
> Use the PREF instruction to optimize partial checksum operations.
>
> Signed-off-by: Leonid Yegoshin <Leonid.Yegoshin@xxxxxxxxxx>
> Signed-off-by: Steven J. Hill <Steven.Hill@xxxxxxxxxx>
NACK. The proper prefetch latency and cache-line stride vary by CPU; you cannot just hard-code them for a 32-byte cache-line size with some arbitrary latency.
This will make some CPUs slower.
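To make the objection concrete, here is a minimal userspace C sketch (not the kernel routine; csum_copy, dcache_line_size and prefetch_distance are illustrative names) of a copy-and-checksum loop that takes both the prefetch stride and the look-ahead distance from values probed at runtime instead of baking in 32 bytes and a fixed offset:

#include <stddef.h>
#include <stdint.h>

/* Assumed to be filled in from CPU probing (e.g. from the cache fields
 * of Config1 on MIPS); hard-coding these is exactly the problem with
 * the patch above. */
static size_t dcache_line_size = 32;   /* varies: 16, 32, 64, 128, ... */
static size_t prefetch_distance = 3;   /* lines ahead; depends on memory latency */

/* Copy 'words' 32-bit words from src to dst and return a 32-bit partial
 * sum with the carries folded back in, roughly what
 * csum_partial_copy_nocheck produces.  Assumes word-aligned buffers and
 * a power-of-two line size. */
static uint32_t csum_copy(uint32_t *dst, const uint32_t *src, size_t words)
{
        uint64_t sum = 0;
        size_t ahead = dcache_line_size * prefetch_distance;

        for (size_t i = 0; i < words; i++) {
                /* Issue one prefetch per cache line, 'ahead' bytes in
                 * front of the working point. */
                if (((uintptr_t)(src + i) & (dcache_line_size - 1)) == 0) {
                        __builtin_prefetch((const char *)(src + i) + ahead, 0);
                        __builtin_prefetch((char *)(dst + i) + ahead, 1);
                }
                sum += src[i];
                dst[i] = src[i];
        }
        /* Fold the accumulated carries down to 32 bits. */
        sum = (sum & 0xffffffff) + (sum >> 32);
        sum = (sum & 0xffffffff) + (sum >> 32);
        return (uint32_t)sum;
}

With a 128-byte line the hard-coded 32-byte stride issues four prefetches per line, and a look-ahead chosen for the wrong memory latency either arrives too late to hide the miss or pushes useful data out of the cache, so both values have to come from the CPU, not from the source file.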
> ---
>  arch/mips/lib/csum_partial.S | 12 ++++++++++++
>  1 file changed, 12 insertions(+)
>
> diff --git a/arch/mips/lib/csum_partial.S b/arch/mips/lib/csum_partial.S
> index a6adffb..272820e 100644
> --- a/arch/mips/lib/csum_partial.S
> +++ b/arch/mips/lib/csum_partial.S
> @@ -417,13 +417,19 @@ FEXPORT(csum_partial_copy_nocheck)
>        *
>        * If len < NBYTES use byte operations.
>        */
> +      PREF( 0, 0(src))
> +      PREF( 1, 0(dst))
>       sltu    t2, len, NBYTES
>       and     t1, dst, ADDRMASK
>       bnez    t2, .Lcopy_bytes_checklen
> +      PREF( 0, 32(src))
> +      PREF( 1, 32(dst))
>       and     t0, src, ADDRMASK
>       andi    odd, dst, 0x1                   /* odd buffer? */
>       bnez    t1, .Ldst_unaligned
>        nop
> +      PREF( 0, 2*32(src))
> +      PREF( 1, 2*32(dst))
>       bnez    t0, .Lsrc_unaligned_dst_aligned
>       /*
>        * use delay slot for fall-through
> @@ -434,6 +440,8 @@ FEXPORT(csum_partial_copy_nocheck)
>       beqz    t0, .Lcleanup_both_aligned # len < 8*NBYTES
>        nop
>       SUB     len, 8*NBYTES           # subtract here for bgez loop
> +      PREF( 0, 3*32(src))
> +      PREF( 1, 3*32(dst))
>       .align  4
>  1:
>  EXC(  LOAD    t0, UNIT(0)(src), .Ll_exc)
> @@ -464,6 +472,8 @@ EXC(  STORE   t7, UNIT(7)(dst), .Ls_exc)
>       ADDC(sum, t7)
>       .set    reorder                         /* DADDI_WAR */
>       ADD     dst, dst, 8*NBYTES
> +      PREF( 0, 8*32(src))
> +      PREF( 1, 8*32(dst))
>       bgez    len, 1b
>       .set    noreorder
>       ADD     len, 8*NBYTES           # revert len (see above)
> @@ -569,8 +579,10 @@ EXC(  STFIRST t3, FIRST(0)(dst), .Ls_exc)
>
>  .Lsrc_unaligned_dst_aligned:
>       SRL     t0, len, LOG_NBYTES+2   # +2 for 4 units/iter
> +      PREF( 0, 3*32(src))
>       beqz    t0, .Lcleanup_src_unaligned
>        and    rem, len, (4*NBYTES-1)  # rem = len % 4*NBYTES
> +      PREF( 1, 3*32(dst))
>  1:
>  /*
>   * Avoid consecutive LD*'s to the same register since some mips