Re: [PATCH] MIPS: lib: Optimize partial checksum ops using prefetching.

On 01/21/2014 08:18 AM, Steven J. Hill wrote:
From: Leonid Yegoshin <Leonid.Yegoshin@xxxxxxxxxx>

Use the PREF instruction to optimize partial checksum operations.

Signed-off-by: Leonid Yegoshin <Leonid.Yegoshin@xxxxxxxxxx>
Signed-off-by: Steven J. Hill <Steven.Hill@xxxxxxxxxx>

NACK. The proper latency and cache-line stride vary by CPU; you cannot just hard-code them for a 32-byte cache-line size with some arbitrary latency.

This will make some CPUs slower.
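
Not part of the patch, purely to illustrate the point: a rough user-space sketch of deriving the prefetch stride from the actual L1 D-cache line size instead of hard-coding 32 bytes. sysconf(_SC_LEVEL1_DCACHE_LINESIZE) is a glibc extension, and the byte-summing loop is only a stand-in for what the real assembly does:

#include <stddef.h>
#include <stdint.h>
#include <unistd.h>

/*
 * Toy stand-in for the checksum loop: the only point is that the
 * prefetch stride comes from the CPU at run time, not a hard-coded 32.
 */
uint32_t csum_sketch(const uint8_t *buf, size_t len)
{
	long linesz = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);	/* glibc extension */
	size_t stride = (linesz > 0) ? (size_t)linesz : 32;	/* fall back to 32 */
	uint32_t sum = 0;

	for (size_t i = 0; i < len; i++) {
		/* Prefetch a few lines ahead of the read stream. */
		if ((i % stride) == 0 && i + 3 * stride < len)
			__builtin_prefetch(buf + i + 3 * stride, 0 /* read */);
		sum += buf[i];
	}
	return sum;
}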

---
  arch/mips/lib/csum_partial.S | 12 ++++++++++++
  1 file changed, 12 insertions(+)

diff --git a/arch/mips/lib/csum_partial.S b/arch/mips/lib/csum_partial.S
index a6adffb..272820e 100644
--- a/arch/mips/lib/csum_partial.S
+++ b/arch/mips/lib/csum_partial.S
@@ -417,13 +417,19 @@ FEXPORT(csum_partial_copy_nocheck)
  	 *
  	 * If len < NBYTES use byte operations.
  	 */
+	PREF(	0, 0(src))
+	PREF(	1, 0(dst))
  	sltu	t2, len, NBYTES
  	and	t1, dst, ADDRMASK
  	bnez	t2, .Lcopy_bytes_checklen
+	PREF(	0, 32(src))
+	PREF(	1, 32(dst))
  	 and	t0, src, ADDRMASK
  	andi	odd, dst, 0x1			/* odd buffer? */
  	bnez	t1, .Ldst_unaligned
  	 nop
+	PREF(	0, 2*32(src))
+	PREF(	1, 2*32(dst))
  	bnez	t0, .Lsrc_unaligned_dst_aligned
  	/*
  	 * use delay slot for fall-through
@@ -434,6 +440,8 @@ FEXPORT(csum_partial_copy_nocheck)
  	beqz	t0, .Lcleanup_both_aligned # len < 8*NBYTES
  	 nop
  	SUB	len, 8*NBYTES		# subtract here for bgez loop
+	PREF(	0, 3*32(src))
+	PREF(	1, 3*32(dst))
  	.align	4
  1:
  EXC(	LOAD	t0, UNIT(0)(src),	.Ll_exc)
@@ -464,6 +472,8 @@ EXC(	STORE	t7, UNIT(7)(dst),	.Ls_exc)
  	ADDC(sum, t7)
  	.set	reorder				/* DADDI_WAR */
  	ADD	dst, dst, 8*NBYTES
+	PREF(	0, 8*32(src))
+	PREF(	1, 8*32(dst))
  	bgez	len, 1b
  	.set	noreorder
  	ADD	len, 8*NBYTES		# revert len (see above)
@@ -569,8 +579,10 @@ EXC(	STFIRST t3, FIRST(0)(dst),	.Ls_exc)

  .Lsrc_unaligned_dst_aligned:
  	SRL	t0, len, LOG_NBYTES+2	 # +2 for 4 units/iter
+	PREF(	0, 3*32(src))
  	beqz	t0, .Lcleanup_src_unaligned
  	 and	rem, len, (4*NBYTES-1)	 # rem = len % 4*NBYTES
+	PREF(	1, 3*32(dst))
  1:
  /*
   * Avoid consecutive LD*'s to the same register since some mips
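
For anyone reading the hints in the hunks above: the first PREF operand is 0 for the source stream (prefetch for load) and 1 for the destination stream (prefetch for store); the offsets are where the 32-byte line-size assumption is baked in. The same read/write distinction expressed in C, again only as an illustration and not something from the patch:

#include <stddef.h>

/*
 * Illustration only: the rw argument of __builtin_prefetch plays the
 * same role as the PREF hint above -- 0 for the src read stream,
 * 1 for the dst write stream.  'ahead' is whatever distance the CPU
 * actually wants, which is the reviewer's point.
 */
static inline void prefetch_both(const void *src, void *dst, size_t ahead)
{
	__builtin_prefetch((const char *)src + ahead, 0);	/* like PREF 0, off(src) */
	__builtin_prefetch((char *)dst + ahead, 1);		/* like PREF 1, off(dst) */
}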



