Computing the checksum introduces true data dependencies. This patch
removes some of these true data dependencies, which improves
instruction-level parallelism. In our tests the patch brings up to a
50% csum performance gain on the Loongson 3A processor.

One example of how this patch works is in CSUM_BIGCHUNK1:

    // **  original  **      vs      ** patch applied **
       ADDC(sum, t0)                    ADDC(t0, t1)
       ADDC(sum, t1)                    ADDC(t2, t3)
       ADDC(sum, t2)                    ADDC(sum, t0)
       ADDC(sum, t3)                    ADDC(sum, t2)

In the original implementation, each ADDC(sum, ...) depends on the sum
value updated by the previous ADDC. With the patch applied, the first
two ADDC operations are independent of each other and can therefore be
executed simultaneously when the hardware allows it.

Another example is in the "copy and sum calculating chunk":

    // **      original      **    vs    **    patch applied    **
       STORE(t0, UNIT(0) ...               STORE(t0, UNIT(0) ...
       ADDC(sum, t0)                       ADDC(t0, t1)
       STORE(t1, UNIT(1) ...               STORE(t1, UNIT(1) ...
       ADDC(sum, t1)                       ADDC(sum, t0)
       STORE(t2, UNIT(2) ...               STORE(t2, UNIT(2) ...
       ADDC(sum, t2)                       ADDC(t2, t3)
       STORE(t3, UNIT(3) ...               STORE(t3, UNIT(3) ...
       ADDC(sum, t3)                       ADDC(sum, t2)

With the patch applied, the first and third ADDC are independent of
each other.

Signed-off-by: chenj <chenj@xxxxxxxxxx>
---
1. The benchmark results can be found at

   http://dev.lemote.com/files/upload/software/csum-opti/csum-opti-benchmark.html

   and were generated by a userspace test program:

   http://dev.lemote.com/files/upload/software/csum-opti/csum-test.tar.gz

[v2: amend commit message]
[v3: further amend commit message]
[v4: amend commit message & sign off the patch]
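2. For illustration only (not part of the patch): a rough C analogue of
   the reordering, using a simplified addc() helper as a stand-in for
   the ADDC macro's end-around-carry handling. It is meant to show the
   dependency structure, not to be a bit-exact model of the assembly.

   #include <stdint.h>

   static inline uint32_t addc(uint32_t acc, uint32_t val)
   {
           acc += val;
           if (acc < val)          /* addition wrapped: fold the carry back in */
                   acc++;
           return acc;
   }

   /* Original scheme: every addc() must wait for the previous result. */
   static uint32_t fold_serial(uint32_t sum, uint32_t t0, uint32_t t1,
                               uint32_t t2, uint32_t t3)
   {
           sum = addc(sum, t0);
           sum = addc(sum, t1);
           sum = addc(sum, t2);
           sum = addc(sum, t3);
           return sum;
   }

   /*
    * Patched scheme: the t0+t1 and t2+t3 additions have no dependency
    * on each other, so a superscalar core can issue them in parallel.
    */
   static uint32_t fold_paired(uint32_t sum, uint32_t t0, uint32_t t1,
                               uint32_t t2, uint32_t t3)
   {
           t0 = addc(t0, t1);
           t2 = addc(t2, t3);
           sum = addc(sum, t0);
           sum = addc(sum, t2);
           return sum;
   }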
 arch/mips/lib/csum_partial.S | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/arch/mips/lib/csum_partial.S b/arch/mips/lib/csum_partial.S
index 9901237..6cea101 100644
--- a/arch/mips/lib/csum_partial.S
+++ b/arch/mips/lib/csum_partial.S
@@ -76,10 +76,10 @@
 	LOAD	_t1, (offset + UNIT(1))(src);			\
 	LOAD	_t2, (offset + UNIT(2))(src);			\
 	LOAD	_t3, (offset + UNIT(3))(src);			\
+	ADDC(_t0, _t1);						\
+	ADDC(_t2, _t3);						\
 	ADDC(sum, _t0);						\
-	ADDC(sum, _t1);						\
-	ADDC(sum, _t2);						\
-	ADDC(sum, _t3)
+	ADDC(sum, _t2)

 #ifdef USE_DOUBLE
 #define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)	\
@@ -501,21 +501,21 @@ LEAF(csum_partial)
 	SUB	len, len, 8*NBYTES
 	ADD	src, src, 8*NBYTES
 	STORE(t0, UNIT(0)(dst), .Ls_exc\@)
-	ADDC(sum, t0)
+	ADDC(t0, t1)
 	STORE(t1, UNIT(1)(dst), .Ls_exc\@)
-	ADDC(sum, t1)
+	ADDC(sum, t0)
 	STORE(t2, UNIT(2)(dst), .Ls_exc\@)
-	ADDC(sum, t2)
+	ADDC(t2, t3)
 	STORE(t3, UNIT(3)(dst), .Ls_exc\@)
-	ADDC(sum, t3)
+	ADDC(sum, t2)
 	STORE(t4, UNIT(4)(dst), .Ls_exc\@)
-	ADDC(sum, t4)
+	ADDC(t4, t5)
 	STORE(t5, UNIT(5)(dst), .Ls_exc\@)
-	ADDC(sum, t5)
+	ADDC(sum, t4)
 	STORE(t6, UNIT(6)(dst), .Ls_exc\@)
-	ADDC(sum, t6)
+	ADDC(t6, t7)
 	STORE(t7, UNIT(7)(dst), .Ls_exc\@)
-	ADDC(sum, t7)
+	ADDC(sum, t6)
 	.set	reorder				/* DADDI_WAR */
 	ADD	dst, dst, 8*NBYTES
 	bgez	len, 1b
@@ -541,13 +541,13 @@ LEAF(csum_partial)
 	SUB	len, len, 4*NBYTES
 	ADD	src, src, 4*NBYTES
 	STORE(t0, UNIT(0)(dst), .Ls_exc\@)
-	ADDC(sum, t0)
+	ADDC(t0, t1)
 	STORE(t1, UNIT(1)(dst), .Ls_exc\@)
-	ADDC(sum, t1)
+	ADDC(sum, t0)
 	STORE(t2, UNIT(2)(dst), .Ls_exc\@)
-	ADDC(sum, t2)
+	ADDC(t2, t3)
 	STORE(t3, UNIT(3)(dst), .Ls_exc\@)
-	ADDC(sum, t3)
+	ADDC(sum, t2)
 	.set	reorder				/* DADDI_WAR */
 	ADD	dst, dst, 4*NBYTES
 	beqz	len, .Ldone\@
@@ -646,13 +646,13 @@ LEAF(csum_partial)
 	nop				# improves slotting
 #endif
 	STORE(t0, UNIT(0)(dst), .Ls_exc\@)
-	ADDC(sum, t0)
+	ADDC(t0, t1)
 	STORE(t1, UNIT(1)(dst), .Ls_exc\@)
-	ADDC(sum, t1)
+	ADDC(sum, t0)
 	STORE(t2, UNIT(2)(dst), .Ls_exc\@)
-	ADDC(sum, t2)
+	ADDC(t2, t3)
 	STORE(t3, UNIT(3)(dst), .Ls_exc\@)
-	ADDC(sum, t3)
+	ADDC(sum, t2)
 	.set	reorder				/* DADDI_WAR */
 	ADD	dst, dst, 4*NBYTES
 	bne	len, rem, 1b
-- 
1.9.0