Re: [PATCH v2] Add iwmmxt optimization for sbc for pxa series cpu

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



> Did you run some benchmarks with these optimizations to measure how much they
> are helping?
Tested on Marvell PXA platform.
== Before ==
$ time ./sbcenc   -b53 -s8 -j  c.au  > /dev/null
real    0m 0.41s
user    0m 0.40s
sys     0m 0.00s

== After ==
$ time ./sbcenc   -b53 -s8 -j  c.au  > /dev/null
real    0m 0.19s
user    0m 0.17s
sys     0m 0.02s

> Using back-to-back WLDRD instructions has some performance penalty
I rearranged the instructions and kept the original sequence for reference
in a block that is commented out (#if 0), since the code is really difficult
to read after interleaving.

> The MMX code was using PCMPGTD and the other instructions just because MMX
> instruction set is very limited and did not have the needed instructions. But
> you can use WABS and WMAX instructions to do this job better. You can refer to
> the original C code and also to ARM NEON optimizations to get some ideas about
> how to do this operation faster.
Changed as suggested.
But I have a question: the __IWMMXT__ built-in gcc definition is not a
reliable way to
determine whether -mcpu=iwmmxt2 is turned on or not. The build will break
when compiling for pxa270,
which does not support wabs, with just -mcpu=iwmmxt on.

Keith

Signed-off-by: Keith Mok <ek9852@xxxxxxxxx>
---
diff --git a/Makefile.am b/Makefile.am
index da308a7..03a9bf2 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -65,6 +65,7 @@ noinst_LTLIBRARIES += sbc/libsbc.la
 sbc_libsbc_la_SOURCES = sbc/sbc.h sbc/sbc.c sbc/sbc_math.h sbc/sbc_tables.h \
 			sbc/sbc_primitives.h sbc/sbc_primitives.c \
 			sbc/sbc_primitives_mmx.h sbc/sbc_primitives_mmx.c \
+			sbc/sbc_primitives_iwmmxt.h sbc/sbc_primitives_iwmmxt.c \
 			sbc/sbc_primitives_neon.h sbc/sbc_primitives_neon.c \
 			sbc/sbc_primitives_armv6.h sbc/sbc_primitives_armv6.c

diff --git a/sbc/sbc_primitives.c b/sbc/sbc_primitives.c
index f87fb5a..ad780d0 100644
--- a/sbc/sbc_primitives.c
+++ b/sbc/sbc_primitives.c
@@ -33,6 +33,7 @@

 #include "sbc_primitives.h"
 #include "sbc_primitives_mmx.h"
+#include "sbc_primitives_iwmmxt.h"
 #include "sbc_primitives_neon.h"
 #include "sbc_primitives_armv6.h"

@@ -544,6 +545,9 @@ void sbc_init_primitives(struct sbc_encoder_state *state)
 #ifdef SBC_BUILD_WITH_ARMV6_SUPPORT
 	sbc_init_primitives_armv6(state);
 #endif
+#ifdef SBC_BUILD_WITH_IWMMXT_SUPPORT
+	sbc_init_primitives_iwmmxt(state);
+#endif
 #ifdef SBC_BUILD_WITH_NEON_SUPPORT
 	sbc_init_primitives_neon(state);
 #endif
diff --git a/sbc/sbc_primitives_iwmmxt.c b/sbc/sbc_primitives_iwmmxt.c
new file mode 100644
index 0000000..b988bb1
--- /dev/null
+++ b/sbc/sbc_primitives_iwmmxt.c
@@ -0,0 +1,599 @@
+/*
+ *
+ *  Bluetooth low-complexity, subband codec (SBC) library
+ *
+ *  Copyright (C) 2010 Keith Mok <ek9852@xxxxxxxxx>
+ *  Based on sbc_primitives_mmx.c
+ *
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include <stdint.h>
+#include <limits.h>
+#include "sbc.h"
+#include "sbc_math.h"
+#include "sbc_tables.h"
+
+#include "sbc_primitives_iwmmxt.h"
+
+/*
+ * IWMMXT optimizations
+ */
+
+#ifdef SBC_BUILD_WITH_IWMMXT_SUPPORT
+
+static inline void sbc_analyze_four_iwmmxt(const int16_t *in, int32_t *out,
+					const FIXED_T *consts)
+{	/* iwMMXt kernel: reads 80 bytes at in and 112 bytes at consts, stores four 32-bit words to out */
+	asm volatile (
+		"wldrd        wr0, [%0]\n"	/* %0 = in */
+		"tbcstw       wr4, %2\n"	/* %2 = rounding term, copied into both words of wr4 */
+		"wldrd        wr2, [%1]\n"	/* %1 = consts */
+		"wldrd        wr1, [%0, #8]\n"
+		"wldrd        wr3, [%1, #8]\n"
+		"wmadds       wr0, wr2, wr0\n"	/* signed 16-bit multiplies, adjacent products summed into two 32-bit words */
+		"wldrd        wr6, [%0, #16]\n"	/* loads are interleaved with arithmetic throughout */
+		"wmadds       wr1, wr3, wr1\n"
+		"wldrd        wr7, [%0, #24]\n"
+		"waddwss      wr0, wr0, wr4\n"	/* accumulators wr0/wr1 start as product + rounding term */
+		"wldrd        wr8, [%1, #16]\n"
+		"waddwss      wr1, wr1, wr4\n"	/* last use of the rounding term in wr4 */
+		"wldrd        wr9, [%1, #24]\n"
+		"wmadds       wr6, wr8, wr6\n"
+		"wldrd        wr2, [%0, #32]\n"
+		"wmadds       wr7, wr9, wr7\n"
+		"wldrd        wr3, [%0, #40]\n"
+		"waddwss      wr0, wr6, wr0\n"	/* saturating 32-bit accumulate */
+		"wldrd        wr4, [%1, #32]\n"	/* wr4 is reused for constants from here on */
+		"waddwss      wr1, wr7, wr1\n"
+		"wldrd        wr5, [%1, #40]\n"
+		"wmadds       wr2, wr4, wr2\n"
+		"wldrd        wr6, [%0, #48]\n"
+		"wmadds       wr3, wr5, wr3\n"
+		"wldrd        wr7, [%0, #56]\n"
+		"waddwss      wr0, wr2, wr0\n"
+		"wldrd        wr8, [%1, #48]\n"
+		"waddwss      wr1, wr3, wr1\n"
+		"wldrd        wr9, [%1, #56]\n"
+		"wmadds       wr6, wr8, wr6\n"
+		"wldrd        wr2, [%0, #64]\n"
+		"wmadds       wr7, wr9, wr7\n"
+		"wldrd        wr3, [%0, #72]\n"
+		"waddwss      wr0, wr6, wr0\n"
+		"wldrd        wr4, [%1, #64]\n"
+		"waddwss      wr1, wr7, wr1\n"
+		"wldrd        wr5, [%1, #72]\n"
+		"wmadds       wr2, wr4, wr2\n"
+		"wmadds       wr3, wr5, wr3\n"
+		"waddwss      wr0, wr2, wr0\n"
+		"waddwss      wr1, wr3, wr1\n"	/* wr0/wr1 now hold the sums over all five 16-byte groups of in */
+		"\n"
+		"tmcr       wcgr0, %4\n"	/* %4 = shift count */
+		"wsrawg       wr0, wr0, wcgr0\n"	/* arithmetic right shift of each 32-bit word */
+		"wldrd        wr4, [%1, #80]\n"
+		"wsrawg       wr1, wr1, wcgr0\n"
+		"wldrd        wr5, [%1, #88]\n"
+		"wpackwss     wr0, wr0, wr0\n"	/* narrow words to saturated 16-bit halfwords */
+		"wldrd        wr6, [%1, #96]\n"
+		"wpackwss     wr1, wr1, wr1\n"
+		"wldrd        wr7, [%1, #104]\n"
+		"wmadds       wr2, wr5, wr0\n"	/* second stage: packed values times constants at offsets 80..111 */
+		"wmadds       wr0, wr4, wr0\n"
+		"\n"
+		"wmadds       wr3, wr7, wr1\n"
+		"wmadds       wr1, wr6, wr1\n"
+		"waddwss      wr0, wr1, wr0\n"
+		"waddwss      wr2, wr3, wr2\n"
+		"\n"
+		"wstrd        wr0, [%3]\n"	/* %3 = out: bytes 0..7 */
+		"wstrd        wr2, [%3, #8]\n"	/* out: bytes 8..15 */
+		:
+		: "r" (in), "r" (consts),
+			"r" (1 << (SBC_PROTO_FIXED4_SCALE - 1)), "r" (out),
+			"r" (SBC_PROTO_FIXED4_SCALE)
+		: "wr0", "wr1", "wr2", "wr3", "wr4", "wr5", "wr6", "wr7",
+		  "wr8", "wr9", "wcgr0", "memory");
+#if 0
+	/* Reference version, written without regard to pipeline and
+	 * result latency. Kept (disabled) because the latency-optimized
+	 * code above is difficult to read. */
+	asm volatile (
+		"tbcstw       wr4, %2\n"
+		"wldrd        wr0, [%0]\n"
+		"wldrd        wr1, [%0, #8]\n"
+		"wldrd        wr2, [%1]\n"
+		"wldrd        wr3, [%1, #8]\n"
+		"wmadds       wr0, wr2, wr0\n"
+		"wmadds       wr1, wr3, wr1\n"
+		"waddwss      wr0, wr0, wr4\n"
+		"waddwss      wr1, wr1, wr4\n"
+		"\n"
+		"wldrd        wr2, [%0, #16]\n"
+		"wldrd        wr3, [%0, #24]\n"
+		"wldrd        wr4, [%1, #16]\n"
+		"wldrd        wr5, [%1, #24]\n"
+		"wmadds       wr2, wr4, wr2\n"
+		"wmadds       wr3, wr5, wr3\n"
+		"waddwss      wr0, wr2, wr0\n"
+		"waddwss      wr1, wr3, wr1\n"
+		"\n"
+		"wldrd        wr2, [%0, #32]\n"
+		"wldrd        wr3, [%0, #40]\n"
+		"wldrd        wr4, [%1, #32]\n"
+		"wldrd        wr5, [%1, #40]\n"
+		"wmadds       wr2, wr4, wr2\n"
+		"wmadds       wr3, wr5, wr3\n"
+		"waddwss      wr0, wr2, wr0\n"
+		"waddwss      wr1, wr3, wr1\n"
+		"\n"
+		"wldrd        wr2, [%0, #48]\n"
+		"wldrd        wr3, [%0, #56]\n"
+		"wldrd        wr4, [%1, #48]\n"
+		"wldrd        wr5, [%1, #56]\n"
+		"wmadds       wr2, wr4, wr2\n"
+		"wmadds       wr3, wr5, wr3\n"
+		"waddwss      wr0, wr2, wr0\n"
+		"waddwss      wr1, wr3, wr1\n"
+		"\n"
+		"wldrd        wr2, [%0, #64]\n"
+		"wldrd        wr3, [%0, #72]\n"
+		"wldrd        wr4, [%1, #64]\n"
+		"wldrd        wr5, [%1, #72]\n"
+		"wmadds       wr2, wr4, wr2\n"
+		"wmadds       wr3, wr5, wr3\n"
+		"waddwss      wr0, wr2, wr0\n"
+		"waddwss      wr1, wr3, wr1\n"
+		"\n"
+		"tmcr       wcgr0, %4\n"
+		"wsrawg       wr0, wr0, wcgr0\n"
+		"wsrawg       wr1, wr1, wcgr0\n"
+		"wpackwss     wr0, wr0, wr0\n"
+		"wpackwss     wr1, wr1, wr1\n"
+		"\n"
+		"wldrd        wr4, [%1, #80]\n"
+		"wldrd        wr5, [%1, #88]\n"
+		"wldrd        wr6, [%1, #96]\n"
+		"wldrd        wr7, [%1, #104]\n"
+		"wmadds       wr2, wr5, wr0\n"
+		"wmadds       wr0, wr4, wr0\n"
+		"\n"
+		"wmadds       wr3, wr7, wr1\n"
+		"wmadds       wr1, wr6, wr1\n"
+		"waddwss      wr0, wr1, wr0\n"
+		"waddwss      wr2, wr3, wr2\n"
+		"\n"
+		"wstrd        wr0, [%3]\n"
+		"wstrd        wr2, [%3, #8]\n"
+		:
+		: "r" (in), "r" (consts),
+			"r" (1 << (SBC_PROTO_FIXED4_SCALE - 1)), "r" (out),
+			"r" (SBC_PROTO_FIXED4_SCALE)
+		: "memory");
+#endif
+}
+
+static inline void sbc_analyze_eight_iwmmxt(const int16_t *in, int32_t *out,
+							const FIXED_T *consts)
+{	/* iwMMXt kernel: reads 160 bytes at in and 288 bytes at consts, stores eight 32-bit words to out */
+	asm volatile (
+		"wldrd        wr0, [%0]\n"	/* %0 = in */
+		"tbcstw       wr15, %2\n"	/* %2 = rounding term, copied into both words of wr15 */
+		"wldrd        wr1, [%0, #8]\n"
+		"wldrd        wr2, [%0, #16]\n"
+		"wldrd        wr3, [%0, #24]\n"
+		"wldrd        wr4, [%1]\n"	/* %1 = consts */
+		"wldrd        wr5, [%1, #8]\n"
+		"wldrd        wr6, [%1, #16]\n"
+		"wldrd        wr7, [%1, #24]\n"
+		"wmadds       wr0, wr0, wr4\n"	/* signed 16-bit multiplies, adjacent products summed into two 32-bit words */
+		"wldrd        wr8, [%1, #32]\n"	/* loads are interleaved with arithmetic throughout */
+		"wmadds       wr1, wr1, wr5\n"
+		"wldrd        wr9, [%1, #40]\n"
+		"wmadds       wr2, wr2, wr6\n"
+		"wldrd       wr10, [%1, #48]\n"
+		"wmadds       wr3, wr3, wr7\n"
+		"wldrd       wr11, [%1, #56]\n"
+		"waddwss      wr0, wr0, wr15\n"	/* accumulators wr0..wr3 start as product + rounding term */
+		"wldrd        wr4, [%0, #32]\n"
+		"waddwss      wr1, wr1, wr15\n"
+		"wldrd        wr5, [%0, #40]\n"
+		"waddwss      wr2, wr2, wr15\n"
+		"wldrd        wr6, [%0, #48]\n"
+		"waddwss      wr3, wr3, wr15\n"	/* last use of the rounding term in wr15 */
+		"wldrd        wr7, [%0, #56]\n"
+		"wmadds       wr4, wr4, wr8\n"
+		"wldrd       wr12, [%0, #64]\n"
+		"wmadds       wr5, wr5, wr9\n"
+		"wldrd       wr13, [%0, #72]\n"
+		"wmadds       wr6, wr6, wr10\n"
+		"wldrd       wr14, [%0, #80]\n"
+		"wmadds       wr7, wr7, wr11\n"
+		"wldrd       wr15, [%0, #88]\n"	/* wr15 is reused for input data from here on */
+		"waddwss      wr0, wr4, wr0\n"	/* saturating 32-bit accumulate */
+		"wldrd        wr8, [%1, #64]\n"
+		"waddwss      wr1, wr5, wr1\n"
+		"wldrd        wr9, [%1, #72]\n"
+		"waddwss      wr2, wr6, wr2\n"
+		"wldrd       wr10, [%1, #80]\n"
+		"waddwss      wr3, wr7, wr3\n"
+		"wldrd       wr11, [%1, #88]\n"
+		"wmadds      wr12, wr12, wr8\n"
+		"wldrd        wr4, [%0, #96]\n"
+		"wmadds      wr13, wr13, wr9\n"
+		"wldrd        wr5, [%0, #104]\n"
+		"wmadds      wr14, wr14, wr10\n"
+		"wldrd        wr6, [%0, #112]\n"
+		"wmadds      wr15, wr15, wr11\n"
+		"wldrd        wr7, [%0, #120]\n"
+		"waddwss      wr0, wr12, wr0\n"
+		"wldrd        wr8, [%1, #96]\n"
+		"waddwss      wr1, wr13, wr1\n"
+		"wldrd        wr9, [%1, #104]\n"
+		"waddwss      wr2, wr14, wr2\n"
+		"wldrd       wr10, [%1, #112]\n"
+		"waddwss      wr3, wr15, wr3\n"
+		"wldrd       wr11, [%1, #120]\n"
+		"wmadds       wr4, wr4, wr8\n"
+		"wldrd       wr12, [%0, #128]\n"
+		"wmadds       wr5, wr5, wr9\n"
+		"wldrd       wr13, [%0, #136]\n"
+		"wmadds       wr6, wr6, wr10\n"
+		"wldrd       wr14, [%0, #144]\n"
+		"wmadds       wr7, wr7, wr11\n"
+		"wldrd       wr15, [%0, #152]\n"
+		"waddwss      wr0, wr4, wr0\n"
+		"wldrd        wr8, [%1, #128]\n"
+		"waddwss      wr1, wr5, wr1\n"
+		"wldrd        wr9, [%1, #136]\n"
+		"waddwss      wr2, wr6, wr2\n"
+		"wldrd       wr10, [%1, #144]\n"
+		"waddwss      wr3, wr7, wr3\n"
+		"wldrd       wr11, [%1, #152]\n"
+		"wmadds      wr12, wr12, wr8\n"
+		"wmadds      wr13, wr13, wr9\n"
+		"wmadds      wr14, wr14, wr10\n"
+		"wmadds      wr15, wr15, wr11\n"
+		"waddwss      wr0, wr12, wr0\n"
+		"waddwss      wr1, wr13, wr1\n"
+		"waddwss      wr2, wr14, wr2\n"
+		"waddwss      wr3, wr15, wr3\n"	/* wr0..wr3 now hold the sums over all five 32-byte groups of in */
+		"\n"
+		"tmcr       wcgr0, %4\n"	/* %4 = shift count */
+		"wsrawg       wr0, wr0, wcgr0\n"	/* arithmetic right shift of each 32-bit word */
+		"wsrawg       wr1, wr1, wcgr0\n"
+		"wsrawg       wr2, wr2, wcgr0\n"
+		"wsrawg       wr3, wr3, wcgr0\n"
+		"\n"
+		"wpackwss     wr0, wr0, wr0\n"	/* narrow words to saturated 16-bit halfwords */
+		"wpackwss     wr1, wr1, wr1\n"
+		"wldrd        wr4, [%1, #160]\n"	/* second stage, first half: constants at 160, 168, 192, 200, 224, 232, 256, 264 */
+		"wpackwss     wr2, wr2, wr2\n"
+		"wldrd        wr5, [%1, #168]\n"
+		"wpackwss     wr3, wr3, wr3\n"
+		"wldrd        wr6, [%1, #192]\n"
+		"wmadds       wr4, wr4, wr0\n"
+		"wldrd        wr7, [%1, #200]\n"
+		"wmadds       wr5, wr5, wr0\n"
+		"wldrd        wr8, [%1, #224]\n"
+		"wmadds       wr6, wr6, wr1\n"
+		"wldrd        wr9, [%1, #232]\n"
+		"wmadds       wr7, wr7, wr1\n"
+		"waddwss      wr4, wr6, wr4\n"
+		"waddwss      wr5, wr7, wr5\n"
+		"wmadds       wr8, wr8, wr2\n"
+		"wldrd        wr6, [%1, #256]\n"
+		"wmadds       wr9, wr9, wr2\n"
+		"wldrd        wr7, [%1, #264]\n"
+		"waddwss      wr4, wr8, wr4\n"
+		"waddwss      wr5, wr9, wr5\n"
+		"wmadds       wr6, wr6, wr3\n"
+		"wmadds       wr7, wr7, wr3\n"
+		"waddwss      wr4, wr6, wr4\n"
+		"waddwss      wr5, wr7, wr5\n"
+		"\n"
+		"wstrd        wr4, [%3]\n"	/* %3 = out: bytes 0..7 */
+		"wstrd        wr5, [%3, #8]\n"	/* out: bytes 8..15 */
+		"\n"
+		"wldrd        wr6, [%1, #176]\n"	/* second stage, second half: constants at 176, 184, 208, 216, 240, 248, 272, 280 */
+		"wldrd        wr5, [%1, #184]\n"
+		"wmadds       wr5, wr5, wr0\n"
+		"wldrd        wr8, [%1, #208]\n"
+		"wmadds       wr0, wr6, wr0\n"	/* wr0..wr3 are consumed as accumulators in this half */
+		"wldrd        wr9, [%1, #216]\n"
+		"wmadds       wr9, wr9, wr1\n"
+		"wldrd        wr6, [%1, #240]\n"
+		"wmadds       wr1, wr8, wr1\n"
+		"wldrd        wr7, [%1, #248]\n"
+		"waddwss      wr0, wr1, wr0\n"
+		"waddwss      wr5, wr9, wr5\n"
+		"wmadds       wr7, wr7, wr2\n"
+		"wldrd        wr8, [%1, #272]\n"
+		"wmadds       wr2, wr6, wr2\n"
+		"wldrd        wr9, [%1, #280]\n"
+		"waddwss      wr0, wr2, wr0\n"
+		"waddwss      wr5, wr7, wr5\n"
+		"wmadds       wr9, wr9, wr3\n"
+		"wmadds       wr3, wr8, wr3\n"
+		"waddwss      wr0, wr3, wr0\n"
+		"waddwss      wr5, wr9, wr5\n"
+		"\n"
+		"wstrd        wr0, [%3, #16]\n"	/* out: bytes 16..23 */
+		"wstrd        wr5, [%3, #24]\n"	/* out: bytes 24..31 */
+		:
+		: "r" (in), "r" (consts),
+			"r" (1 << (SBC_PROTO_FIXED8_SCALE - 1)), "r" (out),
+			"r" (SBC_PROTO_FIXED8_SCALE)
+		: "wr0", "wr1", "wr2", "wr3", "wr4", "wr5", "wr6", "wr7",
+		  "wr8", "wr9", "wr10", "wr11", "wr12", "wr13", "wr14", "wr15",
+		  "wcgr0", "memory");
+#if 0
+	/* Reference version, written without regard to pipeline and
+	 * result latency. Kept (disabled) because the latency-optimized
+	 * code above is difficult to read. */
+	asm volatile (
+		"tbcstw       wr8, %2\n"
+		"wldrd        wr0, [%0]\n"
+		"wldrd        wr1, [%0, #8]\n"
+		"wldrd        wr2, [%0, #16]\n"
+		"wldrd        wr3, [%0, #24]\n"
+		"wldrd        wr4, [%1]\n"
+		"wldrd        wr5, [%1, #8]\n"
+		"wldrd        wr6, [%1, #16]\n"
+		"wldrd        wr7, [%1, #24]\n"
+		"wmadds       wr0, wr0, wr4\n"
+		"wmadds       wr1, wr1, wr5\n"
+		"wmadds       wr2, wr2, wr6\n"
+		"wmadds       wr3, wr3, wr7\n"
+		"waddwss      wr0, wr0, wr8\n"
+		"waddwss      wr1, wr1, wr8\n"
+		"waddwss      wr2, wr2, wr8\n"
+		"waddwss      wr3, wr3, wr8\n"
+		"\n"
+		"wldrd        wr4, [%0, #32]\n"
+		"wldrd        wr5, [%0, #40]\n"
+		"wldrd        wr6, [%0, #48]\n"
+		"wldrd        wr7, [%0, #56]\n"
+		"wldrd        wr8, [%1, #32]\n"
+		"wldrd        wr9, [%1, #40]\n"
+		"wldrd       wr10, [%1, #48]\n"
+		"wldrd       wr11, [%1, #56]\n"
+		"wmadds       wr4, wr4, wr8\n"
+		"wmadds       wr5, wr5, wr9\n"
+		"wmadds       wr6, wr6, wr10\n"
+		"wmadds       wr7, wr7, wr11\n"
+		"waddwss      wr0, wr4, wr0\n"
+		"waddwss      wr1, wr5, wr1\n"
+		"waddwss      wr2, wr6, wr2\n"
+		"waddwss      wr3, wr7, wr3\n"
+		"\n"
+		"wldrd        wr4, [%0, #64]\n"
+		"wldrd        wr5, [%0, #72]\n"
+		"wldrd        wr6, [%0, #80]\n"
+		"wldrd        wr7, [%0, #88]\n"
+		"wldrd        wr8, [%1, #64]\n"
+		"wldrd        wr9, [%1, #72]\n"
+		"wldrd       wr10, [%1, #80]\n"
+		"wldrd       wr11, [%1, #88]\n"
+		"wmadds       wr4, wr4, wr8\n"
+		"wmadds       wr5, wr5, wr9\n"
+		"wmadds       wr6, wr6, wr10\n"
+		"wmadds       wr7, wr7, wr11\n"
+		"waddwss      wr0, wr4, wr0\n"
+		"waddwss      wr1, wr5, wr1\n"
+		"waddwss      wr2, wr6, wr2\n"
+		"waddwss      wr3, wr7, wr3\n"
+		"\n"
+		"wldrd        wr4, [%0, #96]\n"
+		"wldrd        wr5, [%0, #104]\n"
+		"wldrd        wr6, [%0, #112]\n"
+		"wldrd        wr7, [%0, #120]\n"
+		"wldrd        wr8, [%1, #96]\n"
+		"wldrd        wr9, [%1, #104]\n"
+		"wldrd       wr10, [%1, #112]\n"
+		"wldrd       wr11, [%1, #120]\n"
+		"wmadds       wr4, wr4, wr8\n"
+		"wmadds       wr5, wr5, wr9\n"
+		"wmadds       wr6, wr6, wr10\n"
+		"wmadds       wr7, wr7, wr11\n"
+		"waddwss      wr0, wr4, wr0\n"
+		"waddwss      wr1, wr5, wr1\n"
+		"waddwss      wr2, wr6, wr2\n"
+		"waddwss      wr3, wr7, wr3\n"
+		"\n"
+		"wldrd        wr4, [%0, #128]\n"
+		"wldrd        wr5, [%0, #136]\n"
+		"wldrd        wr6, [%0, #144]\n"
+		"wldrd        wr7, [%0, #152]\n"
+		"wldrd        wr8, [%1, #128]\n"
+		"wldrd        wr9, [%1, #136]\n"
+		"wldrd       wr10, [%1, #144]\n"
+		"wldrd       wr11, [%1, #152]\n"
+		"wmadds       wr4, wr4, wr8\n"
+		"wmadds       wr5, wr5, wr9\n"
+		"wmadds       wr6, wr6, wr10\n"
+		"wmadds       wr7, wr7, wr11\n"
+		"waddwss      wr0, wr4, wr0\n"
+		"waddwss      wr1, wr5, wr1\n"
+		"waddwss      wr2, wr6, wr2\n"
+		"waddwss      wr3, wr7, wr3\n"
+		"\n"
+		"tmcr       wcgr0, %4\n"
+		"wsrawg       wr0, wr0, wcgr0\n"
+		"wsrawg       wr1, wr1, wcgr0\n"
+		"wsrawg       wr2, wr2, wcgr0\n"
+		"wsrawg       wr3, wr3, wcgr0\n"
+		"\n"
+		"wpackwss     wr0, wr0, wr0\n"
+		"wpackwss     wr1, wr1, wr1\n"
+		"wpackwss     wr2, wr2, wr2\n"
+		"wpackwss     wr3, wr3, wr3\n"
+		"\n"
+		"wldrd        wr4, [%1, #160]\n"
+		"wldrd        wr5, [%1, #168]\n"
+		"wmadds       wr4, wr4, wr0\n"
+		"wmadds       wr5, wr5, wr0\n"
+		"\n"
+		"wldrd        wr6, [%1, #192]\n"
+		"wldrd        wr7, [%1, #200]\n"
+		"wmadds       wr6, wr6, wr1\n"
+		"wmadds       wr7, wr7, wr1\n"
+		"waddwss      wr4, wr6, wr4\n"
+		"waddwss      wr5, wr7, wr5\n"
+		"\n"
+		"wldrd        wr6, [%1, #224]\n"
+		"wldrd        wr7, [%1, #232]\n"
+		"wmadds       wr6, wr6, wr2\n"
+		"wmadds       wr7, wr7, wr2\n"
+		"waddwss      wr4, wr6, wr4\n"
+		"waddwss      wr5, wr7, wr5\n"
+		"\n"
+		"wldrd        wr6, [%1, #256]\n"
+		"wldrd        wr7, [%1, #264]\n"
+		"wmadds       wr6, wr6, wr3\n"
+		"wmadds       wr7, wr7, wr3\n"
+		"waddwss      wr4, wr6, wr4\n"
+		"waddwss      wr5, wr7, wr5\n"
+		"\n"
+		"wstrd        wr4, [%3]\n"
+		"wstrd        wr5, [%3, #8]\n"
+		"\n"
+		"wldrd        wr4, [%1, #176]\n"
+		"wldrd        wr5, [%1, #184]\n"
+		"wmadds       wr5, wr5, wr0\n"
+		"wmadds       wr0, wr4, wr0\n"
+		"\n"
+		"wldrd        wr4, [%1, #208]\n"
+		"wldrd        wr7, [%1, #216]\n"
+		"wmadds       wr7, wr7, wr1\n"
+		"wmadds       wr1, wr4, wr1\n"
+		"waddwss      wr0, wr1, wr0\n"
+		"waddwss      wr5, wr7, wr5\n"
+		"\n"
+		"wldrd        wr4, [%1, #240]\n"
+		"wldrd        wr7, [%1, #248]\n"
+		"wmadds       wr7, wr7, wr2\n"
+		"wmadds       wr2, wr4, wr2\n"
+		"waddwss      wr0, wr2, wr0\n"
+		"waddwss      wr5, wr7, wr5\n"
+		"\n"
+		"wldrd        wr4, [%1, #272]\n"
+		"wldrd        wr7, [%1, #280]\n"
+		"wmadds       wr7, wr7, wr3\n"
+		"wmadds       wr3, wr4, wr3\n"
+		"waddwss      wr0, wr3, wr0\n"
+		"waddwss      wr5, wr7, wr5\n"
+		"\n"
+		"wstrd        wr0, [%3, #16]\n"
+		"wstrd        wr5, [%3, #24]\n"
+		:
+		: "r" (in), "r" (consts),
+			"r" (1 << (SBC_PROTO_FIXED8_SCALE - 1)), "r" (out),
+			"r" (SBC_PROTO_FIXED8_SCALE)
+		: "memory");
+#endif
+}
+
+static inline void sbc_analyze_4b_4s_iwmmxt(int16_t *x, int32_t *out,
+						int out_stride)
+{
+	/* Four analysis calls, from x + 12 down to x + 0, alternating the odd and even constant tables */
+	sbc_analyze_four_iwmmxt(x + 12, out, analysis_consts_fixed4_simd_odd);
+	out += out_stride;	/* out_stride counts int32_t elements between result rows */
+	sbc_analyze_four_iwmmxt(x + 8, out, analysis_consts_fixed4_simd_even);
+	out += out_stride;
+	sbc_analyze_four_iwmmxt(x + 4, out, analysis_consts_fixed4_simd_odd);
+	out += out_stride;
+	sbc_analyze_four_iwmmxt(x + 0, out, analysis_consts_fixed4_simd_even);
+}
+
+static inline void sbc_analyze_4b_8s_iwmmxt(int16_t *x, int32_t *out,
+						int out_stride)
+{
+	/* Four analysis calls, from x + 24 down to x + 0, alternating the odd and even constant tables */
+	sbc_analyze_eight_iwmmxt(x + 24, out, analysis_consts_fixed8_simd_odd);
+	out += out_stride;	/* out_stride counts int32_t elements between result rows */
+	sbc_analyze_eight_iwmmxt(x + 16, out, analysis_consts_fixed8_simd_even);
+	out += out_stride;
+	sbc_analyze_eight_iwmmxt(x + 8, out, analysis_consts_fixed8_simd_odd);
+	out += out_stride;
+	sbc_analyze_eight_iwmmxt(x + 0, out, analysis_consts_fixed8_simd_even);
+}
+
+static void sbc_calc_scalefactors_iwmmxt2(
+	int32_t sb_sample_f[16][2][8],
+	uint32_t scale_factor[2][8],
+	int blocks, int channels, int subbands)
+{	/* Fills scale_factor[ch][sb] from the largest absolute sb_sample_f value found across the blocks */
+	int ch, sb;
+	for (ch = 0; ch < channels; ch++) {
+		for (sb = 0; sb < subbands; sb += 2) {	/* one 64-bit load covers subbands sb and sb + 1 */
+			int blk = blocks;
+			int32_t *in = &sb_sample_f[0][ch][sb];
+			/* Uses wabsw, hence the iwmmxt2 requirement */
+			asm volatile (
+				"wldrd        wr1, [%[in]], %[inc]\n"	/* post-increment by one block stride */
+				"tbcstw       wr0, %[c1]\n"	/* running maximum starts at c1 in both words */
+				"wldrd        wr2, [%[in]], %[inc]\n"
+				"wldrd        wr3, [%[in]], %[inc]\n"
+				"wldrd        wr4, [%[in]], %[inc]\n"
+			"1:\n"
+				"wabsw        wr1, wr1\n"	/* absolute value of each 32-bit word */
+				"wabsw        wr2, wr2\n"
+				"wabsw        wr3, wr3\n"
+				"wabsw        wr4, wr4\n"
+				"wmaxuw       wr5, wr1, wr2\n"	/* unsigned per-word maximum */
+				"wldrd        wr1, [%[in]], %[inc]\n"	/* preload the next four blocks while reducing */
+				"wmaxuw       wr6, wr3, wr4\n"
+				"wldrd        wr2, [%[in]], %[inc]\n"
+				"wmaxuw       wr5, wr5, wr6\n"
+				"wldrd        wr3, [%[in]], %[inc]\n"
+				"wmaxuw       wr0, wr0, wr5\n"
+				"wldrd        wr4, [%[in]], %[inc]\n"	/* NOTE(review): on the final pass these four preloads read past the blocks processed - confirm the memory behind sb_sample_f is readable */
+				"subs         %[blk], %[blk], #4\n"	/* four blocks consumed per pass */
+				"bgt          1b\n"
+
+				"tmrrc        %0, %1, wr0\n"	/* %0/%1 are the in/blk registers, reused as scratch after the loop */
+				"sub          %0, %0, #1\n"
+				"clz          %0, %0\n"
+				"rsb          %0, %0, %[c2]\n"	/* result = c2 - clz(max - 1) */
+				"str          %0, [%[out]]\n"	/* scale_factor[ch][sb] */
+
+				"sub          %1, %1, #1\n"
+				"clz          %1, %1\n"
+				"rsb          %1, %1, %[c2]\n"
+				"str          %1, [%[out], #4]\n"	/* scale_factor[ch][sb + 1] */
+			: [in] "+r" (in), [blk] "+r" (blk)
+			: [inc] "i" ((char *) &sb_sample_f[1][0][0] -
+					(char *) &sb_sample_f[0][0][0]),
+				[out] "r" (&scale_factor[ch][sb]),
+				[c1] "r" ((1 << SCALE_OUT_BITS) + 1),
+				[c2] "i" (SCALE_OUT_BITS+1)
+			: "wr0", "wr1", "wr2", "wr3", "wr4", "wr5", "wr6",
+			  "cc", "memory");
+		}
+	}
+}
+
+void sbc_init_primitives_iwmmxt(struct sbc_encoder_state *state)
+{	/* Point the encoder state's function hooks at the iwMMXt routines in this file */
+	state->sbc_analyze_4b_4s = sbc_analyze_4b_4s_iwmmxt;
+	state->sbc_analyze_4b_8s = sbc_analyze_4b_8s_iwmmxt;
+	state->sbc_calc_scalefactors = sbc_calc_scalefactors_iwmmxt2;	/* this routine executes wabsw */
+	state->implementation_info = "IWMMXT";
+}
+
+#endif
diff --git a/sbc/sbc_primitives_iwmmxt.h b/sbc/sbc_primitives_iwmmxt.h
new file mode 100644
index 0000000..827d811
--- /dev/null
+++ b/sbc/sbc_primitives_iwmmxt.h
@@ -0,0 +1,38 @@
+/*
+ *
+ *  Bluetooth low-complexity, subband codec (SBC) library
+ *
+ *  Based on sbc_primitives_mmx.c
+ *
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#ifndef __SBC_PRIMITIVES_IWMMXT_H
+#define __SBC_PRIMITIVES_IWMMXT_H
+
+#include "sbc_primitives.h"
+
+#if defined(__GNUC__) && defined(__IWMMXT__) && \
+		!defined(SBC_HIGH_PRECISION) && (SCALE_OUT_BITS == 15)	/* NOTE(review): __IWMMXT__ alone does not show whether wabsw is available - confirm for plain iwmmxt targets */
+
+#define SBC_BUILD_WITH_IWMMXT_SUPPORT
+
+void sbc_init_primitives_iwmmxt(struct sbc_encoder_state *encoder_state);	/* declared only when the conditions above hold */
+
+#endif /* iwMMXt build conditions */
+
+#endif /* __SBC_PRIMITIVES_IWMMXT_H */
--
To unsubscribe from this list: send the line "unsubscribe linux-bluetooth" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [Bluez Devel]     [Linux Wireless Networking]     [Linux Wireless Personal Area Networking]     [Linux ATH6KL]     [Linux USB Devel]     [Linux Media Drivers]     [Linux Audio Users]     [Linux Kernel]     [Linux SCSI]     [Big List of Linux Books]

  Powered by Linux