[PATCH v2 3/6] core: add ARM NEON optimized sample conversion code

pmeerw@xxxxxxxxxx (Peter Meerwald) · Wed, 22 Feb 2012 12:45:04 +0100

From: Peter Meerwald <p.meerwald@xxxxxxxxxxxxxxxxxx>

v2: 
* load and store data with vld1/vld1q and vst1/vst1q, resp., to work
  around alignment issues of compiler-generated vldmia instructions
* remove redundant check for NEON flags

Ubuntu/Linaro gcc 4.6.1
arm-linux-gnueabi-gcc -O2 -mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon

runtime on beagle-xm, 800 MHz

checking NEON sconv_s16le_from_float(1020)
NEON: 3510 usec.
ref: 60731 usec.
checking NEON sconv_s16le_to_float(1020)
NEON: 1800 usec.
ref: 10254 usec.

---
 src/Makefile.am            |    2 +-
 src/pulsecore/sconv_neon.c |  188 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 189 insertions(+), 1 deletions(-)
 create mode 100644 src/pulsecore/sconv_neon.c

diff --git a/src/Makefile.am b/src/Makefile.am
index a6f9640..497618a 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -826,7 +826,7 @@ libpulsecore_@PA_MAJORMINOR@_la_SOURCES = \
 		pulsecore/svolume_neon.c \
 		pulsecore/sconv-s16be.c pulsecore/sconv-s16be.h \
 		pulsecore/sconv-s16le.c pulsecore/sconv-s16le.h \
-		pulsecore/sconv_sse.c \
+		pulsecore/sconv_sse.c pulsecore/sconv_neon.c \
 		pulsecore/sconv.c pulsecore/sconv.h \
 		pulsecore/shared.c pulsecore/shared.h \
 		pulsecore/sink-input.c pulsecore/sink-input.h \
diff --git a/src/pulsecore/sconv_neon.c b/src/pulsecore/sconv_neon.c
new file mode 100644
index 0000000..c21d8e3
--- /dev/null
+++ b/src/pulsecore/sconv_neon.c
@@ -0,0 +1,188 @@
+/***
+  This file is part of PulseAudio.
+
+  Copyright 2012 Peter Meerwald <p.meerwald at bct-electronic.com>
+
+  PulseAudio is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Lesser General Public License as published
+  by the Free Software Foundation; either version 2.1 of the License,
+  or (at your option) any later version.
+
+  PulseAudio is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with PulseAudio; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+  USA.
+***/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <pulse/rtclock.h>
+
+#include <pulsecore/macro.h>
+#include <pulsecore/endianmacros.h>
+
+#include "cpu-arm.h"
+#include "sconv.h"
+
+#if defined(__ARM_NEON__)
+
+#include <math.h>
+#include <arm_neon.h>
+
+#define RUN_TEST /* enables the self-test/benchmark code below; NOTE(review): defined unconditionally -- confirm intended */
+
+static void pa_sconv_s16le_from_f32ne_neon(unsigned n, const float *a, int16_t *b) {
+    unsigned i;
+    /* Convert n floats from a[] to 16-bit samples in b[]: clamp to [-1, 1], then scale by 32767. */
+    const float32x4_t plusone4 = vdupq_n_f32(1.0f);
+    const float32x4_t minusone4 = vdupq_n_f32(-1.0f);
+    const float32x4_t half4 = vdupq_n_f32(0.5f);
+    const float32x4_t scale4 = vdupq_n_f32(32767.0f);
+    const uint32x4_t mask4 = vdupq_n_u32(0x80000000);
+    /* Vector path: whole groups of four samples; n & ~3 rounds n down to a multiple of 4. */
+    for (i = 0; i < (n & ~3); i += 4) {
+        const float32x4_t v4 =
+            vmulq_f32(vmaxq_f32(vminq_f32(vld1q_f32(&a[i]), plusone4) , minusone4), scale4);
+        /* w4 = 0.5 combined with the sign bit of each v4 lane, i.e. +0.5 or -0.5 per lane. */
+        const float32x4_t w4 = vreinterpretq_f32_u32(vorrq_u32(vandq_u32(
+                vreinterpretq_u32_f32(v4), mask4), vreinterpretq_u32_f32(half4)));
+        /* Add the signed half, convert to 32-bit integers, narrow to 16 bits and store. */
+        vst1_s16(&b[i], vmovn_s32(vcvtq_s32_f32(vaddq_f32(v4, w4))));
+    }
+    /* NOTE(review): the tail uses lrintf() while the vector path adds +/-0.5 first; ties may round differently -- confirm. */
+    // leftovers: the remaining n % 4 samples, converted one at a time
+    for ( ; i < n; i++) {
+        b[i] = (int16_t) lrintf(PA_CLAMP_UNLIKELY(a[i], -1.0f, 1.0f) * 0x7FFF);
+    }
+}
+
+static void pa_sconv_s16le_to_f32ne_neon(unsigned n, const int16_t *a, float *b) {
+    unsigned i;
+    const float32x4_t invscale4 = vdupq_n_f32(1.0f / 0x7FFF);
+    const float invscale = 1.0f / 0x7FFF;
+    /* Convert n 16-bit samples from a[] to floats in b[], scaling each by 1/32767. */
+    for (i = 0; i < (n & ~3); i += 4) { /* vector path: four samples per iteration */
+        int16x4_t v4 = vld1_s16(&a[i]);
+        vst1q_f32(&b[i], vmulq_f32(vcvtq_f32_s32(vmovl_s16(v4)), invscale4)); /* widen to 32 bits, to float, scale */
+    }
+
+    // leftovers: the remaining n % 4 samples, converted one at a time
+    for ( ; i < n; i++) {
+        b[i] = a[i] * invscale;
+    }
+}
+
+#ifdef RUN_TEST
+#define SAMPLES 1019 /* test buffer length; 1019 % 4 == 3, so the scalar leftover loops run too */
+#define TIMES 300 /* number of passes over the buffer in each timing loop */
+
+static void run_test_from(void) {
+    float input[SAMPLES];
+    int16_t out_neon[SAMPLES];
+    int16_t out_ref[SAMPLES];
+    pa_convert_func_t ref_conv;
+    pa_usec_t t0, elapsed;
+    int k;
+    /* Self-test: check the NEON float->s16 converter against the reference, then time both. */
+    pa_log_debug("checking NEON sconv_s16le_from_float");
+
+    memset(out_ref, 0, sizeof(out_ref));
+    memset(out_neon, 0, sizeof(out_neon));
+    /* Random input spanning about [-1.05, 1.05], i.e. slightly beyond the clamp range. */
+    for (k = 0; k < SAMPLES; k++) {
+        input[k] = 2.1f * (rand()/(float) RAND_MAX - 0.5f);
+    }
+
+    ref_conv = (pa_convert_func_t) pa_get_convert_from_float32ne_function(PA_SAMPLE_S16LE);
+    ref_conv(SAMPLES, input, out_ref);
+    pa_sconv_s16le_from_f32ne_neon(SAMPLES, input, out_neon);
+    /* Log every index at which the two implementations disagree. */
+    for (k = 0; k < SAMPLES; k++) {
+        if (out_neon[k] == out_ref[k]) {
+            continue;
+        }
+        pa_log_debug("%d: %d != %d (%f)", k, out_neon[k], out_ref[k], input[k]);
+    }
+    /* Time TIMES passes of the NEON converter. */
+    t0 = pa_rtclock_now();
+    for (k = 0; k < TIMES; k++) {
+        pa_sconv_s16le_from_f32ne_neon(SAMPLES, input, out_neon);
+    }
+    elapsed = pa_rtclock_now() - t0;
+    pa_log_info("NEON: %llu usec.", (long long unsigned int) elapsed);
+    /* Time TIMES passes of the reference converter for comparison. */
+    t0 = pa_rtclock_now();
+    for (k = 0; k < TIMES; k++) {
+        ref_conv(SAMPLES, input, out_ref);
+    }
+    elapsed = pa_rtclock_now() - t0;
+    pa_log_info("ref: %llu usec.", (long long unsigned int) elapsed);
+}
+
+static void run_test_to(void) {
+    int16_t samples[SAMPLES];
+    float floats[SAMPLES];
+    float floats_ref[SAMPLES];
+    int i;
+    pa_usec_t start, stop;
+    pa_convert_func_t func;
+    /* Self-test: check the NEON s16->float converter against the reference, then time both. */
+    pa_log_debug("checking NEON sconv_s16le_to_float");
+
+    memset(floats_ref, 0, sizeof(floats_ref));
+    memset(floats, 0, sizeof(floats)); /* clear the whole output array */
+    /* Random input; the int expression is narrowed to int16_t on assignment. */
+    for (i = 0; i < SAMPLES; i++) {
+        samples[i] = rand() - RAND_MAX/2;
+    }
+
+    func = (pa_convert_func_t) pa_get_convert_to_float32ne_function(PA_SAMPLE_S16LE);
+    func(SAMPLES, samples, floats_ref);
+    pa_sconv_s16le_to_f32ne_neon(SAMPLES, samples, floats);
+    /* Log every index where the two results differ by more than a small tolerance. */
+    for (i = 0; i < SAMPLES; i++) {
+        if (fabsf(floats[i] - floats_ref[i]) > 0.00001) {
+            pa_log_debug("%d: %.8f != %.8f (%d)", i, floats[i], floats_ref[i],
+                      samples[i]);
+        }
+    }
+    /* Time TIMES passes of the NEON converter. */
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        pa_sconv_s16le_to_f32ne_neon(SAMPLES, samples, floats);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+    /* Time TIMES passes of the reference converter for comparison. */
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        func(SAMPLES, samples, floats_ref);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+#endif /* RUN_TEST */
+
+#endif /* defined(__ARM_NEON__) */
+
+void pa_convert_func_init_neon(pa_cpu_arm_flag_t flags) {
+#if defined (__ARM_NEON__)
+    /* Registers the NEON converters for PA_SAMPLE_S16LE; the body is empty when __ARM_NEON__ is not defined. */
+#ifdef RUN_TEST
+    run_test_from();
+    run_test_to();
+#endif
+    /* NOTE(review): 'flags' is not examined here -- presumably the caller has already checked for NEON; confirm. */
+    pa_log_info("Initialising ARM NEON optimized conversions.");
+    pa_set_convert_from_float32ne_function(PA_SAMPLE_S16LE, (pa_convert_func_t) pa_sconv_s16le_from_f32ne_neon);
+    pa_set_convert_to_float32ne_function(PA_SAMPLE_S16LE, (pa_convert_func_t) pa_sconv_s16le_to_f32ne_neon);
+
+#endif /* defined (__ARM_NEON__) */
+}
-- 
1.7.4.1