[PATCH 4/6 v3] core: Add ARM NEON optimized mono-to-stereo/stereo-to-mono remapping code

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Peter Meerwald <p.meerwald@xxxxxxxxxxxxxxxxxx>

v3:
* fix test code: init float and int map_table
* use a different code path for Cortex-A8 than for later cores (Cortex-A9, Cortex-A15, unknown)
* convert from intrinsics to inline assembly
v2:
* add ARM NEON stereo-to-mono remapping code
* static __attribute__ ((noinline)) is necessary to prevent inlining and
  work around gcc 4.6 ICE, see https://bugs.launchpad.net/bugs/936863
* call test code, the reference implementation is obtained using
  pa_get_init_remap_func()
* remove check for NEON flags
v1:
* ARM NEON mono-to-stereo remapping code

compiled with Ubuntu/Linaro gcc 4.6.3:
arm-linux-gnueabi-gcc -O2 -mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon

runtime on beagle-xm:

D: [pulseaudio] remap_neon.c: checking NEON remap_stereo_to_mono(float)
I: [pulseaudio] remap.c: Using stereo to mono remapping
I: [pulseaudio] remap_neon.c: NEON: 3082 usec.
I: [pulseaudio] remap_neon.c: ref: 24201 usec.
D: [pulseaudio] remap_neon.c: checking NEON remap_stereo_to_mono(s16)
I: [pulseaudio] remap.c: Using stereo to mono remapping
I: [pulseaudio] remap_neon.c: NEON: 1190 usec.
I: [pulseaudio] remap_neon.c: ref: 5615 usec.
D: [pulseaudio] remap_neon.c: checking NEON remap_mono_to_stereo(float)
I: [pulseaudio] remap.c: Using mono to stereo remapping
I: [pulseaudio] remap_neon.c: NEON/A8: 2350 usec.
I: [pulseaudio] remap_neon.c: NEON/A9: 4730 usec.
I: [pulseaudio] remap_neon.c: ref: 3601 usec.
D: [pulseaudio] remap_neon.c: checking NEON remap_mono_to_stereo(s16)
I: [pulseaudio] remap.c: Using mono to stereo remapping
I: [pulseaudio] remap_neon.c: NEON: 1403 usec.
I: [pulseaudio] remap_neon.c: ref: 3724 usec.
I: [pulseaudio] remap_neon.c: Initialising ARM NEON optimized remappers.

Signed-off-by: Peter Meerwald <p.meerwald at bct-electronic.com>
---
 src/Makefile.am            |    1 +
 src/pulsecore/remap_neon.c |  528 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 529 insertions(+), 0 deletions(-)
 create mode 100644 src/pulsecore/remap_neon.c

diff --git a/src/Makefile.am b/src/Makefile.am
index 63ad837..df25efc 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -819,6 +819,7 @@ libpulsecore_@PA_MAJORMINOR@_la_SOURCES = \
 		pulsecore/play-memchunk.c pulsecore/play-memchunk.h \
 		pulsecore/remap.c pulsecore/remap.h \
 		pulsecore/remap_mmx.c pulsecore/remap_sse.c \
+		pulsecore/remap_neon.c \
 		pulsecore/resampler.c pulsecore/resampler.h \
 		pulsecore/rtpoll.c pulsecore/rtpoll.h \
 		pulsecore/sample-util.c pulsecore/sample-util.h \
diff --git a/src/pulsecore/remap_neon.c b/src/pulsecore/remap_neon.c
new file mode 100644
index 0000000..0ecced1
--- /dev/null
+++ b/src/pulsecore/remap_neon.c
@@ -0,0 +1,528 @@
+/***
+  This file is part of PulseAudio.
+
+  Copyright 2012 Peter Meerwald <p.meerwald at bct-electronic.com>
+
+  PulseAudio is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Lesser General Public License as published
+  by the Free Software Foundation; either version 2.1 of the License,
+  or (at your option) any later version.
+
+  PulseAudio is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+***/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <pulse/rtclock.h>
+#include <pulse/sample.h>
+#include <pulsecore/log.h>
+#include <pulsecore/macro.h>
+
+#include "cpu-arm.h"
+#include "remap.h"
+
+#if defined(__ARM_NEON__)
+
+#include <math.h>
+#include <arm_neon.h>
+
+/* When defined, the run_test_*() self-tests/benchmarks in this file are built
+ * and pa_remap_func_init_neon() runs them before installing the NEON
+ * remappers. */
+#define RUN_TEST
+
+/* Expand n mono float samples from src into n interleaved stereo frames in
+ * dst, i.e. dst[2*k] = dst[2*k+1] = src[k]. dst must hold 2*n floats.
+ *
+ * This is the variant init_remap_neon() installs when the Cortex-A8 flag is
+ * set. Four samples are handled per NEON iteration: they are loaded into q0,
+ * copied to q1, and vst2.32 interleaves both registers while storing. The
+ * n & 3 left-over samples are handled by the scalar loop at the end. */
+static inline void mono_to_stereo_float_neon_a8(float *dst, const float *src, unsigned n) {
+    int i = n & 3;
+
+    /* The assembly loop decrements and tests its counter only at the bottom,
+     * so it always executes at least once. Skip it when there is no complete
+     * group of four samples, otherwise it reads and writes past the buffers. */
+    if (n >= 4) {
+        __asm__ __volatile__ (
+        "mov        %[n], %[n], lsr #2\n\t"
+        "1:\n\t"
+        "vld1.32    {q0}, [%[src]]!\n\t"
+        "vmov       q1, q0\n\t"
+        "subs       %[n], %[n], #1\n\t"
+        "vst2.32    {q0,q1}, [%[dst]]!\n\t"
+        "bgt        1b\n\t"
+          /* output operands (or input operands that get modified) */
+        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
+        : /* input operands */
+        : "memory", "cc", "q0", "q1" /* clobber list */
+        );
+    }
+
+    /* scalar tail: dst and src were advanced by the assembly above */
+    while (i--) {
+        dst[0] = dst[1] = src[0];
+        src++;
+        dst += 2;
+    }
+}
+
+/* Expand n mono float samples from src into n interleaved stereo frames in
+ * dst, i.e. dst[2*k] = dst[2*k+1] = src[k]. dst must hold 2*n floats.
+ *
+ * This is the variant init_remap_neon() installs when the Cortex-A8 flag is
+ * not set. It moves the 32-bit samples through the core registers r4-r7
+ * (ldm/stm) instead of NEON registers, two samples per iteration; an odd
+ * trailing sample is handled by the scalar loop at the end.
+ *
+ * NOTE(review): r7 is named in the clobber list; GCC may reject that in
+ * builds where r7 serves as the frame pointer -- confirm the build flags. */
+static inline void mono_to_stereo_float_neon_a9(float *dst, const float *src, unsigned n) {
+    int i = n & 1;
+
+    /* The assembly loop decrements and tests its counter only at the bottom,
+     * so it always executes at least once. Skip it when there is no complete
+     * pair of samples, otherwise it reads and writes past the buffers. */
+    if (n >= 2) {
+        __asm__ __volatile__ (
+        "mov        %[n], %[n], lsr #1\n\t"
+        "1:\n\t"
+        "ldm        %[src]!, {r4,r6}\n\t"
+        "mov        r5, r4\n\t"
+        "mov        r7, r6\n\t"
+        "subs       %[n], %[n], #1\n\t"
+        "stm        %[dst]!, {r4-r7}\n\t"
+        "bgt        1b\n\t"
+          /* output operands (or input operands that get modified) */
+        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
+        : /* input operands */
+        : "memory", "cc", "r4", "r5", "r6", "r7" /* clobber list */
+        );
+    }
+
+    /* scalar tail: dst and src were advanced by the assembly above */
+    while (i--) {
+        dst[0] = dst[1] = src[0];
+        src++;
+        dst += 2;
+    }
+}
+
+/* Expand n mono 16-bit samples from src into n interleaved stereo frames in
+ * dst, i.e. dst[2*k] = dst[2*k+1] = src[k]. dst must hold 2*n samples.
+ *
+ * Eight samples are handled per NEON iteration: they are loaded into q0,
+ * copied to q1, and vst2.16 interleaves both registers while storing. The
+ * n & 7 left-over samples are handled by the scalar loop at the end. */
+static inline void mono_to_stereo_int16_neon(int16_t *dst, const int16_t *src, unsigned n) {
+    int i = n & 7;
+
+    /* The assembly loop decrements and tests its counter only at the bottom,
+     * so it always executes at least once. Skip it when there is no complete
+     * group of eight samples, otherwise it reads and writes past the buffers. */
+    if (n >= 8) {
+        __asm__ __volatile__ (
+        "mov        %[n], %[n], lsr #3\n\t"
+        "1:\n\t"
+        "vld1.16	{q0}, [%[src]]!\n\t"
+        "vmov	    q1, q0\n\t"
+        "subs	    %[n], %[n], #1\n\t"
+        "vst2.16	{q0,q1}, [%[dst]]!\n\t"
+        "bgt	    1b\n\t"
+          /* output operands (or input operands that get modified) */
+        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
+        : /* input operands */
+        : "memory", "cc", "q0", "q1" /* clobber list */
+        );
+    }
+
+    /* scalar tail: dst and src were advanced by the assembly above */
+    while (i--) {
+        dst[0] = dst[1] = src[0];
+        src++;
+        dst += 2;
+    }
+}
+
+/* Mono-to-stereo entry point for the "A9" code path: picks the sample-type
+ * specific worker according to *m->format. Only native-endian float32 and
+ * signed 16-bit samples are supported; any other format hits
+ * pa_assert_not_reached(). n is passed through unchanged to the worker. */
+static void remap_mono_to_stereo_neon_a9(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+    if (*m->format == PA_SAMPLE_FLOAT32NE) {
+        mono_to_stereo_float_neon_a9(dst, src, n);
+        return;
+    }
+
+    if (*m->format == PA_SAMPLE_S16NE) {
+        mono_to_stereo_int16_neon(dst, src, n);
+        return;
+    }
+
+    pa_assert_not_reached();
+}
+
+/* Mono-to-stereo entry point for the Cortex-A8 code path: picks the
+ * sample-type specific worker according to *m->format. Only native-endian
+ * float32 and signed 16-bit samples are supported; any other format hits
+ * pa_assert_not_reached(). n is passed through unchanged to the worker. */
+static void remap_mono_to_stereo_neon_a8(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+    if (*m->format == PA_SAMPLE_FLOAT32NE) {
+        mono_to_stereo_float_neon_a8(dst, src, n);
+        return;
+    }
+
+    if (*m->format == PA_SAMPLE_S16NE) {
+        mono_to_stereo_int16_neon(dst, src, n);
+        return;
+    }
+
+    pa_assert_not_reached();
+}
+
+/* Mix n interleaved stereo float frames from src down to n mono samples in
+ * dst: dst[k] = src[2*k] + src[2*k+1] (plain sum, no scaling). src must hold
+ * 2*n floats.
+ *
+ * Four frames are handled per NEON iteration: vld2.32 de-interleaves them
+ * into q0 (first channel) and q1 (second channel), which are then added.
+ * The n & 3 left-over frames are handled by the scalar loop at the end. */
+static inline void stereo_to_mono_float_neon(float *dst, const float *src, unsigned n) {
+    int i = n & 3;
+
+    /* The assembly loop decrements and tests its counter only at the bottom,
+     * so it always executes at least once. Skip it when there is no complete
+     * group of four frames, otherwise it reads and writes past the buffers. */
+    if (n >= 4) {
+        __asm__ __volatile__ (
+        "mov        %[n], %[n], lsr #2\n\t"
+        "1:\n\t"
+        "vld2.32    {q0,q1}, [%[src]]!\n\t"
+        "vadd.f32    q0, q0, q1\n\t"
+        "subs       %[n], %[n], #1\n\t"
+        "vst1.32    {q0}, [%[dst]]!\n\t"
+        "bgt        1b\n\t"
+          /* output operands (or input operands that get modified) */
+        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
+        : /* input operands */
+          /* q0 and q1 are overwritten above and must be declared, otherwise
+           * the compiler may keep live values in them across the asm */
+        : "memory", "cc", "q0", "q1" /* clobber list */
+        );
+    }
+
+    /* scalar tail: dst and src were advanced by the assembly above */
+    while (i--) {
+        dst[0] = src[0] + src[1];
+        src += 2;
+        dst++;
+    }
+}
+
+/* Mix n interleaved stereo 16-bit frames from src down to n mono samples in
+ * dst: dst[k] = src[2*k] + src[2*k+1]. src must hold 2*n samples. The sum is
+ * neither halved nor saturated (vadd.s16 is a plain, non-saturating add), so
+ * loud inputs can overflow the 16-bit result.
+ *
+ * Eight frames are handled per NEON iteration: vld2.16 de-interleaves them
+ * into q0 (first channel) and q1 (second channel), which are then added.
+ * The n & 7 left-over frames are handled by the scalar loop at the end. */
+static inline void stereo_to_mono_int16_neon(int16_t *dst, const int16_t *src, unsigned n) {
+    int i = n & 7;
+
+    /* The assembly loop decrements and tests its counter only at the bottom,
+     * so it always executes at least once. Skip it when there is no complete
+     * group of eight frames, otherwise it reads and writes past the buffers. */
+    if (n >= 8) {
+        __asm__ __volatile__ (
+        "mov        %[n], %[n], lsr #3\n\t"
+        "1:\n\t"
+        "vld2.16    {q0,q1}, [%[src]]!\n\t"
+        "vadd.s16    q0, q0, q1\n\t"
+        "subs       %[n], %[n], #1\n\t"
+        "vst1.16    {q0}, [%[dst]]!\n\t"
+        "bgt        1b\n\t"
+          /* output operands (or input operands that get modified) */
+        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
+        : /* input operands */
+          /* q0 and q1 are overwritten above and must be declared, otherwise
+           * the compiler may keep live values in them across the asm */
+        : "memory", "cc", "q0", "q1" /* clobber list */
+        );
+    }
+
+    /* scalar tail: dst and src were advanced by the assembly above */
+    while (i--) {
+        dst[0] = src[0] + src[1];
+        src += 2;
+        dst++;
+    }
+}
+
+/* Stereo-to-mono entry point: picks the sample-type specific worker
+ * according to *m->format. Only native-endian float32 and signed 16-bit
+ * samples are supported; any other format hits pa_assert_not_reached().
+ * n is passed through unchanged to the worker. */
+static void remap_stereo_to_mono_neon(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+    if (*m->format == PA_SAMPLE_FLOAT32NE) {
+        stereo_to_mono_float_neon(dst, src, n);
+        return;
+    }
+
+    if (*m->format == PA_SAMPLE_S16NE) {
+        stereo_to_mono_int16_neon(dst, src, n);
+        return;
+    }
+
+    pa_assert_not_reached();
+}
+
+#ifdef RUN_TEST
+/* Number of mono samples (stereo frames) in each test buffer. 1019 is not a
+ * multiple of 2, 4 or 8, so the scalar tail loops of the NEON routines run
+ * as well as the vector loops. */
+#define SAMPLES 1019
+/* Number of calls made per routine in each timing loop. */
+#define TIMES 1000
+
+/* Self-test and benchmark for the float mono-to-stereo remappers.
+ *
+ * Fills a mono buffer with random values in [-0.5, 0.5], obtains a reference
+ * result from the remapper set up by the currently installed init function
+ * (pa_get_init_remap_func()), and compares the output of both the A9 and A8
+ * variants against it. The first mismatch per variant is logged at debug
+ * level; the test never aborts. Afterwards each implementation is called
+ * TIMES times and the elapsed time is logged. */
+static void run_test_float_mono_to_stereo(void) {
+    float stereo[2*SAMPLES];
+    float stereo_ref[2*SAMPLES];
+    float mono[SAMPLES];
+    int i;
+    pa_usec_t start, stop;
+    pa_sample_format_t sf;
+    pa_remap_t remap;
+    pa_sample_spec iss, oss;
+
+    pa_init_remap_func_t remap_init_func = pa_get_init_remap_func();
+
+    pa_log_debug("checking NEON remap_mono_to_stereo(float)");
+
+    memset(stereo_ref, 0, sizeof(stereo_ref));
+    memset(stereo, 0, sizeof(stereo));
+
+    for (i = 0; i < SAMPLES; i++) {
+        mono[i] = rand()/(float) RAND_MAX - 0.5f;
+    }
+
+    /* Start from an all-zero remap: the init function must see a defined map
+     * table, and the do_remap check below must not read an indeterminate
+     * pointer if the init function leaves it unset. */
+    memset(&remap, 0, sizeof(remap));
+
+    sf = PA_SAMPLE_FLOAT32NE;
+    remap.format = &sf;
+    iss.format = PA_SAMPLE_FLOAT32NE;
+    iss.channels = 1;
+    oss.format = PA_SAMPLE_FLOAT32NE;
+    oss.channels = 2;
+    remap.i_ss = &iss;
+    remap.o_ss = &oss;
+    remap.map_table_f[0][0] = 1.0;
+    remap.map_table_f[1][0] = 1.0;
+    remap.map_table_i[0][0] = 0x10000;
+    remap.map_table_i[1][0] = 0x10000;
+    remap_init_func(&remap);
+    if (!remap.do_remap) {
+        pa_log_debug("no reference remapping function, abort test");
+        return;
+    }
+
+    remap.do_remap(&remap, stereo_ref, mono, SAMPLES);
+    remap_mono_to_stereo_neon_a9(&remap, stereo, mono, SAMPLES);
+    for (i = 0; i < 2*SAMPLES; i++) {
+        if (fabsf(stereo[i] - stereo_ref[i]) > 0.00001) {
+            pa_log_debug("A9 %d: %.3f != %.3f (%.3f)", i, stereo[i], stereo_ref[i],
+                      mono[i/2]);
+            break;
+        }
+    }
+
+    /* Clear the output first so results left behind by the A9 run cannot
+     * hide a failure of the A8 variant. */
+    memset(stereo, 0, sizeof(stereo));
+    remap_mono_to_stereo_neon_a8(&remap, stereo, mono, SAMPLES);
+    for (i = 0; i < 2*SAMPLES; i++) {
+        if (fabsf(stereo[i] - stereo_ref[i]) > 0.00001) {
+            pa_log_debug("A8 %d: %.3f != %.3f (%.3f)", i, stereo[i], stereo_ref[i],
+                      mono[i/2]);
+            break;
+        }
+    }
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        remap_mono_to_stereo_neon_a8(&remap, stereo, mono, SAMPLES);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON/A8: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        remap_mono_to_stereo_neon_a9(&remap, stereo, mono, SAMPLES);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON/A9: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        remap.do_remap(&remap, stereo_ref, mono, SAMPLES);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+
+/* Self-test and benchmark for the 16-bit mono-to-stereo remappers.
+ *
+ * Fills a mono buffer with random values, obtains a reference result from
+ * the remapper set up by the currently installed init function
+ * (pa_get_init_remap_func()), and requires the output of both the A9 and A8
+ * entry points to match it exactly. The first mismatch per variant is
+ * logged at debug level; the test never aborts. Afterwards the A8 entry
+ * point and the reference are each called TIMES times and the elapsed time
+ * is logged. */
+static void run_test_s16_mono_to_stereo(void) {
+    int16_t stereo[2*SAMPLES];
+    int16_t stereo_ref[2*SAMPLES];
+    int16_t mono[SAMPLES];
+    int i;
+    pa_usec_t start, stop;
+    pa_sample_format_t sf;
+    pa_remap_t remap;
+    pa_sample_spec iss, oss;
+
+    pa_init_remap_func_t remap_init_func = pa_get_init_remap_func();
+
+    pa_log_debug("checking NEON remap_mono_to_stereo(s16)");
+
+    memset(stereo_ref, 0, sizeof(stereo_ref));
+    memset(stereo, 0, sizeof(stereo));
+
+    for (i = 0; i < SAMPLES; i++) {
+        mono[i] = rand() - RAND_MAX/2;
+    }
+
+    /* Start from an all-zero remap: the init function must see a defined map
+     * table, and the do_remap check below must not read an indeterminate
+     * pointer if the init function leaves it unset. */
+    memset(&remap, 0, sizeof(remap));
+
+    sf = PA_SAMPLE_S16NE;
+    remap.format = &sf;
+    iss.format = PA_SAMPLE_S16NE;
+    iss.channels = 1;
+    oss.format = PA_SAMPLE_S16NE;
+    oss.channels = 2;
+    remap.i_ss = &iss;
+    remap.o_ss = &oss;
+    remap.map_table_f[0][0] = 1.0;
+    remap.map_table_f[1][0] = 1.0;
+    remap.map_table_i[0][0] = 0x10000;
+    remap.map_table_i[1][0] = 0x10000;
+    remap_init_func(&remap);
+    if (!remap.do_remap) {
+        pa_log_debug("no reference remapping function, abort test");
+        return;
+    }
+
+    remap.do_remap(&remap, stereo_ref, mono, SAMPLES);
+
+    remap_mono_to_stereo_neon_a9(&remap, stereo, mono, SAMPLES);
+    for (i = 0; i < 2*SAMPLES; i++) {
+        if (abs(stereo[i] - stereo_ref[i]) > 0) {
+            pa_log_debug("A9 %d: %d != %d (%d)", i, stereo[i], stereo_ref[i],
+                      mono[i/2]);
+            break;
+        }
+    }
+
+    /* Clear the output first so results left behind by the A9 run cannot
+     * hide a failure of the A8 variant. */
+    memset(stereo, 0, sizeof(stereo));
+    remap_mono_to_stereo_neon_a8(&remap, stereo, mono, SAMPLES);
+    for (i = 0; i < 2*SAMPLES; i++) {
+        if (abs(stereo[i] - stereo_ref[i]) > 0) {
+            pa_log_debug("A8 %d: %d != %d (%d)", i, stereo[i], stereo_ref[i],
+                      mono[i/2]);
+            break;
+        }
+    }
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        remap_mono_to_stereo_neon_a8(&remap, stereo, mono, SAMPLES);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        remap.do_remap(&remap, stereo_ref, mono, SAMPLES);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+
+/* Self-test and benchmark for the float stereo-to-mono remapper.
+ *
+ * Fills a stereo buffer with random values in [-0.5, 0.5], obtains a
+ * reference result from the remapper set up by the currently installed init
+ * function (pa_get_init_remap_func()), and compares the NEON output against
+ * it. The first mismatch is logged at debug level; the test never aborts.
+ * Afterwards both implementations are called TIMES times and the elapsed
+ * time is logged. */
+static void run_test_float_stereo_to_mono(void) {
+    float stereo[2*SAMPLES];
+    float mono_ref[SAMPLES];
+    float mono[SAMPLES];
+    int i;
+    pa_usec_t start, stop;
+    pa_sample_format_t sf;
+    pa_remap_t remap;
+    pa_sample_spec iss, oss;
+
+    pa_init_remap_func_t remap_init_func = pa_get_init_remap_func();
+
+    pa_log_debug("checking NEON remap_stereo_to_mono(float)");
+
+    memset(mono_ref, 0, sizeof(mono_ref));
+    memset(mono, 0, sizeof(mono));
+
+    for (i = 0; i < 2*SAMPLES; i++) {
+        stereo[i] = rand()/(float) RAND_MAX - 0.5f;
+    }
+
+    /* Start from an all-zero remap: the init function must see a defined map
+     * table, and the do_remap check below must not read an indeterminate
+     * pointer if the init function leaves it unset. */
+    memset(&remap, 0, sizeof(remap));
+
+    sf = PA_SAMPLE_FLOAT32NE;
+    remap.format = &sf;
+    iss.format = PA_SAMPLE_FLOAT32NE;
+    iss.channels = 2;
+    oss.format = PA_SAMPLE_FLOAT32NE;
+    oss.channels = 1;
+    remap.i_ss = &iss;
+    remap.o_ss = &oss;
+    remap.map_table_f[0][0] = 1.0;
+    remap.map_table_f[0][1] = 1.0;
+    remap.map_table_i[0][0] = 0x10000;
+    remap.map_table_i[0][1] = 0x10000;
+    remap_init_func(&remap);
+    if (!remap.do_remap) {
+        pa_log_debug("no reference remapping function, abort test");
+        return;
+    }
+
+    remap.do_remap(&remap, mono_ref, stereo, SAMPLES);
+    remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES);
+
+    for (i = 0; i < SAMPLES; i++) {
+        if (fabsf(mono[i] - mono_ref[i]) > 0.00001) {
+            pa_log_debug("%d: %.3f != %.3f (%.3f %.3f)", i, mono[i], mono_ref[i],
+                      stereo[2*i+0], stereo[2*i+1]);
+            break;
+        }
+    }
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        remap.do_remap(&remap, mono_ref, stereo, SAMPLES);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+
+/* Self-test and benchmark for the 16-bit stereo-to-mono remapper.
+ *
+ * Fills a stereo buffer with random values, obtains a reference result from
+ * the remapper set up by the currently installed init function
+ * (pa_get_init_remap_func()), and requires the NEON output to match it
+ * exactly. The first mismatch is logged at debug level; the test never
+ * aborts. Afterwards both implementations are called TIMES times and the
+ * elapsed time is logged. */
+static void run_test_s16_stereo_to_mono(void) {
+    int16_t stereo[2*SAMPLES];
+    int16_t mono_ref[SAMPLES];
+    int16_t mono[SAMPLES];
+    int i;
+    pa_usec_t start, stop;
+    pa_sample_format_t sf;
+    pa_remap_t remap;
+    pa_sample_spec iss, oss;
+
+    pa_init_remap_func_t remap_init_func = pa_get_init_remap_func();
+
+    pa_log_debug("checking NEON remap_stereo_to_mono(s16)");
+
+    memset(mono_ref, 0, sizeof(mono_ref));
+    memset(mono, 0, sizeof(mono));
+
+    for (i = 0; i < 2*SAMPLES; i++) {
+        stereo[i] = rand() - RAND_MAX/2;
+    }
+
+    /* Start from an all-zero remap: the init function must see a defined map
+     * table, and the do_remap check below must not read an indeterminate
+     * pointer if the init function leaves it unset. */
+    memset(&remap, 0, sizeof(remap));
+
+    sf = PA_SAMPLE_S16NE;
+    remap.format = &sf;
+    iss.format = PA_SAMPLE_S16NE;
+    iss.channels = 2;
+    oss.format = PA_SAMPLE_S16NE;
+    oss.channels = 1;
+    remap.i_ss = &iss;
+    remap.o_ss = &oss;
+    remap.map_table_f[0][0] = 1.0;
+    remap.map_table_f[0][1] = 1.0;
+    remap.map_table_i[0][0] = 0x10000;
+    remap.map_table_i[0][1] = 0x10000;
+    remap_init_func(&remap);
+    if (!remap.do_remap) {
+        pa_log_debug("no reference remapping function, abort test");
+        return;
+    }
+
+    remap.do_remap(&remap, mono_ref, stereo, SAMPLES);
+    remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES);
+
+    for (i = 0; i < SAMPLES; i++) {
+        if (abs(mono[i] - mono_ref[i]) > 0) {
+            pa_log_debug("%d: %d != %d (%d %d)", i, mono[i], mono_ref[i],
+                      stereo[2*i+0], stereo[2*i+1]);
+            break;
+        }
+    }
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        remap.do_remap(&remap, mono_ref, stereo, SAMPLES);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+#endif /* RUN_TEST */
+
+/* CPU flags stored by pa_remap_func_init_neon(); consulted below to choose
+ * the mono-to-stereo variant. */
+static pa_cpu_arm_flag_t arm_flags;
+
+/* Remap init hook: installs a NEON routine in m->do_remap for the two
+ * layouts handled in this file and leaves m->do_remap untouched otherwise.
+ *
+ *  - 1 -> 2 channels with both map_table_f gains >= 1.0: mono to stereo,
+ *    using the A8 variant when PA_CPU_ARM_CORTEX_A8 is set in arm_flags and
+ *    the A9 variant otherwise.
+ *  - 2 -> 1 channels with both map_table_f gains >= 1.0: stereo to mono. */
+static void init_remap_neon(pa_remap_t *m) {
+    const unsigned out_channels = m->o_ss->channels;
+    const unsigned in_channels = m->i_ss->channels;
+
+    if (in_channels == 1 && out_channels == 2) {
+        /* mono source feeding both output channels */
+        if (!(m->map_table_f[0][0] >= 1.0 && m->map_table_f[1][0] >= 1.0))
+            return;
+
+        if (arm_flags & PA_CPU_ARM_CORTEX_A8) {
+            m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_neon_a8;
+            pa_log_info("Using ARM NEON/A8 mono to stereo remapping");
+        } else {
+            m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_neon_a9;
+            pa_log_info("Using ARM NEON mono to stereo remapping");
+        }
+        return;
+    }
+
+    if (in_channels == 2 && out_channels == 1) {
+        /* both input channels feeding the single output channel */
+        if (!(m->map_table_f[0][0] >= 1.0 && m->map_table_f[0][1] >= 1.0))
+            return;
+
+        m->do_remap = (pa_do_remap_func_t) remap_stereo_to_mono_neon;
+        pa_log_info("Using ARM NEON stereo to mono remapping");
+    }
+}
+#endif /* defined (__ARM_NEON__) */
+
+/* Public entry point: registers init_remap_neon() as the remap init function.
+ *
+ * flags: ARM CPU feature flags supplied by the caller; stored in arm_flags.
+ *
+ * When __ARM_NEON__ is not defined the body is compiled out and the function
+ * does nothing. */
+void pa_remap_func_init_neon(pa_cpu_arm_flag_t flags) {
+#if defined (__ARM_NEON__)
+
+#ifdef RUN_TEST
+    /* The self-tests must run before pa_set_init_remap_func() below: each one
+     * fetches its reference implementation via pa_get_init_remap_func(). */
+    run_test_float_stereo_to_mono();
+    run_test_s16_stereo_to_mono();
+    run_test_float_mono_to_stereo();
+    run_test_s16_mono_to_stereo();
+#endif
+
+    pa_log_info("Initialising ARM NEON optimized remappers.");
+    /* init_remap_neon() reads arm_flags to pick the A8 or A9 variant. */
+    arm_flags = flags;
+    pa_set_init_remap_func((pa_init_remap_func_t) init_remap_neon);
+
+#endif /* defined (__ARM_NEON__) */
+}
-- 
1.7.5.4



[Index of Archives]     [Linux Audio Users]     [AMD Graphics]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux