[PATCH 06/10] core: Add ARM NEON optimized remapping code

pmeerw@xxxxxxxxxx (Peter Meerwald) · Fri, 29 Mar 2013 16:56:47 +0100

From: Peter Meerwald <p.meerwald@xxxxxxxxxxxxxxxxxx>

v5:
* 4-channel remapping
* use vrhadd instruction, fix int16 overflow for to-mono case
v4:
* fix for sample length < 4
v3:
* fix test code: init float and int map_table
* different code path for Cortex-A8 and later (-A9, A15, unknown)
* convert from intrinsics to inline assembly
v2:
* add ARM NEON stereo-to-mono remapping code
* static __attribute__ ((noinline)) is necessary to prevent inlining and
  work around gcc 4.6 ICE, see https://bugs.launchpad.net/bugs/936863
* call test code, the reference implementation is obtained using
  pa_get_init_remap_func()
* remove check for NEON flags
v1:
* ARM NEON mono-to-stereo remapping code

compiled with Ubuntu/Linaro gcc 4.6.3:
arm-linux-gnueabi-gcc -O2 -mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon

runtime on beagle-xm measured by cpu-test:
Checking NEON remap (float, mono->stereo)
func: 517000 usec (avg: 5170, min = 4730, max = 6073, stddev = 486.311).
orig: 641082 usec (avg: 6410.82, min = 6317, max = 6927, stddev = 72.8961).
Checking NEON remap (float, mono->ch4)
func: 1120299 usec (avg: 11203, min = 8911, max = 11871, stddev = 887.938).
orig: 2112855 usec (avg: 21128.5, min = 20477, max = 21606, stddev = 148.112).
Checking NEON remap (s16, mono->stereo)
func: 253905 usec (avg: 2539.05, min = 2441, max = 2868, stddev = 60.532).
orig: 429018 usec (avg: 4290.18, min = 4211, max = 4578, stddev = 58.158).
Checking NEON remap (s16, mono->ch4)
func: 518708 usec (avg: 5187.08, min = 4700, max = 5707, stddev = 328.364).
orig: 782318 usec (avg: 7823.18, min = 7751, max = 8331, stddev = 89.0162).
Checking NEON remap (float, stereo->mono)
func: 488526 usec (avg: 4885.26, min = 4852, max = 5188, stddev = 40.6841).
orig: 4052827 usec (avg: 40528.3, min = 40405, max = 40955, stddev = 117.413).
Checking NEON remap (float, ch4->mono)
func: 1300721 usec (avg: 13007.2, min = 12939, max = 13611, stddev = 94.13).
orig: 7937749 usec (avg: 79377.5, min = 79223, max = 79956, stddev = 163.169).
Checking NEON remap (s16, stereo->mono)
func: 165620 usec (avg: 1656.2, min = 1587, max = 2136, stddev = 66.9352).
orig: 1128600 usec (avg: 11286, min = 11230, max = 11719, stddev = 75.9534).
Checking NEON remap (s16, ch4->mono)
func: 450013 usec (avg: 4500.13, min = 4425, max = 4852, stddev = 51.7634).
orig: 1537200 usec (avg: 15372, min = 15289, max = 15869, stddev = 88.0164).

Signed-off-by: Peter Meerwald <p.meerwald at bct-electronic.com>
---
 src/Makefile.am            |    6 +-
 src/pulsecore/remap_neon.c |  403 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 407 insertions(+), 2 deletions(-)
 create mode 100644 src/pulsecore/remap_neon.c

diff --git a/src/Makefile.am b/src/Makefile.am
index 4b3efa3..915c177 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -903,12 +903,14 @@ libpulsecore_@PA_MAJORMINOR@_la_LDFLAGS = $(AM_LDFLAGS) -avoid-version
 libpulsecore_@PA_MAJORMINOR@_la_LIBADD = $(AM_LIBADD) $(LIBLTDL) $(LIBSAMPLERATE_LIBS) $(LIBSPEEX_LIBS) $(LIBSNDFILE_LIBS) $(WINSOCK_LIBS) $(LTLIBICONV) libpulsecommon-@PA_MAJORMINOR@.la libpulse.la libpulsecore-foreign.la
 
 if HAVE_NEON
-noinst_LTLIBRARIES += libpulsecore_sconv_neon.la libpulsecore_mix_neon.la
+noinst_LTLIBRARIES += libpulsecore_sconv_neon.la libpulsecore_mix_neon.la libpulsecore_remap_neon.la
 libpulsecore_sconv_neon_la_SOURCES = pulsecore/sconv_neon.c
 libpulsecore_sconv_neon_la_CFLAGS = $(AM_CFLAGS) $(NEON_CFLAGS)
 libpulsecore_mix_neon_la_SOURCES = pulsecore/mix_neon.c
 libpulsecore_mix_neon_la_CFLAGS = $(AM_CFLAGS) $(NEON_CFLAGS)
-libpulsecore_@PA_MAJORMINOR@_la_LIBADD += libpulsecore_sconv_neon.la libpulsecore_mix_neon.la
+libpulsecore_remap_neon_la_SOURCES = pulsecore/remap_neon.c
+libpulsecore_remap_neon_la_CFLAGS = $(AM_CFLAGS) $(NEON_CFLAGS)
+libpulsecore_@PA_MAJORMINOR@_la_LIBADD += libpulsecore_sconv_neon.la libpulsecore_mix_neon.la libpulsecore_remap_neon.la
 endif
 
 if HAVE_ORC
diff --git a/src/pulsecore/remap_neon.c b/src/pulsecore/remap_neon.c
new file mode 100644
index 0000000..f690411
--- /dev/null
+++ b/src/pulsecore/remap_neon.c
@@ -0,0 +1,403 @@
+/***
+  This file is part of PulseAudio.
+
+  Copyright 2013 Peter Meerwald <p.meerwald at bct-electronic.com>
+
+  PulseAudio is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Lesser General Public License as published
+  by the Free Software Foundation; either version 2.1 of the License,
+  or (at your option) any later version.
+
+  PulseAudio is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+***/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <pulse/sample.h>
+#include <pulse/volume.h>
+#include <pulsecore/log.h>
+#include <pulsecore/macro.h>
+
+#include "cpu-arm.h"
+#include "remap.h"
+
+#include <arm_neon.h>
+
+static inline void mono_to_stereo_float_neon_a8(float *dst, const float *src, unsigned n) { /* copy each of the n mono floats in src into two adjacent floats in dst */
+    int i = n & 3; /* leftover samples (n mod 4) for the scalar tail; taken before the asm overwrites n */
+    /* Vector part: 4 mono samples in, 8 floats out per iteration. */
+    __asm__ __volatile__ (
+        "movs       %[n], %[n], lsr #2      \n\t" /* n = n / 4 = number of full groups; Z is set when there are none */
+        "beq        2f                      \n\t" /* fewer than 4 samples: skip the vector loop */
+
+        "1:                                 \n\t"
+        "vld1.32    {q0}, [%[src]]!         \n\t" /* q0 = 4 floats from src; src advances by 16 bytes */
+        "vmov       q1, q0                  \n\t" /* q1 = copy of q0 */
+        "subs       %[n], %[n], #1          \n\t" /* one group consumed; these flags drive the bgt below */
+        "vst2.32    {q0,q1}, [%[dst]]!      \n\t" /* interleaved store q0[0],q1[0],q0[1],q1[1],...: 8 floats; dst advances by 32 bytes */
+        "bgt        1b                      \n\t" /* loop while groups remain */
+
+        "2:                                 \n\t"
+
+        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) /* output operands (or input operands that get modified) */
+        : /* input operands */
+        : "memory", "cc", "q0", "q1" /* clobber list */
+    );
+    /* Scalar tail: dst and src were advanced by the asm, so this continues where the loop stopped. */
+    while (i--) {
+        dst[0] = dst[1] = src[0];
+        src++;
+        dst += 2;
+    }
+}
+
+static inline void mono_to_stereo_float_neon_a9(float *dst, const float *src, unsigned n) { /* same output as the _a8 variant; the loop moves 32-bit words with core-register ldm/stm instead of NEON instructions */
+    int i = n & 1; /* 0 or 1 leftover sample for the scalar tail; taken before the asm overwrites n */
+    /* NOTE(review): r7 is hard-coded and clobbered below; GCC may reject this where r7 is the frame pointer (e.g. Thumb builds) - confirm for all build configurations. */
+    __asm__ __volatile__ (
+        "movs       %[n], %[n], lsr #1      \n\t" /* n = n / 2 = number of sample pairs; Z is set when there are none */
+        "beq        2f                      \n\t" /* fewer than 2 samples: skip the loop */
+
+        "1:                                 \n\t"
+        "ldm        %[src]!, {r4,r6}        \n\t" /* r4 = first sample, r6 = second sample (raw 32-bit words); src advances by 8 bytes */
+        "mov        r5, r4                  \n\t" /* r5 = duplicate of the first sample */
+        "mov        r7, r6                  \n\t" /* r7 = duplicate of the second sample */
+        "subs       %[n], %[n], #1          \n\t" /* one pair consumed; these flags drive the bgt below */
+        "stm        %[dst]!, {r4-r7}        \n\t" /* store r4,r5,r6,r7 = s0,s0,s1,s1; dst advances by 16 bytes */
+        "bgt        1b                      \n\t" /* loop while pairs remain */
+
+        "2:                                 \n\t"
+
+        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) /* output operands (or input operands that get modified) */
+        : /* input operands */
+        : "memory", "cc", "r4", "r5", "r6", "r7" /* clobber list */
+    );
+    /* Scalar tail: dst and src were advanced by the asm, so this handles the one remaining sample, if any. */
+    while (i--) {
+        dst[0] = dst[1] = src[0];
+        src++;
+        dst += 2;
+    }
+}
+
+static inline void mono_to_stereo_int16_neon(int16_t *dst, const int16_t *src, unsigned n) { /* copy each of the n mono int16 samples in src into two adjacent samples in dst */
+    int i = n & 7; /* leftover samples (n mod 8) for the scalar tail; taken before the asm overwrites n */
+    /* Vector part: 8 mono samples in, 16 samples out per iteration. */
+    __asm__ __volatile__ (
+        "movs       %[n], %[n], lsr #3      \n\t" /* n = n / 8 = number of full groups; Z is set when there are none */
+        "beq        2f                      \n\t" /* fewer than 8 samples: skip the vector loop */
+
+        "1:                                 \n\t"
+        "vld1.16    {q0}, [%[src]]!         \n\t" /* q0 = 8 int16 samples from src; src advances by 16 bytes */
+        "vmov       q1, q0                  \n\t" /* q1 = copy of q0 */
+        "subs       %[n], %[n], #1          \n\t" /* one group consumed; these flags drive the bgt below */
+        "vst2.16    {q0,q1}, [%[dst]]!      \n\t" /* interleaved store q0[0],q1[0],q0[1],q1[1],...: 16 samples; dst advances by 32 bytes */
+        "bgt        1b                      \n\t" /* loop while groups remain */
+
+        "2:                                 \n\t"
+
+        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) /* output operands (or input operands that get modified) */
+        : /* input operands */
+        : "memory", "cc", "q0", "q1" /* clobber list */
+    );
+    /* Scalar tail: dst and src were advanced by the asm, so this continues where the loop stopped. */
+    while (i--) {
+        dst[0] = dst[1] = src[0];
+        src++;
+        dst += 2;
+    }
+}
+
+/* Mono -> stereo entry point (A9 float path): forwards to the worker matching *m->format; any other format is a bug. */
+static void remap_mono_to_stereo_neon_a9(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+    if (*m->format == PA_SAMPLE_FLOAT32NE) {
+        mono_to_stereo_float_neon_a9(dst, src, n);
+        return;
+    }
+    if (*m->format == PA_SAMPLE_S16NE) {
+        mono_to_stereo_int16_neon(dst, src, n);
+        return;
+    }
+    pa_assert_not_reached();
+}
+
+/* Mono -> stereo entry point (A8 float path): forwards to the worker matching *m->format; any other format is a bug. */
+static void remap_mono_to_stereo_neon_a8(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+    if (*m->format == PA_SAMPLE_FLOAT32NE) {
+        mono_to_stereo_float_neon_a8(dst, src, n);
+        return;
+    }
+    if (*m->format == PA_SAMPLE_S16NE) {
+        mono_to_stereo_int16_neon(dst, src, n);
+        return;
+    }
+    pa_assert_not_reached();
+}
+
+static inline void mono_to_ch4_float_neon(float *dst, const float *src, unsigned n) { /* copy each of the n mono floats in src into four adjacent floats in dst */
+    int i = n & 1; /* 0 or 1 leftover sample for the scalar tail; taken before the asm overwrites n */
+    /* Vector part: 2 mono samples in, 8 floats out per iteration. */
+    __asm__ __volatile__ (
+        "movs       %[n], %[n], lsr #1      \n\t" /* n = n / 2 = number of sample pairs; Z is set when there are none */
+        "beq        2f                      \n\t" /* fewer than 2 samples: skip the vector loop */
+
+        "1:                                 \n\t"
+        "vld1.32    {d0}, [%[src]]!         \n\t" /* d0 = 2 floats from src; src advances by 8 bytes */
+        "vdup.f32   q1, d0[0]               \n\t" /* q1 = first sample in all 4 lanes */
+        "vdup.f32   q2, d0[1]               \n\t" /* q2 = second sample in all 4 lanes */
+        "subs       %[n], %[n], #1          \n\t" /* one pair consumed; these flags drive the bgt below */
+        "vst1.32    {q1,q2}, [%[dst]]!      \n\t" /* store q1 then q2 back to back: 8 floats; dst advances by 32 bytes */
+        "bgt        1b                      \n\t" /* loop while pairs remain */
+
+        "2:                                 \n\t"
+
+        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) /* output operands (or input operands that get modified) */
+        : /* input operands */
+        : "memory", "cc", "q0", "q1", "q2" /* clobber list */
+    );
+    /* Scalar tail: dst and src were advanced by the asm, so this handles the one remaining sample, if any. */
+    while (i--) {
+        dst[0] = dst[1] = dst[2] = dst[3] = src[0];
+        src++;
+        dst += 4;
+    }
+}
+
+static inline void mono_to_ch4_int16_neon(int16_t *dst, const int16_t *src, unsigned n) { /* copy each of the n mono int16 samples in src into four adjacent samples in dst */
+    int i = n & 3; /* leftover samples (n mod 4) for the scalar tail; taken before the asm overwrites n */
+    /* Vector part: 4 mono samples in, 16 samples out per iteration. */
+    __asm__ __volatile__ (
+        "movs       %[n], %[n], lsr #2      \n\t" /* n = n / 4 = number of full groups; Z is set when there are none */
+        "beq        2f                      \n\t" /* fewer than 4 samples: skip the vector loop */
+
+        "1:                                 \n\t"
+        "vld1.16    {d0}, [%[src]]!         \n\t" /* d0 = 4 int16 samples from src; src advances by 8 bytes */
+        "vdup.s16   d1, d0[1]               \n\t" /* d1 = second sample in all 4 lanes */
+        "vdup.s16   d2, d0[2]               \n\t" /* d2 = third sample in all 4 lanes */
+        "vdup.s16   d3, d0[3]               \n\t" /* d3 = fourth sample in all 4 lanes */
+        "vdup.s16   d0, d0[0]               \n\t" /* d0 = first sample in all 4 lanes; done last because d0 is the source of the other lanes */
+        "subs       %[n], %[n], #1          \n\t" /* one group consumed; these flags drive the bgt below */
+        "vst1.16    {d0,d1,d2,d3}, [%[dst]]!\n\t" /* store d0..d3 back to back: 16 samples; dst advances by 32 bytes */
+        "bgt        1b                      \n\t" /* loop while groups remain */
+
+        "2:                                 \n\t"
+
+        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) /* output operands (or input operands that get modified) */
+        : /* input operands */
+        : "memory", "cc", "d0", "d1", "d2", "d3" /* clobber list */
+    );
+    /* Scalar tail: dst and src were advanced by the asm, so this continues where the loop stopped. */
+    while (i--) {
+        dst[0] = dst[1] = dst[2] = dst[3] = src[0];
+        src++;
+        dst += 4;
+    }
+}
+
+/* Mono -> 4-channel entry point: forwards to the worker matching *m->format; any other format is a bug. */
+static void remap_mono_to_ch4_neon(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+    if (*m->format == PA_SAMPLE_FLOAT32NE) {
+        mono_to_ch4_float_neon(dst, src, n);
+        return;
+    }
+    if (*m->format == PA_SAMPLE_S16NE) {
+        mono_to_ch4_int16_neon(dst, src, n);
+        return;
+    }
+    pa_assert_not_reached();
+}
+
+static inline void stereo_to_mono_float_neon(float *dst, const float *src, unsigned n) { /* average each interleaved pair of floats in src into one float in dst; n counts output samples */
+    int i = n & 3; /* leftover frames (n mod 4) for the scalar tail; taken before the asm overwrites n */
+    /* Vector part: 8 interleaved floats in, 4 averaged floats out per iteration. */
+    __asm__ __volatile__ (
+        "movs       %[n], %[n], lsr #2      \n\t" /* n = n / 4 = number of full groups; Z is set when there are none */
+        "beq        2f                      \n\t" /* fewer than 4 frames: skip the vector loop */
+
+        "vdup.f32   q2, %[halve]            \n\t" /* q2 = 0.5f in all 4 lanes, broadcast once before the loop */
+
+        "1:                                 \n\t"
+        "vld2.32    {q0,q1}, [%[src]]!      \n\t" /* de-interleaving load: q0 = even-indexed floats, q1 = odd-indexed floats; src advances by 32 bytes */
+        "vadd.f32   q0, q0, q1              \n\t" /* q0 = sum of the two channels, per frame */
+        "vmul.f32   q0, q0, q2              \n\t" /* q0 = sum * 0.5f */
+        "subs       %[n], %[n], #1          \n\t" /* one group consumed; these flags drive the bgt below */
+        "vst1.32    {q0}, [%[dst]]!         \n\t" /* store 4 averaged floats; dst advances by 16 bytes */
+        "bgt        1b                      \n\t" /* loop while groups remain */
+
+        "2:                                 \n\t"
+
+        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) /* output operands (or input operands that get modified) */
+        : [halve] "r" (0.5f) /* input operands */
+        : "memory", "cc", "q0", "q1", "q2" /* clobber list */
+    );
+    /* Scalar tail: same add-then-multiply-by-0.5f as the vector loop, one frame at a time. */
+    while (i--) {
+        dst[0] = (src[0] + src[1])*0.5f;
+        src += 2;
+        dst++;
+    }
+}
+
+static inline void stereo_to_mono_int16_neon(int16_t *dst, const int16_t *src, unsigned n) { /* average each interleaved pair of int16 samples in src into one sample in dst; n counts output samples */
+    int i = n & 7; /* leftover frames (n mod 8) for the scalar tail; taken before the asm overwrites n */
+    /* Vector part: 16 interleaved samples in, 8 averaged samples out per iteration. */
+    __asm__ __volatile__ (
+        "movs       %[n], %[n], lsr #3      \n\t" /* n = n / 8 = number of full groups; Z is set when there are none */
+        "beq        2f                      \n\t" /* fewer than 8 frames: skip the vector loop */
+
+        "1:\n\t"
+        "vld2.16    {q0,q1}, [%[src]]!      \n\t" /* de-interleaving load: q0 = even-indexed samples, q1 = odd-indexed samples; src advances by 32 bytes */
+        "vrhadd.s16 q0, q0, q1              \n\t" /* rounding halving add: per-lane average computed without overflowing int16 */
+        "subs       %[n], %[n], #1          \n\t" /* one group consumed; these flags drive the bgt below */
+        "vst1.16    {q0}, [%[dst]]!         \n\t" /* store 8 averaged samples; dst advances by 16 bytes */
+        "bgt        1b                      \n\t" /* loop while groups remain */
+
+        "2:                                 \n\t"
+
+        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) /* output operands (or input operands that get modified) */
+        : /* input operands */
+        : "memory", "cc", "q0", "q1" /* clobber list */
+    );
+    /* Scalar tail; NOTE(review): vrhadd rounds while the division below truncates toward zero, so a frame can differ by 1 depending on where it falls - confirm this is acceptable. */
+   while (i--) {
+        dst[0] = (src[0] + src[1])/2; /* operands are promoted to int, so the sum cannot overflow */
+        src += 2;
+        dst++;
+    }
+}
+
+/* Stereo -> mono entry point: forwards to the worker matching *m->format; any other format is a bug. */
+static void remap_stereo_to_mono_neon(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+    if (*m->format == PA_SAMPLE_FLOAT32NE) {
+        stereo_to_mono_float_neon(dst, src, n);
+        return;
+    }
+    if (*m->format == PA_SAMPLE_S16NE) {
+        stereo_to_mono_int16_neon(dst, src, n);
+        return;
+    }
+    pa_assert_not_reached();
+}
+
+static inline void ch4_to_mono_float_neon(float *dst, const float *src, unsigned n) { /* average each group of 4 interleaved floats in src into one float in dst; n counts output samples */
+    int i = n & 1; /* 0 or 1 leftover frame for the scalar tail; taken before the asm overwrites n */
+    /* Vector part: 8 interleaved floats (2 frames) in, 2 averaged floats out per iteration. */
+    __asm__ __volatile__ (
+        "movs       %[n], %[n], lsr #1      \n\t" /* n = n / 2 = number of frame pairs; Z is set when there are none */
+        "beq        2f                      \n\t" /* fewer than 2 frames: skip the vector loop */
+
+        "vdup.f32   d4, %[quart]            \n\t" /* d4 = 0.25f in both lanes, broadcast once before the loop */
+
+        "1:\n\t"
+        "vld4.32    {d0,d1,d2,d3}, [%[src]]!\n\t" /* de-interleaving load: d0..d3 each hold one channel of the 2 frames; src advances by 32 bytes */
+        "vadd.f32   d0, d0, d1              \n\t" /* d0 = channel 0 + channel 1 */
+        "vadd.f32   d2, d2, d3              \n\t" /* d2 = channel 2 + channel 3 */
+        "vadd.f32   d0, d0, d2              \n\t" /* d0 = sum of all 4 channels, per frame */
+        "vmul.f32   d0, d0, d4              \n\t" /* d0 = sum * 0.25f */
+        "subs       %[n], %[n], #1          \n\t" /* one pair consumed; these flags drive the bgt below */
+        "vst1.32    {d0}, [%[dst]]!         \n\t" /* store 2 averaged floats; dst advances by 8 bytes */
+        "bgt        1b                      \n\t" /* loop while pairs remain */
+
+        "2:                                 \n\t"
+
+        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) /* output operands (or input operands that get modified) */
+        : [quart] "r" (0.25f) /* input operands */
+        : "memory", "cc", "d0", "d1", "d2", "d3", "d4" /* clobber list */
+    );
+    /* Scalar tail: handles the one remaining frame, if any (summation order differs from the pairwise adds above). */
+    while (i--) {
+        dst[0] = (src[0] + src[1] + src[2] + src[3])*0.25f;
+        src += 4;
+        dst++;
+    }
+}
+
+static inline void ch4_to_mono_int16_neon(int16_t *dst, const int16_t *src, unsigned n) { /* average each group of 4 interleaved int16 samples in src into one sample in dst; n counts output samples */
+    int i = n & 3; /* leftover frames (n mod 4) for the scalar tail; taken before the asm overwrites n */
+    /* Vector part: 16 interleaved samples (4 frames) in, 4 averaged samples out per iteration. */
+    __asm__ __volatile__ (
+        "movs       %[n], %[n], lsr #2      \n\t" /* n = n / 4 = number of full groups; Z is set when there are none */
+        "beq        2f                      \n\t" /* fewer than 4 frames: skip the vector loop */
+
+        "1:\n\t"
+        "vld4.16    {d0,d1,d2,d3}, [%[src]]!\n\t" /* de-interleaving load: d0..d3 each hold one channel of the 4 frames; src advances by 32 bytes */
+        "vrhadd.s16 d0, d0, d1              \n\t" /* d0 = rounded average of channels 0 and 1 */
+        "vrhadd.s16 d2, d2, d3              \n\t" /* d2 = rounded average of channels 2 and 3 */
+        "vrhadd.s16 d0, d0, d2              \n\t" /* d0 = rounded average of the two averages, i.e. the 4-channel mean rounded in two stages */
+        "subs       %[n], %[n], #1          \n\t" /* one group consumed; these flags drive the bgt below */
+        "vst1.16    {d0}, [%[dst]]!         \n\t" /* store 4 averaged samples; dst advances by 8 bytes */
+        "bgt        1b                      \n\t" /* loop while groups remain */
+
+        "2:                                 \n\t"
+
+        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) /* output operands (or input operands that get modified) */
+        : /* input operands */
+        : "memory", "cc", "d0", "d1", "d2", "d3" /* clobber list */
+    );
+    /* Scalar tail; NOTE(review): the loop rounds at both vrhadd stages while the division below truncates once, so a frame's result can depend on where it falls - confirm this is acceptable. */
+   while (i--) {
+        dst[0] = (src[0] + src[1] + src[2] + src[3])/4; /* operands are promoted to int, so the sum cannot overflow */
+        src += 4;
+        dst++;
+    }
+}
+
+/* 4-channel -> mono entry point: forwards to the worker matching *m->format; any other format is a bug. */
+static void remap_ch4_to_mono_neon(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+    if (*m->format == PA_SAMPLE_FLOAT32NE) {
+        ch4_to_mono_float_neon(dst, src, n);
+        return;
+    }
+    if (*m->format == PA_SAMPLE_S16NE) {
+        ch4_to_mono_int16_neon(dst, src, n);
+        return;
+    }
+    pa_assert_not_reached();
+}
+
+static pa_cpu_arm_flag_t arm_flags; /* CPU flags stored by pa_remap_func_init_neon() */
+
+/* Set m->do_remap to a NEON remapper for the common channel layouts below; when none matches, m->do_remap is not modified here. */
+static void init_remap_neon(pa_remap_t *m) {
+    const unsigned in_ch = m->i_ss->channels, out_ch = m->o_ss->channels;
+
+    if (in_ch == 1) {
+        if (out_ch == 2 &&
+                m->map_table_i[0][0] == PA_VOLUME_NORM && m->map_table_i[1][0] == PA_VOLUME_NORM) {
+            if (arm_flags & PA_CPU_ARM_CORTEX_A8) {
+                m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_neon_a8;
+                pa_log_info("Using ARM NEON/A8 mono to stereo remapping");
+            } else {
+                m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_neon_a9;
+                pa_log_info("Using ARM NEON/A9 mono to stereo remapping");
+            }
+        } else if (out_ch == 4 &&
+                m->map_table_i[0][0] == PA_VOLUME_NORM && m->map_table_i[1][0] == PA_VOLUME_NORM &&
+                m->map_table_i[2][0] == PA_VOLUME_NORM && m->map_table_i[3][0] == PA_VOLUME_NORM) {
+            m->do_remap = (pa_do_remap_func_t) remap_mono_to_ch4_neon;
+            pa_log_info("Using ARM NEON mono to 4-channel remapping");
+        }
+    } else if (out_ch == 1) {
+        if (in_ch == 2 &&
+                m->map_table_i[0][0] == PA_VOLUME_HALF && m->map_table_i[0][1] == PA_VOLUME_HALF) {
+            m->do_remap = (pa_do_remap_func_t) remap_stereo_to_mono_neon;
+            pa_log_info("Using ARM NEON stereo to mono remapping");
+        } else if (in_ch == 4 &&
+                m->map_table_i[0][0] == PA_VOLUME_QUARTER && m->map_table_i[0][1] == PA_VOLUME_QUARTER &&
+                m->map_table_i[0][2] == PA_VOLUME_QUARTER && m->map_table_i[0][3] == PA_VOLUME_QUARTER) {
+            m->do_remap = (pa_do_remap_func_t) remap_ch4_to_mono_neon;
+            pa_log_info("Using ARM NEON 4-channel to mono remapping");
+        }
+    }
+}
+
+void pa_remap_func_init_neon(pa_cpu_arm_flag_t flags) { /* store the CPU flags for init_remap_neon() and register it as the remap init function */
+    arm_flags = flags;
+    pa_log_info("Initialising ARM NEON optimized remappers.");
+    pa_set_init_remap_func((pa_init_remap_func_t) init_remap_neon);
+}
-- 
1.7.9.5