From: Peter Meerwald <p.meerwald@xxxxxxxxxxxxxxxxxx> v3: * fix test code: init float and int map_table * different code path for Cortex-A8 and later (-A9, A15, unknown) * convert from intrinsics to inline assembly v2: * add ARM NEON stereo-to-mono remapping code * static __attribute__ ((noinline)) is necessary to prevent inlining and work around gcc 4.6 ICE, see https://bugs.launchpad.net/bugs/936863 * call test code, the reference implementation is obtained using pa_get_init_remap_func() * remove check for NEON flags v1: * ARM NEON mono-to-stereo remapping code compiled with Ubuntu/Linaro gcc 4.6.3: arm-linux-gnueabi-gcc -O2 -mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon runtime on beagle-xm: D: [pulseaudio] remap_neon.c: checking NEON remap_stereo_to_mono(float) I: [pulseaudio] remap.c: Using stereo to mono remapping I: [pulseaudio] remap_neon.c: NEON: 3082 usec. I: [pulseaudio] remap_neon.c: ref: 24201 usec. D: [pulseaudio] remap_neon.c: checking NEON remap_stereo_to_mono(s16) I: [pulseaudio] remap.c: Using stereo to mono remapping I: [pulseaudio] remap_neon.c: NEON: 1190 usec. I: [pulseaudio] remap_neon.c: ref: 5615 usec. D: [pulseaudio] remap_neon.c: checking NEON remap_mono_to_stereo(float) I: [pulseaudio] remap.c: Using mono to stereo remapping I: [pulseaudio] remap_neon.c: NEON/A8: 2350 usec. I: [pulseaudio] remap_neon.c: NEON/A9: 4730 usec. I: [pulseaudio] remap_neon.c: ref: 3601 usec. D: [pulseaudio] remap_neon.c: checking NEON remap_mono_to_stereo(s16) I: [pulseaudio] remap.c: Using mono to stereo remapping I: [pulseaudio] remap_neon.c: NEON: 1403 usec. I: [pulseaudio] remap_neon.c: ref: 3724 usec. I: [pulseaudio] remap_neon.c: Initialising ARM NEON optimized remappers. 
Signed-off-by: Peter Meerwald <p.meerwald at bct-electronic.com>
---
 src/Makefile.am            |    1 +
 src/pulsecore/remap_neon.c |  585 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 586 insertions(+), 0 deletions(-)
 create mode 100644 src/pulsecore/remap_neon.c

diff --git a/src/Makefile.am b/src/Makefile.am
index 63ad837..df25efc 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -819,6 +819,7 @@ libpulsecore_@PA_MAJORMINOR@_la_SOURCES = \
 		pulsecore/play-memchunk.c pulsecore/play-memchunk.h \
 		pulsecore/remap.c pulsecore/remap.h \
 		pulsecore/remap_mmx.c pulsecore/remap_sse.c \
+		pulsecore/remap_neon.c \
 		pulsecore/resampler.c pulsecore/resampler.h \
 		pulsecore/rtpoll.c pulsecore/rtpoll.h \
 		pulsecore/sample-util.c pulsecore/sample-util.h \
diff --git a/src/pulsecore/remap_neon.c b/src/pulsecore/remap_neon.c
new file mode 100644
index 0000000..0ecced1
--- /dev/null
+++ b/src/pulsecore/remap_neon.c
@@ -0,0 +1,585 @@
+/***
+  This file is part of PulseAudio.
+
+  Copyright 2012 Peter Meerwald <p.meerwald at bct-electronic.com>
+
+  PulseAudio is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Lesser General Public License as published
+  by the Free Software Foundation; either version 2.1 of the License,
+  or (at your option) any later version.
+
+  PulseAudio is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+***/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <pulse/rtclock.h>
+#include <pulse/sample.h>
+#include <pulsecore/log.h>
+#include <pulsecore/macro.h>
+
+#include "cpu-arm.h"
+#include "remap.h"
+
+#if defined(__ARM_NEON__)
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <arm_neon.h>
+
+#define RUN_TEST
+
+/* Duplicate n mono float samples from src into n interleaved stereo frames in
+ * dst, four samples per NEON iteration; the remainder is handled in C. */
+static inline void mono_to_stereo_float_neon_a8(float *dst, const float *src, unsigned n) {
+    unsigned blocks = n >> 2;
+    unsigned rest = n & 3;
+
+    /* the asm loop is bottom-tested and would run once (and overrun both
+     * buffers) for zero blocks, so skip it entirely when n < 4 */
+    if (blocks > 0) {
+        asm volatile (
+            "1:\n\t"
+            "vld1.32 {q0}, [%[src]]!\n\t"
+            "vmov q1, q0\n\t"
+            "subs %[n], %[n], #1\n\t"
+            "vst2.32 {q0,q1}, [%[dst]]!\n\t"
+            "bgt 1b\n\t"
+            /* output operands (or input operands that get modified) */
+            : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (blocks)
+            : /* input operands */
+            : "memory", "cc", "q0", "q1" /* clobber list */
+        );
+    }
+
+    while (rest--) {
+        dst[0] = dst[1] = src[0];
+        src++;
+        dst += 2;
+    }
+}
+
+/* Same as mono_to_stereo_float_neon_a8(), but copies two samples per iteration
+ * through core registers (ldm/stm) instead of NEON registers. */
+static inline void mono_to_stereo_float_neon_a9(float *dst, const float *src, unsigned n) {
+    unsigned blocks = n >> 1;
+    unsigned rest = n & 1;
+
+    /* skip the bottom-tested asm loop when there is not even one full block */
+    if (blocks > 0) {
+        asm volatile (
+            "1:\n\t"
+            "ldm %[src]!, {r4,r6}\n\t"
+            "mov r5, r4\n\t"
+            "mov r7, r6\n\t"
+            "subs %[n], %[n], #1\n\t"
+            "stm %[dst]!, {r4-r7}\n\t"
+            "bgt 1b\n\t"
+            /* output operands (or input operands that get modified) */
+            : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (blocks)
+            : /* input operands */
+            : "memory", "cc", "r4", "r5", "r6", "r7" /* clobber list */
+        );
+    }
+
+    while (rest--) {
+        dst[0] = dst[1] = src[0];
+        src++;
+        dst += 2;
+    }
+}
+
+/* Duplicate n mono s16 samples into n interleaved stereo frames, eight
+ * samples per NEON iteration; the remainder is handled in C. */
+static inline void mono_to_stereo_int16_neon(int16_t *dst, const int16_t *src, unsigned n) {
+    unsigned blocks = n >> 3;
+    unsigned rest = n & 7;
+
+    /* skip the bottom-tested asm loop when there is not even one full block */
+    if (blocks > 0) {
+        asm volatile (
+            "1:\n\t"
+            "vld1.16 {q0}, [%[src]]!\n\t"
+            "vmov q1, q0\n\t"
+            "subs %[n], %[n], #1\n\t"
+            "vst2.16 {q0,q1}, [%[dst]]!\n\t"
+            "bgt 1b\n\t"
+            /* output operands (or input operands that get modified) */
+            : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (blocks)
+            : /* input operands */
+            : "memory", "cc", "q0", "q1" /* clobber list */
+        );
+    }
+
+    while (rest--) {
+        dst[0] = dst[1] = src[0];
+        src++;
+        dst += 2;
+    }
+}
+
+/* pa_do_remap_func_t for mono to stereo, generic (non Cortex-A8) variant */
+static void remap_mono_to_stereo_neon_a9(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+    switch (*m->format) {
+        case PA_SAMPLE_FLOAT32NE:
+            mono_to_stereo_float_neon_a9(dst, src, n);
+            break;
+        case PA_SAMPLE_S16NE:
+            mono_to_stereo_int16_neon(dst, src, n);
+            break;
+        default:
+            pa_assert_not_reached();
+    }
+}
+
+/* pa_do_remap_func_t for mono to stereo, Cortex-A8 variant */
+static void remap_mono_to_stereo_neon_a8(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+    switch (*m->format) {
+        case PA_SAMPLE_FLOAT32NE:
+            mono_to_stereo_float_neon_a8(dst, src, n);
+            break;
+        case PA_SAMPLE_S16NE:
+            mono_to_stereo_int16_neon(dst, src, n);
+            break;
+        default:
+            pa_assert_not_reached();
+    }
+}
+
+/* Add the left and right float sample of each of the n stereo frames in src
+ * and store the n sums in dst, four frames per NEON iteration. */
+static inline void stereo_to_mono_float_neon(float *dst, const float *src, unsigned n) {
+    unsigned blocks = n >> 2;
+    unsigned rest = n & 3;
+
+    /* skip the bottom-tested asm loop when there is not even one full block */
+    if (blocks > 0) {
+        asm volatile (
+            "1:\n\t"
+            "vld2.32 {q0,q1}, [%[src]]!\n\t"
+            "vadd.f32 q0, q0, q1\n\t"
+            "subs %[n], %[n], #1\n\t"
+            "vst1.32 {q0}, [%[dst]]!\n\t"
+            "bgt 1b\n\t"
+            /* output operands (or input operands that get modified) */
+            : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (blocks)
+            : /* input operands */
+            : "memory", "cc", "q0", "q1" /* clobber list */
+        );
+    }
+
+    while (rest--) {
+        dst[0] = src[0] + src[1];
+        src += 2;
+        dst++;
+    }
+}
+
+/* Add the left and right s16 sample of each of the n stereo frames in src
+ * and store the n sums in dst, eight frames per NEON iteration. The sum is
+ * not saturated (vadd.s16). */
+static inline void stereo_to_mono_int16_neon(int16_t *dst, const int16_t *src, unsigned n) {
+    unsigned blocks = n >> 3;
+    unsigned rest = n & 7;
+
+    /* skip the bottom-tested asm loop when there is not even one full block */
+    if (blocks > 0) {
+        asm volatile (
+            "1:\n\t"
+            "vld2.16 {q0,q1}, [%[src]]!\n\t"
+            "vadd.s16 q0, q0, q1\n\t"
+            "subs %[n], %[n], #1\n\t"
+            "vst1.16 {q0}, [%[dst]]!\n\t"
+            "bgt 1b\n\t"
+            /* output operands (or input operands that get modified) */
+            : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (blocks)
+            : /* input operands */
+            : "memory", "cc", "q0", "q1" /* clobber list */
+        );
+    }
+
+    while (rest--) {
+        dst[0] = src[0] + src[1];
+        src += 2;
+        dst++;
+    }
+}
+
+/* pa_do_remap_func_t for stereo to mono */
+static void remap_stereo_to_mono_neon(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+    switch (*m->format) {
+        case PA_SAMPLE_FLOAT32NE:
+            stereo_to_mono_float_neon(dst, src, n);
+            break;
+        case PA_SAMPLE_S16NE:
+            stereo_to_mono_int16_neon(dst, src, n);
+            break;
+        default:
+            pa_assert_not_reached();
+    }
+}
+
+#ifdef RUN_TEST
+#define SAMPLES 1019
+#define TIMES 1000
+
+/* Check both NEON mono to stereo float remappers against the reference
+ * implementation and log how long TIMES runs of each take. */
+static void run_test_float_mono_to_stereo(void) {
+    float stereo[2*SAMPLES];
+    float stereo_ref[2*SAMPLES];
+    float mono[SAMPLES];
+    int i;
+    pa_usec_t start, stop;
+    pa_sample_format_t sf;
+    pa_remap_t remap;
+    pa_sample_spec iss, oss;
+
+    pa_init_remap_func_t remap_init_func = pa_get_init_remap_func();
+
+    pa_log_debug("checking NEON remap_mono_to_stereo(float)");
+
+    memset(stereo_ref, 0, sizeof(stereo_ref));
+    memset(stereo, 0, sizeof(stereo));
+    /* do not hand stack garbage (unset map table entries, do_remap) to the init function */
+    memset(&remap, 0, sizeof(remap));
+
+    for (i = 0; i < SAMPLES; i++) {
+        mono[i] = rand()/(float) RAND_MAX - 0.5f;
+    }
+
+    sf = PA_SAMPLE_FLOAT32NE;
+    remap.format = &sf;
+    iss.format = PA_SAMPLE_FLOAT32NE;
+    iss.channels = 1;
+    oss.format = PA_SAMPLE_FLOAT32NE;
+    oss.channels = 2;
+    remap.i_ss = &iss;
+    remap.o_ss = &oss;
+    remap.map_table_f[0][0] = 1.0;
+    remap.map_table_f[1][0] = 1.0;
+    remap.map_table_i[0][0] = 0x10000;
+    remap.map_table_i[1][0] = 0x10000;
+    remap_init_func(&remap);
+    if (!remap.do_remap) {
+        pa_log_debug("no reference remapping function, abort test");
+        return;
+    }
+
+    remap.do_remap(&remap, stereo_ref, mono, SAMPLES);
+    remap_mono_to_stereo_neon_a9(&remap, stereo, mono, SAMPLES);
+    for (i = 0; i < 2*SAMPLES; i++) {
+        if (fabsf(stereo[i] - stereo_ref[i]) > 0.00001) {
+            pa_log_debug("A9 %d: %.3f != %.3f (%.3f)", i, stereo[i], stereo_ref[i],
+                      mono[i/2]);
+            break;
+        }
+    }
+
+    /* clear the output so stale A9 results cannot hide an A8 failure */
+    memset(stereo, 0, sizeof(stereo));
+    remap_mono_to_stereo_neon_a8(&remap, stereo, mono, SAMPLES);
+    for (i = 0; i < 2*SAMPLES; i++) {
+        if (fabsf(stereo[i] - stereo_ref[i]) > 0.00001) {
+            pa_log_debug("A8 %d: %.3f != %.3f (%.3f)", i, stereo[i], stereo_ref[i],
+                      mono[i/2]);
+            break;
+        }
+    }
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        remap_mono_to_stereo_neon_a8(&remap, stereo, mono, SAMPLES);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON/A8: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        remap_mono_to_stereo_neon_a9(&remap, stereo, mono, SAMPLES);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON/A9: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        remap.do_remap(&remap, stereo_ref, mono, SAMPLES);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+
+/* Check the NEON mono to stereo s16 remapper (through both dispatchers)
+ * against the reference implementation and log timings. */
+static void run_test_s16_mono_to_stereo(void) {
+    int16_t stereo[2*SAMPLES];
+    int16_t stereo_ref[2*SAMPLES];
+    int16_t mono[SAMPLES];
+    int i;
+    pa_usec_t start, stop;
+    pa_sample_format_t sf;
+    pa_remap_t remap;
+    pa_sample_spec iss, oss;
+
+    pa_init_remap_func_t remap_init_func = pa_get_init_remap_func();
+
+    pa_log_debug("checking NEON remap_mono_to_stereo(s16)");
+
+    memset(stereo_ref, 0, sizeof(stereo_ref));
+    memset(stereo, 0, sizeof(stereo));
+    /* do not hand stack garbage (unset map table entries, do_remap) to the init function */
+    memset(&remap, 0, sizeof(remap));
+
+    for (i = 0; i < SAMPLES; i++) {
+        /* keep the random value inside the int16_t range */
+        mono[i] = (int16_t) (rand() % 0x10000 - 0x8000);
+    }
+
+    sf = PA_SAMPLE_S16NE;
+    remap.format = &sf;
+    iss.format = PA_SAMPLE_S16NE;
+    iss.channels = 1;
+    oss.format = PA_SAMPLE_S16NE;
+    oss.channels = 2;
+    remap.i_ss = &iss;
+    remap.o_ss = &oss;
+    remap.map_table_f[0][0] = 1.0;
+    remap.map_table_f[1][0] = 1.0;
+    remap.map_table_i[0][0] = 0x10000;
+    remap.map_table_i[1][0] = 0x10000;
+    remap_init_func(&remap);
+    if (!remap.do_remap) {
+        pa_log_debug("no reference remapping function, abort test");
+        return;
+    }
+
+    remap.do_remap(&remap, stereo_ref, mono, SAMPLES);
+
+    remap_mono_to_stereo_neon_a9(&remap, stereo, mono, SAMPLES);
+    for (i = 0; i < 2*SAMPLES; i++) {
+        if (abs(stereo[i] - stereo_ref[i]) > 0) {
+            pa_log_debug("A9 %d: %d != %d (%d)", i, stereo[i], stereo_ref[i],
+                      mono[i/2]);
+            break;
+        }
+    }
+
+    /* clear the output so stale A9 results cannot hide an A8 failure */
+    memset(stereo, 0, sizeof(stereo));
+    remap_mono_to_stereo_neon_a8(&remap, stereo, mono, SAMPLES);
+    for (i = 0; i < 2*SAMPLES; i++) {
+        if (abs(stereo[i] - stereo_ref[i]) > 0) {
+            pa_log_debug("A8 %d: %d != %d (%d)", i, stereo[i], stereo_ref[i],
+                      mono[i/2]);
+            break;
+        }
+    }
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        remap_mono_to_stereo_neon_a8(&remap, stereo, mono, SAMPLES);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        remap.do_remap(&remap, stereo_ref, mono, SAMPLES);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+
+/* Check the NEON stereo to mono float remapper against the reference
+ * implementation and log timings. */
+static void run_test_float_stereo_to_mono(void) {
+    float stereo[2*SAMPLES];
+    float mono_ref[SAMPLES];
+    float mono[SAMPLES];
+    int i;
+    pa_usec_t start, stop;
+    pa_sample_format_t sf;
+    pa_remap_t remap;
+    pa_sample_spec iss, oss;
+
+    pa_init_remap_func_t remap_init_func = pa_get_init_remap_func();
+
+    pa_log_debug("checking NEON remap_stereo_to_mono(float)");
+
+    memset(mono_ref, 0, sizeof(mono_ref));
+    memset(mono, 0, sizeof(mono));
+    /* do not hand stack garbage (unset map table entries, do_remap) to the init function */
+    memset(&remap, 0, sizeof(remap));
+
+    for (i = 0; i < 2*SAMPLES; i++) {
+        stereo[i] = rand()/(float) RAND_MAX - 0.5f;
+    }
+
+    sf = PA_SAMPLE_FLOAT32NE;
+    remap.format = &sf;
+    iss.format = PA_SAMPLE_FLOAT32NE;
+    iss.channels = 2;
+    oss.format = PA_SAMPLE_FLOAT32NE;
+    oss.channels = 1;
+    remap.i_ss = &iss;
+    remap.o_ss = &oss;
+    remap.map_table_f[0][0] = 1.0;
+    remap.map_table_f[0][1] = 1.0;
+    remap.map_table_i[0][0] = 0x10000;
+    remap.map_table_i[0][1] = 0x10000;
+    remap_init_func(&remap);
+    if (!remap.do_remap) {
+        pa_log_debug("no reference remapping function, abort test");
+        return;
+    }
+
+    remap.do_remap(&remap, mono_ref, stereo, SAMPLES);
+    remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES);
+
+    for (i = 0; i < SAMPLES; i++) {
+        if (fabsf(mono[i] - mono_ref[i]) > 0.00001) {
+            pa_log_debug("%d: %.3f != %.3f (%.3f %.3f)", i, mono[i], mono_ref[i],
+                      stereo[2*i+0], stereo[2*i+1]);
+            break;
+        }
+    }
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        remap.do_remap(&remap, mono_ref, stereo, SAMPLES);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+
+/* Check the NEON stereo to mono s16 remapper against the reference
+ * implementation and log timings. */
+static void run_test_s16_stereo_to_mono(void) {
+    int16_t stereo[2*SAMPLES];
+    int16_t mono_ref[SAMPLES];
+    int16_t mono[SAMPLES];
+    int i;
+    pa_usec_t start, stop;
+    pa_sample_format_t sf;
+    pa_remap_t remap;
+    pa_sample_spec iss, oss;
+
+    pa_init_remap_func_t remap_init_func = pa_get_init_remap_func();
+
+    pa_log_debug("checking NEON remap_stereo_to_mono(s16)");
+
+    memset(mono_ref, 0, sizeof(mono_ref));
+    memset(mono, 0, sizeof(mono));
+    /* do not hand stack garbage (unset map table entries, do_remap) to the init function */
+    memset(&remap, 0, sizeof(remap));
+
+    for (i = 0; i < 2*SAMPLES; i++) {
+        /* keep the random value inside the int16_t range */
+        stereo[i] = (int16_t) (rand() % 0x10000 - 0x8000);
+    }
+
+    sf = PA_SAMPLE_S16NE;
+    remap.format = &sf;
+    iss.format = PA_SAMPLE_S16NE;
+    iss.channels = 2;
+    oss.format = PA_SAMPLE_S16NE;
+    oss.channels = 1;
+    remap.i_ss = &iss;
+    remap.o_ss = &oss;
+    remap.map_table_f[0][0] = 1.0;
+    remap.map_table_f[0][1] = 1.0;
+    remap.map_table_i[0][0] = 0x10000;
+    remap.map_table_i[0][1] = 0x10000;
+    remap_init_func(&remap);
+    if (!remap.do_remap) {
+        pa_log_debug("no reference remapping function, abort test");
+        return;
+    }
+
+    remap.do_remap(&remap, mono_ref, stereo, SAMPLES);
+    remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES);
+
+    for (i = 0; i < SAMPLES; i++) {
+        if (abs(mono[i] - mono_ref[i]) > 0) {
+            pa_log_debug("%d: %d != %d (%d %d)", i, mono[i], mono_ref[i],
+                      stereo[2*i+0], stereo[2*i+1]);
+            break;
+        }
+    }
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        remap.do_remap(&remap, mono_ref, stereo, SAMPLES);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+#endif /* RUN_TEST */
+
+static pa_cpu_arm_flag_t arm_flags;
+
+/* Select a NEON remapper for mono <-> stereo conversions whose map table
+ * factors are >= 1.0; m->do_remap is left untouched for every other layout. */
+static void init_remap_neon(pa_remap_t *m) {
+    unsigned n_oc, n_ic;
+
+    n_oc = m->o_ss->channels;
+    n_ic = m->i_ss->channels;
+
+    /* find some common channel remappings, fall back to full matrix operation. */
+    if (n_ic == 1 && n_oc == 2 &&
+            m->map_table_f[0][0] >= 1.0 && m->map_table_f[1][0] >= 1.0) {
+        if (arm_flags & PA_CPU_ARM_CORTEX_A8) {
+            m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_neon_a8;
+            pa_log_info("Using ARM NEON/A8 mono to stereo remapping");
+        }
+        else {
+            m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_neon_a9;
+            pa_log_info("Using ARM NEON mono to stereo remapping");
+        }
+    }
+    else if (n_ic == 2 && n_oc == 1 &&
+            m->map_table_f[0][0] >= 1.0 && m->map_table_f[0][1] >= 1.0) {
+        m->do_remap = (pa_do_remap_func_t) remap_stereo_to_mono_neon;
+        pa_log_info("Using ARM NEON stereo to mono remapping");
+    }
+}
+#endif /* defined (__ARM_NEON__) */
+
+/* Run the self tests (if RUN_TEST) and install init_remap_neon(); does nothing when built without NEON */
+void pa_remap_func_init_neon(pa_cpu_arm_flag_t flags) {
+#if defined (__ARM_NEON__)
+
+#ifdef RUN_TEST
+    run_test_float_stereo_to_mono();
+    run_test_s16_stereo_to_mono();
+    run_test_float_mono_to_stereo();
+    run_test_s16_mono_to_stereo();
+#endif
+
+    pa_log_info("Initialising ARM NEON optimized remappers.");
+    arm_flags = flags;
+    pa_set_init_remap_func((pa_init_remap_func_t) init_remap_neon);
+
+#endif /* defined (__ARM_NEON__) */
+}
-- 
1.7.5.4