From: Peter Meerwald <p.meerwald@xxxxxxxxxxxxxxxxxx> v5: * 4-channel remapping * use vrhadd instruction, fix int16 overflow for to-mono case v4: * fix for sample length < 4 v3: * fix test code: init float and int map_table * different code path for Cortex-A8 and later (-A9, A15, unknown) * convert from intrinsics to inline assembly v2: * add ARM NEON stereo-to-mono remapping code * static __attribute__ ((noinline)) is necessary to prevent inlining and work around gcc 4.6 ICE, see https://bugs.launchpad.net/bugs/936863 * call test code, the reference implementation is obtained using pa_get_init_remap_func() * remove check for NEON flags v1: * ARM NEON mono-to-stereo remapping code compiled with Ubuntu/Linaro gcc 4.6.3: arm-linux-gnueabi-gcc -O2 -mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon runtime on beagle-xm measured by cpu-test: Checking NEON remap (float, mono->stereo) func: 517000 usec (avg: 5170, min = 4730, max = 6073, stddev = 486.311). orig: 641082 usec (avg: 6410.82, min = 6317, max = 6927, stddev = 72.8961). Checking NEON remap (float, mono->ch4) func: 1120299 usec (avg: 11203, min = 8911, max = 11871, stddev = 887.938). orig: 2112855 usec (avg: 21128.5, min = 20477, max = 21606, stddev = 148.112). Checking NEON remap (s16, mono->stereo) func: 253905 usec (avg: 2539.05, min = 2441, max = 2868, stddev = 60.532). orig: 429018 usec (avg: 4290.18, min = 4211, max = 4578, stddev = 58.158). Checking NEON remap (s16, mono->ch4) func: 518708 usec (avg: 5187.08, min = 4700, max = 5707, stddev = 328.364). orig: 782318 usec (avg: 7823.18, min = 7751, max = 8331, stddev = 89.0162). Checking NEON remap (float, stereo->mono) func: 488526 usec (avg: 4885.26, min = 4852, max = 5188, stddev = 40.6841). orig: 4052827 usec (avg: 40528.3, min = 40405, max = 40955, stddev = 117.413). Checking NEON remap (float, ch4->mono) func: 1300721 usec (avg: 13007.2, min = 12939, max = 13611, stddev = 94.13). orig: 7937749 usec (avg: 79377.5, min = 79223, max = 79956, stddev = 163.169). 
Checking NEON remap (s16, stereo->mono) func: 165620 usec (avg: 1656.2, min = 1587, max = 2136, stddev = 66.9352). orig: 1128600 usec (avg: 11286, min = 11230, max = 11719, stddev = 75.9534). Checking NEON remap (s16, ch4->mono) func: 450013 usec (avg: 4500.13, min = 4425, max = 4852, stddev = 51.7634). orig: 1537200 usec (avg: 15372, min = 15289, max = 15869, stddev = 88.0164). Signed-off-by: Peter Meerwald <p.meerwald at bct-electronic.com> --- src/Makefile.am | 6 +- src/pulsecore/remap_neon.c | 403 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 407 insertions(+), 2 deletions(-) create mode 100644 src/pulsecore/remap_neon.c diff --git a/src/Makefile.am b/src/Makefile.am index 4b3efa3..915c177 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -903,12 +903,14 @@ libpulsecore_@PA_MAJORMINOR@_la_LDFLAGS = $(AM_LDFLAGS) -avoid-version libpulsecore_@PA_MAJORMINOR@_la_LIBADD = $(AM_LIBADD) $(LIBLTDL) $(LIBSAMPLERATE_LIBS) $(LIBSPEEX_LIBS) $(LIBSNDFILE_LIBS) $(WINSOCK_LIBS) $(LTLIBICONV) libpulsecommon-@PA_MAJORMINOR@.la libpulse.la libpulsecore-foreign.la if HAVE_NEON -noinst_LTLIBRARIES += libpulsecore_sconv_neon.la libpulsecore_mix_neon.la +noinst_LTLIBRARIES += libpulsecore_sconv_neon.la libpulsecore_mix_neon.la libpulsecore_remap_neon.la libpulsecore_sconv_neon_la_SOURCES = pulsecore/sconv_neon.c libpulsecore_sconv_neon_la_CFLAGS = $(AM_CFLAGS) $(NEON_CFLAGS) libpulsecore_mix_neon_la_SOURCES = pulsecore/mix_neon.c libpulsecore_mix_neon_la_CFLAGS = $(AM_CFLAGS) $(NEON_CFLAGS) -libpulsecore_@PA_MAJORMINOR@_la_LIBADD += libpulsecore_sconv_neon.la libpulsecore_mix_neon.la +libpulsecore_remap_neon_la_SOURCES = pulsecore/remap_neon.c +libpulsecore_remap_neon_la_CFLAGS = $(AM_CFLAGS) $(NEON_CFLAGS) +libpulsecore_@PA_MAJORMINOR@_la_LIBADD += libpulsecore_sconv_neon.la libpulsecore_mix_neon.la libpulsecore_remap_neon.la endif if HAVE_ORC diff --git a/src/pulsecore/remap_neon.c b/src/pulsecore/remap_neon.c new file mode 100644 index
0000000..f690411
--- /dev/null
+++ b/src/pulsecore/remap_neon.c
@@ -0,0 +1,403 @@
+/***
+  This file is part of PulseAudio.
+
+  Copyright 2013 Peter Meerwald <p.meerwald at bct-electronic.com>
+
+  PulseAudio is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Lesser General Public License as published
+  by the Free Software Foundation; either version 2.1 of the License,
+  or (at your option) any later version.
+
+  PulseAudio is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+***/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <pulse/sample.h>
+#include <pulse/volume.h>
+#include <pulsecore/log.h>
+#include <pulsecore/macro.h>
+
+#include "cpu-arm.h"
+#include "remap.h"
+
+#include <arm_neon.h>
+/* Mono->stereo, float, Cortex-A8 variant: NEON copies 4 samples per iteration, scalar loop handles n & 3. */
+static inline void mono_to_stereo_float_neon_a8(float *dst, const float *src, unsigned n) {
+    int i = n & 3; /* samples left over once the asm loop has consumed n / 4 groups */
+
+    __asm__ __volatile__ (
+        "movs       %[n], %[n], lsr #2      \n\t" /* n = number of 4-sample groups */
+        "beq        2f                      \n\t" /* fewer than 4 samples: skip the NEON loop */
+
+        "1:                                 \n\t"
+        "vld1.32    {q0}, [%[src]]!         \n\t"
+        "vmov       q1, q0                  \n\t"
+        "subs       %[n], %[n], #1          \n\t"
+        "vst2.32    {q0,q1}, [%[dst]]!      \n\t" /* interleaving store: q0[k], q1[k] -> left, right */
+        "bgt        1b                      \n\t"
+
+        "2:                                 \n\t"
+
+        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) /* output operands (or input operands that get modified) */
+        : /* input operands */
+        : "memory", "cc", "q0", "q1" /* clobber list */
+    );
+    /* dst/src were advanced by the asm block; finish the remaining samples in C. */
+    while (i--) {
+        dst[0] = dst[1] = src[0];
+        src++;
+        dst += 2;
+    }
+}
+/* Mono->stereo, float, non-A8 variant: ldm/stm through core registers, 2 samples per iteration, tail n & 1. */
+static inline void mono_to_stereo_float_neon_a9(float *dst, const float *src, unsigned n) {
+    int i = n & 1; /* at most one sample is left for the scalar tail */
+
+    __asm__ __volatile__ (
+        "movs       %[n], %[n], lsr #1      \n\t" /* n = number of sample pairs */
+        "beq        2f                      \n\t"
+
+        "1:                                 \n\t"
+        "ldm        %[src]!, {r4,r6}        \n\t" /* two input samples into r4 and r6 */
+        "mov        r5, r4                  \n\t"
+        "mov        r7, r6                  \n\t"
+        "subs       %[n], %[n], #1          \n\t"
+        "stm        %[dst]!, {r4-r7}        \n\t" /* writes s0, s0, s1, s1 */
+        "bgt        1b                      \n\t"
+
+        "2:                                 \n\t"
+
+        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) /* output operands (or input operands that get modified) */
+        : /* input operands */
+        : "memory", "cc", "r4", "r5", "r6", "r7" /* clobber list */
+    );
+    /* dst/src were advanced by the asm block; finish the remaining sample in C. */
+    while (i--) {
+        dst[0] = dst[1] = src[0];
+        src++;
+        dst += 2;
+    }
+}
+/* Mono->stereo, s16: NEON copies 8 samples per iteration, scalar loop handles n & 7. */
+static inline void mono_to_stereo_int16_neon(int16_t *dst, const int16_t *src, unsigned n) {
+    int i = n & 7; /* samples left over once the asm loop has consumed n / 8 groups */
+
+    __asm__ __volatile__ (
+        "movs       %[n], %[n], lsr #3      \n\t" /* n = number of 8-sample groups */
+        "beq        2f                      \n\t"
+
+        "1:                                 \n\t"
+        "vld1.16    {q0}, [%[src]]!         \n\t"
+        "vmov       q1, q0                  \n\t"
+        "subs       %[n], %[n], #1          \n\t"
+        "vst2.16    {q0,q1}, [%[dst]]!      \n\t" /* interleaving store: q0[k], q1[k] -> left, right */
+        "bgt        1b                      \n\t"
+
+        "2:                                 \n\t"
+
+        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) /* output operands (or input operands that get modified) */
+        : /* input operands */
+        : "memory", "cc", "q0", "q1" /* clobber list */
+    );
+    /* dst/src were advanced by the asm block; finish the remaining samples in C. */
+    while (i--) {
+        dst[0] = dst[1] = src[0];
+        src++;
+        dst += 2;
+    }
+}
+/* Mono->stereo dispatcher on *m->format, installed when PA_CPU_ARM_CORTEX_A8 is not set; n is passed through unchanged. */
+static void remap_mono_to_stereo_neon_a9(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+    switch (*m->format) {
+        case PA_SAMPLE_FLOAT32NE:
+            mono_to_stereo_float_neon_a9(dst, src, n);
+            break;
+        case PA_SAMPLE_S16NE:
+            mono_to_stereo_int16_neon(dst, src, n);
+            break;
+        default:
+            pa_assert_not_reached();
+    }
+}
+/* Mono->stereo dispatcher on *m->format, installed when PA_CPU_ARM_CORTEX_A8 is set; the s16 path is shared with the A9 one. */
+static void remap_mono_to_stereo_neon_a8(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+    switch (*m->format) {
+        case PA_SAMPLE_FLOAT32NE:
+            mono_to_stereo_float_neon_a8(dst, src, n);
+            break;
+        case PA_SAMPLE_S16NE:
+            mono_to_stereo_int16_neon(dst, src, n);
+            break;
+        default:
+            pa_assert_not_reached();
+    }
+}
+/* Mono->4ch, float: each input sample is broadcast to 4 outputs; NEON handles 2 samples per iteration, tail n & 1. */
+static inline void mono_to_ch4_float_neon(float *dst, const float *src, unsigned n) {
+    int i = n & 1; /* at most one sample is left for the scalar tail */
+
+    __asm__ __volatile__ (
+        "movs       %[n], %[n], lsr #1      \n\t" /* n = number of sample pairs */
+        "beq        2f                      \n\t"
+
+        "1:                                 \n\t"
+        "vld1.32    {d0}, [%[src]]!         \n\t" /* d0 = { s0, s1 } */
+        "vdup.f32   q1, d0[0]               \n\t" /* q1 = s0 x 4 */
+        "vdup.f32   q2, d0[1]               \n\t" /* q2 = s1 x 4 */
+        "subs       %[n], %[n], #1          \n\t"
+        "vst1.32    {q1,q2}, [%[dst]]!      \n\t"
+        "bgt        1b                      \n\t"
+
+        "2:                                 \n\t"
+
+        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) /* output operands (or input operands that get modified) */
+        : /* input operands */
+        : "memory", "cc", "q0", "q1", "q2" /* clobber list */
+    );
+    /* dst/src were advanced by the asm block; finish the remaining sample in C. */
+    while (i--) {
+        dst[0] = dst[1] = dst[2] = dst[3] = src[0];
+        src++;
+        dst += 4;
+    }
+}
+/* Mono->4ch, s16: each input sample is broadcast to 4 outputs; NEON handles 4 samples per iteration, tail n & 3. */
+static inline void mono_to_ch4_int16_neon(int16_t *dst, const int16_t *src, unsigned n) {
+    int i = n & 3; /* samples left over once the asm loop has consumed n / 4 groups */
+
+    __asm__ __volatile__ (
+        "movs       %[n], %[n], lsr #2      \n\t" /* n = number of 4-sample groups */
+        "beq        2f                      \n\t"
+
+        "1:                                 \n\t"
+        "vld1.16    {d0}, [%[src]]!         \n\t" /* d0 = { s0, s1, s2, s3 } */
+        "vdup.s16   d1, d0[1]               \n\t"
+        "vdup.s16   d2, d0[2]               \n\t"
+        "vdup.s16   d3, d0[3]               \n\t"
+        "vdup.s16   d0, d0[0]               \n\t" /* last: overwrites d0, the source of the other lanes */
+        "subs       %[n], %[n], #1          \n\t"
+        "vst1.16    {d0,d1,d2,d3}, [%[dst]]!\n\t"
+        "bgt        1b                      \n\t"
+
+        "2:                                 \n\t"
+
+        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) /* output operands (or input operands that get modified) */
+        : /* input operands */
+        : "memory", "cc", "d0", "d1", "d2", "d3" /* clobber list */
+    );
+    /* dst/src were advanced by the asm block; finish the remaining samples in C. */
+    while (i--) {
+        dst[0] = dst[1] = dst[2] = dst[3] = src[0];
+        src++;
+        dst += 4;
+    }
+}
+/* Mono->4ch dispatcher on *m->format; any format other than FLOAT32NE / S16NE hits pa_assert_not_reached(). */
+static void remap_mono_to_ch4_neon(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+    switch (*m->format) {
+        case PA_SAMPLE_FLOAT32NE:
+            mono_to_ch4_float_neon(dst, src, n);
+            break;
+        case PA_SAMPLE_S16NE:
+            mono_to_ch4_int16_neon(dst, src, n);
+            break;
+        default:
+            pa_assert_not_reached();
+    }
+}
+/* Stereo->mono, float: out = (L + R) * 0.5; NEON handles 4 frames per iteration, scalar tail handles n & 3 frames. */
+static inline void stereo_to_mono_float_neon(float *dst, const float *src, unsigned n) {
+    int i = n & 3; /* frames left over once the asm loop has consumed n / 4 groups */
+
+    __asm__ __volatile__ (
+        "movs       %[n], %[n], lsr #2      \n\t" /* n = number of 4-frame groups */
+        "beq        2f                      \n\t"
+
+        "vdup.f32   q2, %[halve]            \n\t" /* q2 = 0.5 in every lane, set up once outside the loop */
+
+        "1:                                 \n\t"
+        "vld2.32    {q0,q1}, [%[src]]!      \n\t" /* de-interleaving load: q0 = left, q1 = right */
+        "vadd.f32   q0, q0, q1              \n\t"
+        "vmul.f32   q0, q0, q2              \n\t"
+        "subs       %[n], %[n], #1          \n\t"
+        "vst1.32    {q0}, [%[dst]]!         \n\t"
+        "bgt        1b                      \n\t"
+
+        "2:                                 \n\t"
+
+        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) /* output operands (or input operands that get modified) */
+        : [halve] "r" (0.5f) /* input operands */
+        : "memory", "cc", "q0", "q1", "q2" /* clobber list */
+    );
+    /* dst/src were advanced by the asm block; finish the remaining frames in C. */
+    while (i--) {
+        dst[0] = (src[0] + src[1])*0.5f;
+        src += 2;
+        dst++;
+    }
+}
+/* Stereo->mono, s16: out = (L + R + 1) >> 1 (vrhadd rounding) for every frame; NEON does 8 frames per iteration, tail n & 7. */
+static inline void stereo_to_mono_int16_neon(int16_t *dst, const int16_t *src, unsigned n) {
+    int i = n & 7; /* frames left over once the asm loop has consumed n / 8 groups */
+
+    __asm__ __volatile__ (
+        "movs       %[n], %[n], lsr #3      \n\t" /* n = number of 8-frame groups */
+        "beq        2f                      \n\t"
+
+        "1:                                 \n\t"
+        "vld2.16    {q0,q1}, [%[src]]!      \n\t" /* de-interleaving load: q0 = left, q1 = right */
+        "vrhadd.s16 q0, q0, q1              \n\t" /* rounding halving add, no int16 overflow on L + R */
+        "subs       %[n], %[n], #1          \n\t"
+        "vst1.16    {q0}, [%[dst]]!         \n\t"
+        "bgt        1b                      \n\t"
+
+        "2:                                 \n\t"
+
+        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) /* output operands (or input operands that get modified) */
+        : /* input operands */
+        : "memory", "cc", "q0", "q1" /* clobber list */
+    );
+    /* Tail must round like vrhadd.s16, else a frame's value depends on its position; relies on gcc's arithmetic >> on int. */
+    while (i--) {
+        dst[0] = (src[0] + src[1] + 1) >> 1;
+        src += 2;
+        dst++;
+    }
+}
+/* Stereo->mono dispatcher on *m->format; any format other than FLOAT32NE / S16NE hits pa_assert_not_reached(). */
+static void remap_stereo_to_mono_neon(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+    switch (*m->format) {
+        case PA_SAMPLE_FLOAT32NE:
+            stereo_to_mono_float_neon(dst, src, n);
+            break;
+        case PA_SAMPLE_S16NE:
+            stereo_to_mono_int16_neon(dst, src, n);
+            break;
+        default:
+            pa_assert_not_reached();
+    }
+}
+/* 4ch->mono, float: out = (c0 + c1 + c2 + c3) * 0.25; NEON handles 2 frames per iteration, scalar tail handles n & 1. */
+static inline void ch4_to_mono_float_neon(float *dst, const float *src, unsigned n) {
+    int i = n & 1; /* at most one frame is left for the scalar tail */
+
+    __asm__ __volatile__ (
+        "movs       %[n], %[n], lsr #1      \n\t" /* n = number of frame pairs */
+        "beq        2f                      \n\t"
+
+        "vdup.f32   d4, %[quart]            \n\t" /* d4 = 0.25 in both lanes, set up once outside the loop */
+
+        "1:                                 \n\t"
+        "vld4.32    {d0,d1,d2,d3}, [%[src]]!\n\t" /* de-interleaving load: dK = channel K of two frames */
+        "vadd.f32   d0, d0, d1              \n\t"
+        "vadd.f32   d2, d2, d3              \n\t"
+        "vadd.f32   d0, d0, d2              \n\t"
+        "vmul.f32   d0, d0, d4              \n\t"
+        "subs       %[n], %[n], #1          \n\t"
+        "vst1.32    {d0}, [%[dst]]!         \n\t"
+        "bgt        1b                      \n\t"
+
+        "2:                                 \n\t"
+
+        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) /* output operands (or input operands that get modified) */
+        : [quart] "r" (0.25f) /* input operands */
+        : "memory", "cc", "d0", "d1", "d2", "d3", "d4" /* clobber list */
+    );
+    /* dst/src were advanced by the asm block; finish the remaining frame in C. */
+    while (i--) {
+        dst[0] = (src[0] + src[1] + src[2] + src[3])*0.25f;
+        src += 4;
+        dst++;
+    }
+}
+/* 4ch->mono, s16: out = rh(rh(c0, c1), rh(c2, c3)) with rh(a, b) = (a + b + 1) >> 1; NEON does 4 frames per iteration. */
+static inline void ch4_to_mono_int16_neon(int16_t *dst, const int16_t *src, unsigned n) {
+    int i = n & 3; /* frames left over once the asm loop has consumed n / 4 groups */
+
+    __asm__ __volatile__ (
+        "movs       %[n], %[n], lsr #2      \n\t" /* n = number of 4-frame groups */
+        "beq        2f                      \n\t"
+
+        "1:                                 \n\t"
+        "vld4.16    {d0,d1,d2,d3}, [%[src]]!\n\t" /* de-interleaving load: dK = channel K of four frames */
+        "vrhadd.s16 d0, d0, d1              \n\t"
+        "vrhadd.s16 d2, d2, d3              \n\t"
+        "vrhadd.s16 d0, d0, d2              \n\t"
+        "subs       %[n], %[n], #1          \n\t"
+        "vst1.16    {d0}, [%[dst]]!         \n\t"
+        "bgt        1b                      \n\t"
+
+        "2:                                 \n\t"
+
+        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) /* output operands (or input operands that get modified) */
+        : /* input operands */
+        : "memory", "cc", "d0", "d1", "d2", "d3" /* clobber list */
+    );
+    /* Tail mirrors the three vrhadd.s16 steps, so body and tail frames match; relies on gcc's arithmetic >> on int. */
+    while (i--) {
+        dst[0] = (((src[0] + src[1] + 1) >> 1) + ((src[2] + src[3] + 1) >> 1) + 1) >> 1;
+        src += 4;
+        dst++;
+    }
+}
+/* 4ch->mono dispatcher on *m->format; any format other than FLOAT32NE / S16NE hits pa_assert_not_reached(). */
+static void remap_ch4_to_mono_neon(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+    switch (*m->format) {
+        case PA_SAMPLE_FLOAT32NE:
+            ch4_to_mono_float_neon(dst, src, n);
+            break;
+        case PA_SAMPLE_S16NE:
+            ch4_to_mono_int16_neon(dst, src, n);
+            break;
+        default:
+            pa_assert_not_reached();
+    }
+}
+/* CPU flags stored by pa_remap_func_init_neon(); init_remap_neon() reads them to choose the A8 or A9 mono->stereo variant. */
+static pa_cpu_arm_flag_t arm_flags;
+/* Sets m->do_remap to a NEON routine when channel counts and map_table_i match a known layout; else leaves m untouched. */
+static void init_remap_neon(pa_remap_t *m) {
+    unsigned n_oc, n_ic;
+
+    n_oc = m->o_ss->channels;
+    n_ic = m->i_ss->channels;
+
+    /* find some common channel remappings, fall back to full matrix operation. */
+    if (n_ic == 1 && n_oc == 2 &&
+            m->map_table_i[0][0] == PA_VOLUME_NORM && m->map_table_i[1][0] == PA_VOLUME_NORM) {
+        if (arm_flags & PA_CPU_ARM_CORTEX_A8) {
+            m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_neon_a8;
+            pa_log_info("Using ARM NEON/A8 mono to stereo remapping");
+        }
+        else {
+            m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_neon_a9;
+            pa_log_info("Using ARM NEON/A9 mono to stereo remapping");
+        }
+    } else if (n_ic == 1 && n_oc == 4 &&
+            m->map_table_i[0][0] == PA_VOLUME_NORM && m->map_table_i[1][0] == PA_VOLUME_NORM &&
+            m->map_table_i[2][0] == PA_VOLUME_NORM && m->map_table_i[3][0] == PA_VOLUME_NORM) {
+        m->do_remap = (pa_do_remap_func_t) remap_mono_to_ch4_neon;
+        pa_log_info("Using ARM NEON mono to 4-channel remapping");
+    } else if (n_ic == 2 && n_oc == 1 &&
+            m->map_table_i[0][0] == PA_VOLUME_HALF && m->map_table_i[0][1] == PA_VOLUME_HALF) {
+        m->do_remap = (pa_do_remap_func_t) remap_stereo_to_mono_neon;
+        pa_log_info("Using ARM NEON stereo to mono remapping");
+    } else if (n_ic == 4 && n_oc == 1 &&
+            m->map_table_i[0][0] == PA_VOLUME_QUARTER && m->map_table_i[0][1] == PA_VOLUME_QUARTER &&
+            m->map_table_i[0][2] == PA_VOLUME_QUARTER && m->map_table_i[0][3] == PA_VOLUME_QUARTER) {
+        m->do_remap = (pa_do_remap_func_t) remap_ch4_to_mono_neon;
+        pa_log_info("Using ARM NEON 4-channel to mono remapping");
+    }
+}
+/* Public entry point: records the detected ARM CPU flags and registers init_remap_neon via pa_set_init_remap_func(). */
+void pa_remap_func_init_neon(pa_cpu_arm_flag_t flags) {
+    pa_log_info("Initialising ARM NEON optimized remappers.");
+    arm_flags = flags;
+    pa_set_init_remap_func((pa_init_remap_func_t) init_remap_neon);
+}
-- 
1.7.9.5