From: Peter Meerwald <p.meerwald@xxxxxxxxxxxxxxxxxx>

---
 src/Makefile.am            |    1 +
 src/pulsecore/remap_neon.c |  233 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 234 insertions(+), 0 deletions(-)
 create mode 100644 src/pulsecore/remap_neon.c

diff --git a/src/Makefile.am b/src/Makefile.am
index 02635fa..9211ec5 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -810,6 +810,7 @@ libpulsecore_@PA_MAJORMINOR@_la_SOURCES = \
 		pulsecore/play-memchunk.c pulsecore/play-memchunk.h \
 		pulsecore/remap.c pulsecore/remap.h \
 		pulsecore/remap_mmx.c pulsecore/remap_sse.c \
+		pulsecore/remap_neon.c \
 		pulsecore/resampler.c pulsecore/resampler.h \
 		pulsecore/rtpoll.c pulsecore/rtpoll.h \
 		pulsecore/sample-util.c pulsecore/sample-util.h \
diff --git a/src/pulsecore/remap_neon.c b/src/pulsecore/remap_neon.c
new file mode 100644
index 0000000..b6377b5
--- /dev/null
+++ b/src/pulsecore/remap_neon.c
@@ -0,0 +1,233 @@
+/***
+  This file is part of PulseAudio.
+
+  Copyright 2012 Peter Meerwald <p.meerwald at bct-electronic.com>
+
+  PulseAudio is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Lesser General Public License as published
+  by the Free Software Foundation; either version 2.1 of the License,
+  or (at your option) any later version.
+
+  PulseAudio is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with PulseAudio; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+  USA.
+***/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <pulse/rtclock.h>
+#include <pulse/sample.h>
+#include <pulsecore/log.h>
+#include <pulsecore/macro.h>
+
+#include "cpu-arm.h"
+#include "remap.h"
+
+#if defined(__ARM_NEON__)
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <arm_neon.h>
+
+/* Duplicate each of the n mono input samples in src into both channels of
+ * the interleaved stereo output dst. Handles PA_SAMPLE_FLOAT32NE and
+ * PA_SAMPLE_S16NE; any other format trips pa_assert_not_reached(). */
+static void remap_mono_to_stereo_neon(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+    unsigned i;
+
+    switch (*m->format) {
+    case PA_SAMPLE_FLOAT32NE:
+    {
+        float *d = (float *) dst;
+        const float *s = (const float *) src;
+
+        /* four samples per iteration: one vector load, one interleaving store */
+        for (i = 0; i < n/4; i++) {
+            float32x4x2_t stereo;
+            stereo.val[0] = vld1q_f32(s);
+            stereo.val[1] = stereo.val[0];
+            vst2q_f32(d, stereo);
+            s += 4;
+            d += 8;
+        }
+
+        /* scalar tail for the remaining n % 4 samples */
+        for (i = n & ~3U; i < n; i++) {
+            d[0] = d[1] = s[0];
+            s++;
+            d += 2;
+        }
+        break;
+    }
+    case PA_SAMPLE_S16NE:
+    {
+        int16_t *d = (int16_t *) dst;
+        const int16_t *s = (const int16_t *) src;
+
+        /* eight samples per iteration: one vector load, one interleaving store */
+        for (i = 0; i < n/8; i++) {
+            int16x8x2_t stereo;
+            stereo.val[0] = vld1q_s16(s);
+            stereo.val[1] = stereo.val[0];
+            vst2q_s16(d, stereo);
+            s += 8;
+            d += 16;
+        }
+
+        /* scalar tail for the remaining n % 8 samples */
+        for (i = n & ~7U; i < n; i++) {
+            d[0] = d[1] = s[0];
+            s++;
+            d += 2;
+        }
+        break;
+    }
+    default:
+        pa_assert_not_reached();
+    }
+}
+
+#ifdef NO_TEST_YET_SINCE_HARD_TO_CALL_REFERENCE_IMPL
+
+#define SAMPLES 1019
+#define TIMES 1000
+
+/* Self-test for the float path: compare the NEON output against
+ * remap_mono_to_stereo_c() and log the time taken by TIMES runs of each. */
+static void run_test_float(void) {
+    float stereo[2*SAMPLES];
+    float stereo_ref[2*SAMPLES];
+    float mono[SAMPLES];
+    int i;
+    pa_usec_t start, stop;
+    pa_sample_format_t sf;
+    pa_remap_t remap;
+
+    pa_log_debug("checking NEON remap_mono_to_stereo(float, %d)", SAMPLES);
+
+    memset(stereo_ref, 0, sizeof(stereo_ref));
+    memset(stereo, 0, sizeof(stereo));
+
+    for (i = 0; i < SAMPLES; i++) {
+        mono[i] = rand()/(float) RAND_MAX - 0.5f;
+    }
+
+    sf = PA_SAMPLE_FLOAT32NE;
+    remap.format = &sf;
+    remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES);
+    remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES);
+
+    for (i = 0; i < 2*SAMPLES; i++) {
+        if (fabsf(stereo[i] - stereo_ref[i]) > 0.00001) {
+            pa_log_debug("%d: %.3f != %.3f (%.3f)", i, stereo[i], stereo_ref[i],
+                      mono[i/2]);
+        }
+    }
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+
+/* Self-test for the s16 path, same procedure as run_test_float(). */
+static void run_test_s16(void) {
+    int16_t stereo[2*SAMPLES];
+    int16_t stereo_ref[2*SAMPLES];
+    int16_t mono[SAMPLES];
+    int i;
+    pa_usec_t start, stop;
+    pa_sample_format_t sf;
+    pa_remap_t remap;
+
+    pa_log_debug("checking NEON remap_mono_to_stereo(s16, %d)", SAMPLES);
+
+    memset(stereo_ref, 0, sizeof(stereo_ref));
+    memset(stereo, 0, sizeof(stereo));
+
+    for (i = 0; i < SAMPLES; i++) {
+        /* keep the value inside int16_t range, RAND_MAX may be far larger */
+        mono[i] = (int16_t) (rand() % 65536 - 32768);
+    }
+
+    sf = PA_SAMPLE_S16NE;
+    remap.format = &sf;
+    remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES);
+    remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES);
+
+    for (i = 0; i < 2*SAMPLES; i++) {
+        if (stereo[i] != stereo_ref[i]) {
+            pa_log_debug("%d: %d != %d (%d)", i, stereo[i], stereo_ref[i],
+                      mono[i/2]);
+        }
+    }
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+#endif /* NO_TEST_YET_SINCE_HARD_TO_CALL_REFERENCE_IMPL */
+
+/* Install the NEON mono-to-stereo remapper on m when it has one input
+ * channel and two output channels; otherwise m->do_remap is left untouched. */
+static void init_remap_neon(pa_remap_t *m) {
+    unsigned n_oc, n_ic;
+
+    n_oc = m->o_ss->channels;
+    n_ic = m->i_ss->channels;
+
+    /* find some common channel remappings, fall back to full matrix operation. */
+    if (n_ic == 1 && n_oc == 2 &&
+            m->map_table_f[0][0] >= 1.0 && m->map_table_f[1][0] >= 1.0) {
+        m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_neon;
+        pa_log_info("Using ARM NEON mono to stereo remapping");
+    }
+}
+#endif /* defined (__ARM_NEON__) */
+
+/* Register init_remap_neon via pa_set_init_remap_func() when flags has
+ * PA_CPU_ARM_NEON set. The body is empty when built without __ARM_NEON__. */
+void pa_remap_func_init_neon(pa_cpu_arm_flag_t flags) {
+#if defined (__ARM_NEON__)
+
+    if (flags & PA_CPU_ARM_NEON) {
+#if defined(RUN_TEST) && defined(NO_TEST_YET_SINCE_HARD_TO_CALL_REFERENCE_IMPL)
+        /* only exercise NEON code once the CPU is known to support it */
+        run_test_float();
+        run_test_s16();
+#endif
+
+        pa_log_info("Initialising ARM NEON optimized remappers.");
+        pa_set_init_remap_func((pa_init_remap_func_t) init_remap_neon);
+    }
+
+#endif /* defined (__ARM_NEON__) */
+}
-- 
1.7.4.1