From: Peter Meerwald <p.meerwald@xxxxxxxxxxxxxxxxxx> v2: * load and store data with vld1/vld1q and vst1/vst1q, resp., to work around alignment issues of compiler-generated vldmia instruction * remove redundant check for NEON flags Ubuntu/Linaro gcc 4.6.1 arm-linux-gnueabi-gcc -O2 -mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon runtime on beagle-xm, 800 MHz checking NEON sconv_s16le_from_float(1020) NEON: 3510 usec. ref: 60731 usec. checking NEON sconv_s16le_to_float(1020) NEON: 1800 usec. ref: 10254 usec. --- src/Makefile.am | 2 +- src/pulsecore/sconv_neon.c | 188 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 189 insertions(+), 1 deletions(-) create mode 100644 src/pulsecore/sconv_neon.c diff --git a/src/Makefile.am b/src/Makefile.am index a6f9640..497618a 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -826,7 +826,7 @@ libpulsecore_@PA_MAJORMINOR@_la_SOURCES = \ pulsecore/svolume_neon.c \ pulsecore/sconv-s16be.c pulsecore/sconv-s16be.h \ pulsecore/sconv-s16le.c pulsecore/sconv-s16le.h \ - pulsecore/sconv_sse.c \ + pulsecore/sconv_sse.c pulsecore/sconv_neon.c \ pulsecore/sconv.c pulsecore/sconv.h \ pulsecore/shared.c pulsecore/shared.h \ pulsecore/sink-input.c pulsecore/sink-input.h \ diff --git a/src/pulsecore/sconv_neon.c b/src/pulsecore/sconv_neon.c new file mode 100644 index 0000000..c21d8e3 --- /dev/null +++ b/src/pulsecore/sconv_neon.c @@ -0,0 +1,188 @@ +/*** + This file is part of PulseAudio. + + Copyright 2012 Peter Meerwald <p.meerwald at bct-electronic.com> + + PulseAudio is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 2.1 of the License, + or (at your option) any later version. + + PulseAudio is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with PulseAudio; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+  USA.
+***/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <pulse/rtclock.h>
+
+#include <pulsecore/macro.h>
+#include <pulsecore/endianmacros.h>
+
+#include "cpu-arm.h"
+#include "sconv.h"
+
+#if defined(__ARM_NEON__)
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <arm_neon.h>
+
+/* When defined, pa_convert_func_init_neon() runs the self-test and benchmark
+ * below before it installs the NEON conversion functions. */
+#define RUN_TEST
+
+/* Converts n float samples from a[] to 16-bit signed integers in b[].
+ *
+ * Every input is clamped to [-1.0, 1.0] and scaled by 32767. The vector loop
+ * handles four samples per iteration; the remaining n % 4 samples are
+ * converted one at a time with lrintf().
+ *
+ * NOTE(review): the vector path rounds by adding a signed 0.5 before the
+ * float -> int conversion while the tail uses lrintf(), so exact ties may
+ * round differently between the two paths -- confirm this is acceptable. */
+static void pa_sconv_s16le_from_f32ne_neon(unsigned n, const float *a, int16_t *b) {
+    unsigned i;
+    const unsigned n4 = n & ~3U; /* largest multiple of four not above n */
+
+    const float32x4_t plusone4 = vdupq_n_f32(1.0f);
+    const float32x4_t minusone4 = vdupq_n_f32(-1.0f);
+    const float32x4_t half4 = vdupq_n_f32(0.5f);
+    const float32x4_t scale4 = vdupq_n_f32(32767.0f);
+    const uint32x4_t mask4 = vdupq_n_u32(0x80000000);
+
+    for (i = 0; i < n4; i += 4) {
+        /* clamp to [-1, 1], then scale to the int16 range */
+        const float32x4_t v4 =
+            vmulq_f32(vmaxq_f32(vminq_f32(vld1q_f32(&a[i]), plusone4), minusone4), scale4);
+
+        /* w4 is 0.5 carrying the sign bit of v4, i.e. +0.5 or -0.5 */
+        const float32x4_t w4 = vreinterpretq_f32_u32(vorrq_u32(vandq_u32(
+            vreinterpretq_u32_f32(v4), mask4), vreinterpretq_u32_f32(half4)));
+
+        /* add the signed half, convert to 32-bit integers, narrow to 16 bit
+         * and store the four results */
+        vst1_s16(&b[i], vmovn_s32(vcvtq_s32_f32(vaddq_f32(v4, w4))));
+    }
+
+    /* leftovers */
+    for ( ; i < n; i++) {
+        b[i] = (int16_t) lrintf(PA_CLAMP_UNLIKELY(a[i], -1.0f, 1.0f) * 0x7FFF);
+    }
+}
+
+/* Converts n 16-bit signed integer samples from a[] to floats in b[] by
+ * multiplying each sample with 1/32767.
+ *
+ * The vector loop widens four samples to 32 bit, converts them to float and
+ * scales them; the remaining n % 4 samples are handled by the scalar loop. */
+static void pa_sconv_s16le_to_f32ne_neon(unsigned n, const int16_t *a, float *b) {
+    unsigned i;
+    const unsigned n4 = n & ~3U; /* largest multiple of four not above n */
+    const float32x4_t invscale4 = vdupq_n_f32(1.0f / 0x7FFF);
+    const float invscale = 1.0f / 0x7FFF;
+
+    for (i = 0; i < n4; i += 4) {
+        const int16x4_t v4 = vld1_s16(&a[i]);
+        vst1q_f32(&b[i], vmulq_f32(vcvtq_f32_s32(vmovl_s16(v4)), invscale4));
+    }
+
+    /* leftovers */
+    for ( ; i < n; i++) {
+        b[i] = a[i] * invscale;
+    }
+}
+
+#ifdef RUN_TEST
+/* 1019 is not a multiple of four, so the leftover loops are exercised too. */
+#define SAMPLES 1019
+#define TIMES 300
+
+/* Self-test and benchmark for pa_sconv_s16le_from_f32ne_neon().
+ *
+ * Converts random floats with both the currently registered float -> S16LE
+ * function and the NEON function, logs every sample where the two results
+ * differ, then logs the time taken by TIMES runs of each. */
+static void run_test_from(void) {
+    int16_t samples[SAMPLES];
+    int16_t samples_ref[SAMPLES];
+    float floats[SAMPLES];
+    int i;
+    pa_usec_t start, stop;
+    pa_convert_func_t func;
+
+    pa_log_debug("checking NEON sconv_s16le_from_float");
+
+    memset(samples_ref, 0, sizeof(samples_ref));
+    memset(samples, 0, sizeof(samples));
+
+    /* values in [-1.05, 1.05] so that clamping is exercised as well */
+    for (i = 0; i < SAMPLES; i++) {
+        floats[i] = 2.1f * (rand()/(float) RAND_MAX - 0.5f);
+    }
+
+    func = (pa_convert_func_t) pa_get_convert_from_float32ne_function(PA_SAMPLE_S16LE);
+    func(SAMPLES, floats, samples_ref);
+    pa_sconv_s16le_from_f32ne_neon(SAMPLES, floats, samples);
+
+    for (i = 0; i < SAMPLES; i++) {
+        if (samples[i] != samples_ref[i]) {
+            pa_log_debug("%d: %d != %d (%f)", i, samples[i], samples_ref[i],
+                      floats[i]);
+        }
+    }
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        pa_sconv_s16le_from_f32ne_neon(SAMPLES, floats, samples);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        func(SAMPLES, floats, samples_ref);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+
+/* Self-test and benchmark for pa_sconv_s16le_to_f32ne_neon().
+ *
+ * Converts random 16-bit samples with both the currently registered
+ * S16LE -> float function and the NEON function, logs every sample where the
+ * results differ by more than 0.00001, then logs the time taken by TIMES runs
+ * of each. */
+static void run_test_to(void) {
+    int16_t samples[SAMPLES];
+    float floats[SAMPLES];
+    float floats_ref[SAMPLES];
+    int i;
+    pa_usec_t start, stop;
+    pa_convert_func_t func;
+
+    pa_log_debug("checking NEON sconv_s16le_to_float");
+
+    memset(floats_ref, 0, sizeof(floats_ref));
+    /* clear the whole array, not just its first element */
+    memset(floats, 0, sizeof(floats));
+
+    for (i = 0; i < SAMPLES; i++) {
+        /* Build a value in [-32768, 32767] from two random bytes: rand() is
+         * only guaranteed to deliver 15 random bits, and assigning an
+         * out-of-range int to int16_t is implementation-defined. */
+        const int hi = rand() & 0xFF;
+        const int lo = rand() & 0xFF;
+
+        samples[i] = (int16_t) (((hi << 8) | lo) - 0x8000);
+    }
+
+    func = (pa_convert_func_t) pa_get_convert_to_float32ne_function(PA_SAMPLE_S16LE);
+    func(SAMPLES, samples, floats_ref);
+    pa_sconv_s16le_to_f32ne_neon(SAMPLES, samples, floats);
+
+    for (i = 0; i < SAMPLES; i++) {
+        if (fabsf(floats[i] - floats_ref[i]) > 0.00001) {
+            pa_log_debug("%d: %.8f != %.8f (%d)", i, floats[i], floats_ref[i],
+                      samples[i]);
+        }
+    }
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        pa_sconv_s16le_to_f32ne_neon(SAMPLES, samples, floats);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        func(SAMPLES, samples, floats_ref);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+#endif /* RUN_TEST */
+
+#endif /* defined(__ARM_NEON__) */
+
+/* Installs the NEON float <-> S16LE conversion functions.
+ *
+ * Does nothing unless the file was compiled with __ARM_NEON__ defined. The
+ * flags argument is not inspected here. */
+void pa_convert_func_init_neon(pa_cpu_arm_flag_t flags) {
+#if defined (__ARM_NEON__)
+
+#ifdef RUN_TEST
+    /* Run the tests before the setters below, so that pa_get_convert_*()
+     * still returns the previously registered functions as the reference. */
+    run_test_from();
+    run_test_to();
+#endif
+
+    pa_log_info("Initialising ARM NEON optimized conversions.");
+    pa_set_convert_from_float32ne_function(PA_SAMPLE_S16LE, (pa_convert_func_t) pa_sconv_s16le_from_f32ne_neon);
+    pa_set_convert_to_float32ne_function(PA_SAMPLE_S16LE, (pa_convert_func_t) pa_sconv_s16le_to_f32ne_neon);
+
+#endif /* defined (__ARM_NEON__) */
+}
-- 
1.7.4.1