From: Peter Meerwald <p.meerwald@xxxxxxxxxxxxxxxxxx>

v4:
* fix for sample length < 4
v3:
* convert from intrinsics to inline assembly
v2:
* load and store data with vld1/vld1q and vst1/vst1q, resp., to work around
  alignment issues of compiler-generated vldmia instruction
* remove redundant check for NEON flags

Ubuntu/Linaro gcc 4.6.3
arm-linux-gnueabi-gcc -O2 -mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon
runtime on beagle-xm:

D: [pulseaudio] sconv_neon.c: checking NEON sconv_s16le_from_float
I: [pulseaudio] sconv_neon.c: NEON: 3754 usec.
I: [pulseaudio] sconv_neon.c: ref: 58594 usec.
D: [pulseaudio] sconv_neon.c: checking NEON sconv_s16le_to_float
I: [pulseaudio] sconv_neon.c: NEON: 1831 usec.
I: [pulseaudio] sconv_neon.c: ref: 10528 usec.
I: [pulseaudio] sconv_neon.c: Initialising ARM NEON optimized conversions.

conversion may be off by one for some samples due to rounding issues

Signed-off-by: Peter Meerwald <p.meerwald at bct-electronic.com>
---
 src/Makefile.am            |    2 +-
 src/pulsecore/sconv_neon.c |  213 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 214 insertions(+), 1 deletion(-)
 create mode 100644 src/pulsecore/sconv_neon.c

diff --git a/src/Makefile.am b/src/Makefile.am
index 6212c74..0ca1ec2 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -850,7 +850,7 @@ libpulsecore_@PA_MAJORMINOR@_la_SOURCES = \
 		pulsecore/svolume_mmx.c pulsecore/svolume_sse.c \
 		pulsecore/sconv-s16be.c pulsecore/sconv-s16be.h \
 		pulsecore/sconv-s16le.c pulsecore/sconv-s16le.h \
-		pulsecore/sconv_sse.c \
+		pulsecore/sconv_sse.c pulsecore/sconv_neon.c \
 		pulsecore/sconv.c pulsecore/sconv.h \
 		pulsecore/shared.c pulsecore/shared.h \
 		pulsecore/sink-input.c pulsecore/sink-input.h \
diff --git a/src/pulsecore/sconv_neon.c b/src/pulsecore/sconv_neon.c
new file mode 100644
index 0000000..96e91ca
--- /dev/null
+++ b/src/pulsecore/sconv_neon.c
@@ -0,0 +1,213 @@
+/***
+  This file is part of PulseAudio.
+
+  Copyright 2012 Peter Meerwald <p.meerwald at bct-electronic.com>
+
+  PulseAudio is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Lesser General Public License as published
+  by the Free Software Foundation; either version 2.1 of the License,
+  or (at your option) any later version.
+
+  PulseAudio is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+***/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <pulse/rtclock.h>
+
+#include <pulsecore/macro.h>
+#include <pulsecore/endianmacros.h>
+
+#include "cpu-arm.h"
+#include "sconv.h"
+
+#if defined(__ARM_NEON__)
+
+#include <math.h>
+#include <arm_neon.h>
+
+#define RUN_TEST
+
+/* Convert n floats in src to s16 in dst: clamp to [-1, 1], scale by 32767, round; NEON handles 4 samples per iteration. */
+static void pa_sconv_s16le_from_f32ne_neon(unsigned n, const float *src, int16_t *dst) {
+    unsigned i = n & 3; /* samples left over once the 4-sample blocks are done */
+
+    asm volatile (
+        "movs       %[n], %[n], lsr #2      \n\t" /* n = number of 4-sample blocks */
+        "beq        2f                      \n\t" /* no full block: skip the vector loop */
+        "vdup.f32   q2, %[plusone]          \n\t" /* q2 = +1.0 in every lane */
+        "vneg.f32   q3, q2                  \n\t" /* q3 = -1.0 in every lane */
+        "vdup.f32   q4, %[scale]            \n\t"
+        "vdup.u32   q5, %[mask]             \n\t" /* q5 = sign-bit mask */
+        "vdup.f32   q6, %[half]             \n\t"
+        "1:                                 \n\t"
+        "vld1.32    {q0}, [%[src]]!         \n\t" /* load 4 floats, advance src */
+        "vmin.f32   q0, q0, q2              \n\t" /* clamp */
+        "vmax.f32   q0, q0, q3              \n\t"
+        "vmul.f32   q0, q0, q4              \n\t" /* scale */
+        "vand.u32   q1, q0, q5              \n\t" /* q1 = sign bit of each sample */
+        "vorr.u32   q1, q1, q6              \n\t" /* round: q1 = 0.5 carrying the sample's sign */
+        "vadd.f32   q0, q0, q1              \n\t"
+        "vcvt.s32.f32 q0, q0                \n\t" /* to int32, then narrow */
+        "vmovn.i32  d0, q0                  \n\t"
+        "subs       %[n], %[n], #1          \n\t"
+        "vst1.16    {d0}, [%[dst]]!         \n\t" /* store 4 int16, advance dst */
+        "bgt        1b                      \n\t"
+        "2:                                 \n\t"
+        /* output operands (or input operands that get modified) */
+        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
+        : [plusone] "r" (1.0f), [scale] "r" (32767.0f), [half] "r" (0.5f), [mask] "r" (0x80000000) /* input operands */
+        : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6" /* clobber list */
+    );
+
+    // leftovers: the remaining n & 3 samples, converted one at a time
+    while (i--) {
+        *dst++ = (int16_t) lrintf(PA_CLAMP_UNLIKELY(*src, -1.0f, 1.0f) * 0x7FFF);
+        src++;
+    }
+}
+
+/* Convert n s16 samples in src to floats in dst by scaling with 1/32767; NEON handles 4 samples per iteration. */
+static void pa_sconv_s16le_to_f32ne_neon(unsigned n, const int16_t *src, float *dst) {
+    unsigned i = n & 3; /* samples left over once the 4-sample blocks are done */
+    const float invscale = 1.0f / 0x7FFF;
+
+    asm volatile (
+        "movs        %[n], %[n], lsr #2     \n\t" /* n = number of 4-sample blocks */
+        "beq         2f                     \n\t" /* no full block: skip the vector loop */
+        "vdup.f32    q1, %[invscale]        \n\t"
+        "1:                                 \n\t"
+        "vld1.16     {d0}, [%[src]]!        \n\t" /* load 4 int16, advance src */
+        "vmovl.s16   q0, d0                 \n\t" /* sign-extend to int32 */
+        "vcvt.f32.s32 q0, q0                \n\t"
+        "vmul.f32    q0, q0, q1             \n\t"
+        "subs        %[n], %[n], #1         \n\t"
+        "vst1.32     {q0}, [%[dst]]!        \n\t" /* store 4 floats, advance dst */
+        "bgt         1b                     \n\t"
+        "2:                                 \n\t"
+        /* output operands (or input operands that get modified) */
+        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
+        : [invscale] "r" (invscale) /* input operands */
+        : "memory", "cc", "q0", "q1" /* clobber list */
+    );
+
+    // leftovers: the remaining n & 3 samples, converted one at a time
+    while (i--) {
+        *dst++ = *src++ * invscale;
+    }
+}
+
+#ifdef RUN_TEST
+#define SAMPLES 1019
+#define TIMES 300
+
+/* Check the NEON float->s16 converter against the registered reference converter and log the runtime of both. */
+static void run_test_from(void) {
+    int16_t samples[SAMPLES];
+    int16_t samples_ref[SAMPLES];
+    float floats[SAMPLES];
+    int i;
+    pa_usec_t start, stop;
+    pa_convert_func_t func;
+
+    pa_log_debug("checking NEON sconv_s16le_from_float");
+
+    memset(samples_ref, 0, sizeof(samples_ref));
+    memset(samples, 0, sizeof(samples));
+
+    for (i = 0; i < SAMPLES; i++) {
+        floats[i] = 2.1f * (rand()/(float) RAND_MAX - 0.5f); /* slightly beyond [-1, 1] to exercise clamping */
+    }
+
+    func = (pa_convert_func_t) pa_get_convert_from_float32ne_function(PA_SAMPLE_S16LE);
+    func(SAMPLES, floats, samples_ref);
+    pa_sconv_s16le_from_f32ne_neon(SAMPLES, floats, samples);
+
+    for (i = 0; i < SAMPLES; i++) {
+        if (abs(samples[i] - samples_ref[i]) > 0) { /* report only the first mismatch */
+            pa_log_debug("%d: %d != %d (%f)", i, samples[i], samples_ref[i], floats[i]);
+            break;
+        }
+    }
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        pa_sconv_s16le_from_f32ne_neon(SAMPLES, floats, samples);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        func(SAMPLES, floats, samples_ref);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+
+/* Check the NEON s16->float converter against the registered reference converter and log the runtime of both. */
+static void run_test_to(void) {
+    int16_t samples[SAMPLES];
+    float floats[SAMPLES];
+    float floats_ref[SAMPLES];
+    int i;
+    pa_usec_t start, stop;
+    pa_convert_func_t func;
+
+    pa_log_debug("checking NEON sconv_s16le_to_float");
+
+    memset(floats_ref, 0, sizeof(floats_ref));
+    memset(floats, 0, sizeof(floats)); /* whole array, not just the first element */
+
+    for (i = 0; i < SAMPLES; i++) {
+        samples[i] = rand() - RAND_MAX/2;
+    }
+
+    func = (pa_convert_func_t) pa_get_convert_to_float32ne_function(PA_SAMPLE_S16LE);
+    func(SAMPLES, samples, floats_ref);
+    pa_sconv_s16le_to_f32ne_neon(SAMPLES, samples, floats);
+
+    for (i = 0; i < SAMPLES; i++) {
+        if (fabsf(floats[i] - floats_ref[i]) > 0.00001) { /* report only the first mismatch */
+            pa_log_debug("%d: %.8f != %.8f (%d)", i, floats[i], floats_ref[i], samples[i]);
+            break;
+        }
+    }
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        pa_sconv_s16le_to_f32ne_neon(SAMPLES, samples, floats);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        func(SAMPLES, samples, floats_ref);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+#endif /* RUN_TEST */
+
+#endif /* defined(__ARM_NEON__) */
+
+/* Install the NEON converters for PA_SAMPLE_S16LE; does nothing unless built with __ARM_NEON__. flags is not examined. */
+void pa_convert_func_init_neon(pa_cpu_arm_flag_t flags) {
+#if defined (__ARM_NEON__)
+#ifdef RUN_TEST
+    run_test_from();
+    run_test_to();
+#endif
+
+    pa_log_info("Initialising ARM NEON optimized conversions.");
+    pa_set_convert_from_float32ne_function(PA_SAMPLE_S16LE, (pa_convert_func_t) pa_sconv_s16le_from_f32ne_neon);
+    pa_set_convert_to_float32ne_function(PA_SAMPLE_S16LE, (pa_convert_func_t) pa_sconv_s16le_to_f32ne_neon);
+
+#endif /* defined (__ARM_NEON__) */
+}
-- 
1.7.9.5