From: Peter Meerwald <p.meerwald@xxxxxxxxxxxxxxxxxx> The C s16_to_float conversion performs: flt = sample / (float) 0x7fff. Floating-point division is expensive, and the obvious solution is to multiply by the inverse: flt = sample * (1.0f / 0x7fff). However, the results differ slightly for 1536 input values. This patch checks for input values that would produce a mismatch and corrects the output accordingly. Signed-off-by: Peter Meerwald <p.meerwald at bct-electronic.com> --- src/pulsecore/sconv_neon.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/pulsecore/sconv_neon.c b/src/pulsecore/sconv_neon.c index fd45965..40312b0 100644 --- a/src/pulsecore/sconv_neon.c +++ b/src/pulsecore/sconv_neon.c @@ -75,16 +75,23 @@ static void pa_sconv_s16le_to_f32ne_neon(unsigned n, const int16_t *src, float * const float invscale = 1.0f / 0x7FFF; __asm__ __volatile__ ( - "movs %[n], %[n], lsr #2 \n\t" + "movs %[n], %[n], lsr #2 \n\t" "beq 2f \n\t" "vdup.f32 q1, %[invscale] \n\t" + "vdup.u16 q3, %[mask] \n\t" + "vdup.u32 q4, %[one] \n\t" "1: \n\t" - "vld1.16 {d0}, [%[src]]! \n\t" - "vmovl.s16 q0, d0 \n\t" - "vcvt.f32.s32 q0, q0 \n\t" - "vmul.f32 q0, q0, q1 \n\t" + "vld1.16 {d0}, [%[src]]! \n\t" /* load x */ + "vmovl.s16 q0, d0 \n\t" /* s16 -> s32 */ + "vcvt.f32.s32 q0, q0 \n\t" /* s32 -> float */ + + "vceq.u16 q2, q0, q3 \n\t" /* check for defect */ + "vand.u32 q2, q2, q4 \n\t" /* prepare 1 if defect */ + + "vmul.f32 q0, q0, q1 \n\t" /* multiply by invscale */ + "vadd.u32 q0, q0, q2 \n\t" /* correct if defect */ "subs %[n], %[n], #1 \n\t" "vst1.32 {q0}, [%[dst]]! 
\n\t" "bgt 1b \n\t" @@ -92,13 +99,13 @@ static void pa_sconv_s16le_to_f32ne_neon(unsigned n, const int16_t *src, float * "2: \n\t" : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) /* output operands (or input operands that get modified) */ - : [invscale] "r" (invscale) /* input operands */ - : "memory", "cc", "q0", "q1" /* clobber list */ + : [invscale] "r" (invscale), [mask] "r" (0x4000), [one] "r" (1) /* input operands */ + : "memory", "cc", "q0", "q1", "q2", "q3", "q4" /* clobber list */ ); /* leftovers */ while (i--) { - *dst++ = *src++ * invscale; + *dst++ = *src++ / (float) 0x7fff; } } -- 1.7.9.5