[PATCH 1/3] WIP sconv: fix ARM NEON s16_to_float conversion

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Peter Meerwald <p.meerwald@xxxxxxxxxxxxxxxxxx>

The C implementation of s16_to_float performs:
    flt = sample / (float) 0x7fff

Floating-point division is expensive, and the obvious solution is to
multiply by the inverse instead:
    flt = sample * (1.0f / 0x7fff)

However, the results of the two computations differ slightly for 1536 input values.

This patch checks for input values that would produce a mismatch and
corrects the output accordingly, so that the NEON result matches the C one.

Signed-off-by: Peter Meerwald <p.meerwald at bct-electronic.com>
---
 src/pulsecore/sconv_neon.c |   23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/src/pulsecore/sconv_neon.c b/src/pulsecore/sconv_neon.c
index fd45965..40312b0 100644
--- a/src/pulsecore/sconv_neon.c
+++ b/src/pulsecore/sconv_neon.c
@@ -75,16 +75,23 @@ static void pa_sconv_s16le_to_f32ne_neon(unsigned n, const int16_t *src, float *
     const float invscale = 1.0f / 0x7FFF;
 
     __asm__ __volatile__ (
-        "movs        %[n], %[n], lsr #2     \n\t"
+        "movs       %[n], %[n], lsr #2      \n\t"
         "beq        2f                      \n\t"
 
         "vdup.f32   q1, %[invscale]         \n\t"
+        "vdup.u16   q3, %[mask]             \n\t"
+        "vdup.u32   q4, %[one]              \n\t"
 
         "1:                                 \n\t"
-        "vld1.16    {d0}, [%[src]]!         \n\t"
-        "vmovl.s16  q0, d0                  \n\t"
-        "vcvt.f32.s32 q0, q0                \n\t"
-        "vmul.f32   q0, q0, q1              \n\t"
+        "vld1.16    {d0}, [%[src]]!         \n\t" /* load x */
+        "vmovl.s16  q0, d0                  \n\t" /* s16 -> s32 */
+        "vcvt.f32.s32 q0, q0                \n\t" /* s32 -> float */
+
+        "vceq.u16   q2, q0, q3              \n\t" /* check for defect */
+        "vand.u32   q2, q2, q4              \n\t" /* prepare 1 if defect */
+
+        "vmul.f32   q0, q0, q1              \n\t" /* multiply by invscale */
+        "vadd.u32   q0, q0, q2              \n\t" /* correct if defect */
         "subs       %[n], %[n], #1          \n\t"
         "vst1.32    {q0}, [%[dst]]!         \n\t"
         "bgt        1b                      \n\t"
@@ -92,13 +99,13 @@ static void pa_sconv_s16le_to_f32ne_neon(unsigned n, const int16_t *src, float *
         "2:                                 \n\t"
 
         : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) /* output operands (or input operands that get modified) */
-        : [invscale] "r" (invscale) /* input operands */
-        : "memory", "cc", "q0", "q1" /* clobber list */
+        : [invscale] "r" (invscale), [mask] "r" (0x4000), [one] "r" (1) /* input operands */
+        : "memory", "cc", "q0", "q1", "q2", "q3", "q4" /* clobber list */
     );
 
     /* leftovers */
     while (i--) {
-        *dst++ = *src++ * invscale;
+        *dst++ = *src++ / (float) 0x7fff;
     }
 }
 
-- 
1.7.9.5



[Index of Archives]     [Linux Audio Users]     [AMD Graphics]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux