1) convert default processing functions to __attribute__((weak)) so they can
   be overridden with architecture specific accelerated functions (e.g. NEON,
   MMX, Altivec, etc)
2) override gst_audio_quantize_quantize_signed_tpdf_none() to use NEON vector
   instructions
3) override gst_audio_convert_unpack_float_le() to use NEON vector
   instructions

This speeds up audioconvert ~10x, at least for the 32b float -> 16b int
conversion needed to play AC-3 audio (e.g. DVDs) via ALSA.
---
 gst/audioconvert/Makefile.am        |    1 +
 gst/audioconvert/armv7.c            |  216 +++++++++++++++++++++++++++++++++++
 gst/audioconvert/audioconvert.c     |   20 ++--
 gst/audioconvert/gstaudioquantize.c |    4 +-
 gst/audioconvert/gstchannelmix.c    |    4 +-
 5 files changed, 231 insertions(+), 14 deletions(-)
 create mode 100644 gst/audioconvert/armv7.c

diff --git a/gst/audioconvert/Makefile.am b/gst/audioconvert/Makefile.am
index 94978bb..2d273db 100644
--- a/gst/audioconvert/Makefile.am
+++ b/gst/audioconvert/Makefile.am
@@ -5,6 +5,7 @@ libgstaudioconvert_la_SOURCES = \
 	audioconvert.c \
 	gstchannelmix.c \
 	gstaudioquantize.c \
+	armv7.c \
 	plugin.c
 
 libgstaudioconvert_la_CFLAGS = $(GST_PLUGINS_BASE_CFLAGS) $(GST_BASE_CFLAGS) $(GST_CFLAGS)
diff --git a/gst/audioconvert/armv7.c b/gst/audioconvert/armv7.c
new file mode 100644
index 0000000..e39d29d
--- /dev/null
+++ b/gst/audioconvert/armv7.c
@@ -0,0 +1,216 @@
+/* GStreamer
+ *
+ * Copyright (C) 2009 Texas Instruments, Inc - http://www.ti.com/
+ *
+ * Description: NEON/VFP accelerated functions for armv7 architecture
+ *  Created on: Aug 8, 2009
+ *      Author: Rob Clark <rob at ti.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifdef __ARM_NEON__
+#include <arm_neon.h>
+#include <string.h>
+
+#include "audioconvert.h"
+
+
+/* NEON version of the signed TPDF dither quantizer (no noise shaping);
+ * this strong definition overrides the weak generic implementation.
+ *
+ *   ctx:   conversion context (out_scale and out.channels are used)
+ *   src:   input samples, count * channels words
+ *   dst:   output samples, count * channels words
+ *   count: number of frames to process
+ */
+void
+gst_audio_quantize_quantize_signed_tpdf_none (AudioConvertCtx *ctx,
+    gint32 *src, gint32 *dst, gint count)
+{
+  /* one random generator state per vector lane: */
+  static guint32 state[4] = {
+      0xdeadbeef,
+      0x305b8cc9,
+      0x6c46ec93,
+      0xad13b0cd
+  };
+
+  gint scale = ctx->out_scale;
+  count *= ctx->out.channels;
+
+  if (scale > 0) {
+    guint32 mask = 0xffffffff & (0xffffffff << scale);
+    guint32 bias = (1U << (scale - 1)) >> 1;
+    gint32 dither = (1<<(scale - 1));
+
+    int32x4_t  vrand;
+    uint32x4_t vstate;
+    uint32x4_t v12345;
+    int32x4_t  vtmp;
+    uint32x4_t vmask;
+
+    vstate = vld1q_u32 (state);
+    v12345 = vmovq_n_u32 (12345);
+    vmask  = vmovq_n_u32 (mask);
+
+    /* use vector instructions to do everything 4x at a time.  A final
+     * group of 1-3 words is bounced through 'tmpbuf', so that we neither
+     * read past the end of 'src' nor write past the end of 'dst':
+     */
+    while (count > 0) {
+      gint32     tmpbuf[4] = { 0, 0, 0, 0 };
+      gint       n = MIN (count, 4);
+      int64x2_t  vtmp_lo;
+      int64x2_t  vtmp_hi;
+      uint32x4_t vstate2;
+      int32x2_t  vrand_lo;
+      int32x2_t  vrand_hi;
+
+      /* generate next eight random words: (see gst_fast_random_uint32())
+       *
+       *    state = state * 1103515245 + 12345
+       */
+      vstate2 = vmulq_n_u32 (vstate, 1103515245);
+      vstate2 = vaddq_u32 (vstate2, v12345);
+      vstate  = vmulq_n_u32 (vstate2, 1103515245);
+      vstate  = vaddq_u32 (vstate, v12345);
+
+      /* generate next four scaled random values:
+       *
+       *    gint32 start = bias - dither;
+       *    gint32 end = bias + dither - 1;
+       *    gint64 tmp1 = gst_fast_random_uint32 ();
+       *    gint64 tmp2 = gst_fast_random_uint32 ();
+       *    rand = (gint32)(((tmp1+tmp2) * (end - start)) / (1LLU<<32) + start);
+       *
+       * need to split vstate and vstate2 into 2*2 int64x2_t and add....
+       */
+      vstate2 = vaddq_u32 (vstate, vstate2);       /* tmp1+tmp2 */
+      vtmp_lo = vreinterpretq_s64_u64 (            /* * (end-start) */
+          vmull_n_u32 (vget_low_u32 (vstate2), (2*dither) - 1));
+      vtmp_hi = vreinterpretq_s64_u64 (            /* * (end-start) */
+          vmull_n_u32 (vget_high_u32 (vstate2), (2*dither) - 1));
+
+      vtmp_lo = vshrq_n_s64 (vtmp_lo, 32);         /* / (1LLU<<32) */
+      vtmp_hi = vshrq_n_s64 (vtmp_hi, 32);         /* / (1LLU<<32) */
+
+      /* now want to put vtmp_hi and vtmp_lo back together..
+       * then add 'start' (bias-dither).. which is negative..
+       */
+      vrand_lo = vmovn_s64 (vtmp_lo);
+      vrand_hi = vmovn_s64 (vtmp_hi);
+      vrand    = vcombine_s32 (vrand_lo, vrand_hi);
+      vrand    = vaddq_s32 (vrand, vmovq_n_s32 (bias-dither));
+
+      /* load next n words:
+       */
+      if (n == 4) {
+        vtmp = vld1q_s32 (src);
+      } else {
+        memcpy (tmpbuf, src, n * sizeof (gint32));
+        vtmp = vld1q_s32 (tmpbuf);
+      }
+
+      /* perform saturating add of random noise...  we don't want the
+       * value to wrap around (vqaddq saturates in both directions):
+       */
+      vtmp = vqaddq_s32 (vtmp, vrand);
+      vtmp = vreinterpretq_s32_u32 (
+          vandq_u32 (vreinterpretq_u32_s32 (vtmp), vmask));
+
+      /* store n words to result:
+       */
+      if (n == 4) {
+        vst1q_s32 (dst, vtmp);
+      } else {
+        vst1q_s32 (tmpbuf, vtmp);
+        memcpy (dst, tmpbuf, n * sizeof (gint32));
+      }
+
+      src += n;
+      dst += n;
+      count -= n;
+    }
+
+    vst1q_u32 (state, vstate);
+
+  } else if (count > 0) {
+    /* nothing to quantize, just copy.  memmove() takes a size in bytes,
+     * not in words, and copes with overlapping (or identical) buffers:
+     */
+    memmove (dst, src, count * sizeof (gint32));
+  }
+}
+
+/* NEON version of the little-endian float to gint32 unpack function;
+ * this strong definition overrides the weak generic implementation.
+ *
+ *   src:   input samples, count floats
+ *   dst:   output samples, count words
+ *   s:     not used
+ *   count: number of samples to process
+ */
+void
+gst_audio_convert_unpack_float_le (gfloat * src, gint32 * dst, gint s, gint count)
+{
+  float32x4_t vsrc;
+  float32x4_t v05;
+  int32x4_t   vdst;
+
+  v05 = vmovq_n_f32 (0.5);
+
+  /* do everything 4x at a time.  A final group of 1-3 samples is bounced
+   * through 'tmpsrc'/'tmpdst', so that we neither read past the end of
+   * 'src' nor write past the end of 'dst':
+   */
+  while (count > 0) {
+    gfloat tmpsrc[4] = { 0, 0, 0, 0 };
+    gint32 tmpdst[4];
+    gint   n = MIN (count, 4);
+
+    /* load next n words:
+     */
+    if (n == 4) {
+      vsrc = vld1q_f32 ((float32_t *)src);
+    } else {
+      memcpy (tmpsrc, src, n * sizeof (gfloat));
+      vsrc = vld1q_f32 ((float32_t *)tmpsrc);
+    }
+
+    /* convert to int:
+     */
+    vsrc = vmulq_n_f32 (vsrc, 2147483647.0);
+    vsrc = vaddq_f32 (vsrc, v05);
+    vdst = vcvtq_s32_f32 (vsrc);
+
+    /* store n words to result:
+     */
+    if (n == 4) {
+      vst1q_s32 (dst, vdst);
+    } else {
+      vst1q_s32 (tmpdst, vdst);
+      memcpy (dst, tmpdst, n * sizeof (gint32));
+    }
+
+    src += n;
+    dst += n;
+    count -= n;
+  }
+}
+
+
+#endif
diff --git a/gst/audioconvert/audioconvert.c b/gst/audioconvert/audioconvert.c
index 4780324..c18d217 100644
--- a/gst/audioconvert/audioconvert.c
+++ b/gst/audioconvert/audioconvert.c
@@ -38,11 +38,11 @@
  * unpack code
  */
 #define MAKE_UNPACK_FUNC_NAME(name)                                     \
-audio_convert_unpack_##name
+gst_audio_convert_unpack_##name
 
 /* unpack from integer to signed integer 32 */
 #define MAKE_UNPACK_FUNC_II(name, stride, sign, READ_FUNC)              \
-static void                                                             \
+void __attribute__((weak))                                              \
 MAKE_UNPACK_FUNC_NAME (name) (guint8 *src, gint32 *dst,                 \
         gint scale, gint count)                                         \
 {                                                                       \
@@ -54,7 +54,7 @@ MAKE_UNPACK_FUNC_NAME (name) (guint8 *src, gint32 *dst,                 \
 
 /* unpack from float to signed integer 32 */
 #define MAKE_UNPACK_FUNC_FI(name, type, READ_FUNC)                            \
-static void                                                                   \
+void __attribute__((weak))                                                    \
 MAKE_UNPACK_FUNC_NAME (name) (type * src, gint32 * dst, gint s, gint count)   \
 {                                                                             \
   gdouble temp;                                                               \
@@ -68,7 +68,7 @@ MAKE_UNPACK_FUNC_NAME (name) (type * src, gint32 * dst, gint s, gint count)   \
 
 /* unpack from float to float 64 (double) */
 #define MAKE_UNPACK_FUNC_FF(name, type, FUNC)                                 \
-static void                                                                   \
+void __attribute__((weak))                                                    \
 MAKE_UNPACK_FUNC_NAME (name) (type * src, gdouble * dst, gint s,              \
     gint count)                                                               \
 {                                                                             \
@@ -78,7 +78,7 @@ MAKE_UNPACK_FUNC_NAME (name) (type * src, gdouble * dst, gint s,              \
 
 /* unpack from int to float 64 (double) */
 #define MAKE_UNPACK_FUNC_IF(name, stride, sign, READ_FUNC)                    \
-static void                                                                   \
+void __attribute__((weak))                                                    \
 MAKE_UNPACK_FUNC_NAME (name) (guint8 * src, gdouble * dst, gint scale,        \
     gint count)                                                               \
 {                                                                             \
@@ -158,7 +158,7 @@ audio_convert_pack_##name
 
 /* pack from signed integer 32 to integer */
 #define MAKE_PACK_FUNC_II(name, stride, sign, WRITE_FUNC)               \
-static void                                                             \
+void __attribute__((weak))                                              \
 MAKE_PACK_FUNC_NAME (name) (gint32 *src, guint8 * dst,                  \
         gint scale, gint count)                                         \
 {                                                                       \
@@ -172,7 +172,7 @@ MAKE_PACK_FUNC_NAME (name) (gint32 *src, guint8 * dst,                  \
 
 /* pack from signed integer 32 to float */
 #define MAKE_PACK_FUNC_IF(name, type, FUNC)                             \
-static void                                                             \
+void __attribute__((weak))                                              \
 MAKE_PACK_FUNC_NAME (name) (gint32 * src, type * dst, gint scale,       \
     gint count)                                                         \
 {                                                                       \
@@ -182,7 +182,7 @@ MAKE_PACK_FUNC_NAME (name) (gint32 * src, type * dst, gint scale,       \
 
 /* pack from float 64 (double) to float */
 #define MAKE_PACK_FUNC_FF(name, type, FUNC)                             \
-static void                                                             \
+void __attribute__((weak))                                              \
 MAKE_PACK_FUNC_NAME (name) (gdouble * src, type * dst, gint s,          \
     gint count)                                                         \
 {                                                                       \
@@ -194,7 +194,7 @@ MAKE_PACK_FUNC_NAME (name) (gdouble * src, type * dst, gint s,          \
  * the floats are already in the correct range. Only a cast is needed.
  */
 #define MAKE_PACK_FUNC_FI_S(name, stride, WRITE_FUNC)                   \
-static void                                                             \
+void __attribute__((weak))                                              \
 MAKE_PACK_FUNC_NAME (name) (gdouble * src, guint8 * dst, gint scale,    \
     gint count)                                                         \
 {                                                                       \
@@ -212,7 +212,7 @@ MAKE_PACK_FUNC_NAME (name) (gdouble * src, guint8 * dst, gint scale,    \
  * and an addition of 2^(target_depth-1) to get in the correct unsigned
  * range. */
 #define MAKE_PACK_FUNC_FI_U(name, stride, WRITE_FUNC)                   \
-static void                                                             \
+void __attribute__((weak))                                              \
 MAKE_PACK_FUNC_NAME (name) (gdouble * src, guint8 * dst, gint scale,    \
     gint count)                                                         \
 {                                                                       \
diff --git a/gst/audioconvert/gstaudioquantize.c b/gst/audioconvert/gstaudioquantize.c
index 2155397..be959c4 100644
--- a/gst/audioconvert/gstaudioquantize.c
+++ b/gst/audioconvert/gstaudioquantize.c
@@ -46,7 +46,7 @@ gst_audio_quantize_quantize_##name
 
 #define MAKE_QUANTIZE_FUNC_I(name, DITHER_INIT_FUNC, ADD_DITHER_FUNC,   \
                              ROUND_FUNC)                                \
-static void                                                             \
+void __attribute__((weak))                                              \
 MAKE_QUANTIZE_FUNC_NAME (name) (AudioConvertCtx *ctx, gint32 *src,      \
                                 gint32 *dst, gint count)                \
 {                                                                       \
@@ -86,7 +86,7 @@ MAKE_QUANTIZE_FUNC_NAME (name) (AudioConvertCtx *ctx, gint32 *src,      \
 #define MAKE_QUANTIZE_FUNC_F(name, DITHER_INIT_FUNC, NS_INIT_FUNC,      \
                              ADD_NS_FUNC, ADD_DITHER_FUNC,              \
                              UPDATE_ERROR_FUNC)                         \
-static void                                                             \
+void __attribute__((weak))                                              \
 MAKE_QUANTIZE_FUNC_NAME (name) (AudioConvertCtx *ctx, gdouble *src,     \
                                 gdouble *dst, gint count)               \
 {                                                                       \
diff --git a/gst/audioconvert/gstchannelmix.c b/gst/audioconvert/gstchannelmix.c
index 0f9b945..aac8957 100644
--- a/gst/audioconvert/gstchannelmix.c
+++ b/gst/audioconvert/gstchannelmix.c
@@ -659,7 +659,7 @@ gst_channel_mix_passthrough (AudioConvertCtx * this)
 
 /* IMPORTANT: out_data == in_data is possible, make sure to not overwrite data
  * you might need later on! */
-void
+void __attribute__((weak))
 gst_channel_mix_mix_int (AudioConvertCtx * this,
     gint32 * in_data, gint32 * out_data, gint samples)
 {
@@ -698,7 +698,7 @@ gst_channel_mix_mix_int (AudioConvertCtx * this,
   }
 }
 
-void
+void __attribute__((weak))
 gst_channel_mix_mix_float (AudioConvertCtx * this,
     gdouble * in_data, gdouble * out_data, gint samples)
 {
-- 
1.6.3.2