Hello, a benchmark of a loop running './sbcenc big_buck_bunny_480p_stereo.au > /dev/null' is listed below. The number of iterations was different for ARM and x86, so the absolute times for x86 and ARM can't be directly compared :) === ARM Cortex-A8 === before: real 0m 12.51s user 0m 11.00s sys 0m 0.58s after: real 0m 11.50s user 0m 10.04s sys 0m 0.54s === Intel Core2 === before: real 0m12.139s user 0m11.817s sys 0m0.300s after: real 0m10.074s user 0m9.797s sys 0m0.256s === The overall improvement is more visible on x86, but it is still good on ARM too. The code size increases quite noticeably, but this seems to pay off anyway. -- Best regards, Siarhei Siamashka
>From 91d800ecc5f48692588ff908ca212d70eef5d0d3 Mon Sep 17 00:00:00 2001 From: Siarhei Siamashka <siarhei.siamashka@xxxxxxxxx> Date: Wed, 21 Jan 2009 21:08:34 +0200 Subject: [PATCH] Use of -funroll-loops option to improve SBC encoder performance Added the use of -funroll-loops gcc option for SBC. Also in order to gain better effect, 'sbc_pack_frame' function body moved to an inline function, which gets instantiated for 4 different subbands/channels combinations. So that 'frame_subbands' and 'frame_channels' arguments become compile time constants and can be better optimized by the compiler. --- sbc/Makefile.am | 3 ++- sbc/sbc.c | 49 +++++++++++++++++++++++++++++++++---------------- sbc/sbc_primitives.h | 6 ++++++ 3 files changed, 41 insertions(+), 17 deletions(-) diff --git a/sbc/Makefile.am b/sbc/Makefile.am index d4ad194..3feb178 100644 --- a/sbc/Makefile.am +++ b/sbc/Makefile.am @@ -12,7 +12,8 @@ libsbc_la_SOURCES = sbc.h sbc.c sbc_math.h sbc_tables.h \ sbc_primitives.h sbc_primitives_mmx.h sbc_primitives_neon.h \ sbc_primitives.c sbc_primitives_mmx.c sbc_primitives_neon.c -libsbc_la_CFLAGS = -finline-functions -funswitch-loops -fgcse-after-reload +libsbc_la_CFLAGS = -finline-functions -funswitch-loops -fgcse-after-reload \ + -funroll-loops noinst_PROGRAMS = sbcinfo sbcdec sbcenc $(sndfile_programs) diff --git a/sbc/sbc.c b/sbc/sbc.c index 827b731..190ac17 100644 --- a/sbc/sbc.c +++ b/sbc/sbc.c @@ -731,7 +731,9 @@ static int sbc_analyze_audio(struct sbc_encoder_state *state, * -99 not implemented */ -static int sbc_pack_frame(uint8_t *data, struct sbc_frame *frame, size_t len) +static SBC_ALWAYS_INLINE int sbc_pack_frame_internal( + uint8_t *data, struct sbc_frame *frame, size_t len, + int frame_subbands, int frame_channels) { /* Bitstream writer starts from the fourth byte */ uint8_t *data_ptr = data + 4; @@ -761,7 +763,7 @@ static int sbc_pack_frame(uint8_t *data, struct sbc_frame *frame, size_t len) data[1] |= (frame->allocation & 0x01) << 1; - switch 
(frame->subbands) { + switch (frame_subbands) { case 4: /* Nothing to do */ break; @@ -776,11 +778,11 @@ static int sbc_pack_frame(uint8_t *data, struct sbc_frame *frame, size_t len) data[2] = frame->bitpool; if ((frame->mode == MONO || frame->mode == DUAL_CHANNEL) && - frame->bitpool > frame->subbands << 4) + frame->bitpool > frame_subbands << 4) return -5; if ((frame->mode == STEREO || frame->mode == JOINT_STEREO) && - frame->bitpool > frame->subbands << 5) + frame->bitpool > frame_subbands << 5) return -5; /* Can't fill in crc yet */ @@ -789,8 +791,8 @@ static int sbc_pack_frame(uint8_t *data, struct sbc_frame *frame, size_t len) crc_header[1] = data[2]; crc_pos = 16; - for (ch = 0; ch < frame->channels; ch++) { - for (sb = 0; sb < frame->subbands; sb++) { + for (ch = 0; ch < frame_channels; ch++) { + for (sb = 0; sb < frame_subbands; sb++) { frame->scale_factor[ch][sb] = 0; scalefactor[ch][sb] = 2 << SCALE_OUT_BITS; for (blk = 0; blk < frame->blocks; blk++) { @@ -812,7 +814,7 @@ static int sbc_pack_frame(uint8_t *data, struct sbc_frame *frame, size_t len) uint8_t joint = 0; frame->joint = 0; - for (sb = 0; sb < frame->subbands - 1; sb++) { + for (sb = 0; sb < frame_subbands - 1; sb++) { scale_factor_j[0] = 0; scalefactor_j[0] = 2 << SCALE_OUT_BITS; scale_factor_j[1] = 0; @@ -844,7 +846,7 @@ static int sbc_pack_frame(uint8_t *data, struct sbc_frame *frame, size_t len) (scale_factor_j[0] + scale_factor_j[1])) { /* use joint stereo for this subband */ - joint |= 1 << (frame->subbands - 1 - sb); + joint |= 1 << (frame_subbands - 1 - sb); frame->joint |= 1 << sb; frame->scale_factor[0][sb] = scale_factor_j[0]; frame->scale_factor[1][sb] = scale_factor_j[1]; @@ -858,13 +860,13 @@ static int sbc_pack_frame(uint8_t *data, struct sbc_frame *frame, size_t len) } PUT_BITS(data_ptr, bits_cache, bits_count, - joint, frame->subbands); + joint, frame_subbands); crc_header[crc_pos >> 3] = joint; - crc_pos += frame->subbands; + crc_pos += frame_subbands; } - for (ch = 0; ch < 
frame->channels; ch++) { - for (sb = 0; sb < frame->subbands; sb++) { + for (ch = 0; ch < frame_channels; ch++) { + for (sb = 0; sb < frame_subbands; sb++) { PUT_BITS(data_ptr, bits_cache, bits_count, frame->scale_factor[ch][sb] & 0x0F, 4); crc_header[crc_pos >> 3] <<= 4; @@ -881,8 +883,8 @@ static int sbc_pack_frame(uint8_t *data, struct sbc_frame *frame, size_t len) sbc_calculate_bits(frame, bits); - for (ch = 0; ch < frame->channels; ch++) { - for (sb = 0; sb < frame->subbands; sb++) { + for (ch = 0; ch < frame_channels; ch++) { + for (sb = 0; sb < frame_subbands; sb++) { levels[ch][sb] = ((1 << bits[ch][sb]) - 1) << (32 - (frame->scale_factor[ch][sb] + SCALE_OUT_BITS + 2)); @@ -893,8 +895,8 @@ static int sbc_pack_frame(uint8_t *data, struct sbc_frame *frame, size_t len) } for (blk = 0; blk < frame->blocks; blk++) { - for (ch = 0; ch < frame->channels; ch++) { - for (sb = 0; sb < frame->subbands; sb++) { + for (ch = 0; ch < frame_channels; ch++) { + for (sb = 0; sb < frame_subbands; sb++) { if (bits[ch][sb] == 0) continue; @@ -914,6 +916,21 @@ static int sbc_pack_frame(uint8_t *data, struct sbc_frame *frame, size_t len) return data_ptr - data; } +static int sbc_pack_frame(uint8_t *data, struct sbc_frame *frame, size_t len) +{ + if (frame->subbands == 4) { + if (frame->channels == 1) + return sbc_pack_frame_internal(data, frame, len, 4, 1); + else + return sbc_pack_frame_internal(data, frame, len, 4, 2); + } else { + if (frame->channels == 1) + return sbc_pack_frame_internal(data, frame, len, 8, 1); + else + return sbc_pack_frame_internal(data, frame, len, 8, 2); + } +} + static void sbc_encoder_init(struct sbc_encoder_state *state, const struct sbc_frame *frame) { diff --git a/sbc/sbc_primitives.h b/sbc/sbc_primitives.h index 91b72ee..a418ed8 100644 --- a/sbc/sbc_primitives.h +++ b/sbc/sbc_primitives.h @@ -28,6 +28,12 @@ #define SCALE_OUT_BITS 15 +#ifdef __GNUC__ +#define SBC_ALWAYS_INLINE __attribute__((always_inline)) +#else +#define SBC_ALWAYS_INLINE inline 
+#endif + struct sbc_encoder_state { int subbands; int position[2]; -- 1.5.6.5