From: Peter Meerwald <p.meerwald@xxxxxxxxxxxxxxxxxx>

The generic matrix remapping is rather inefficient; special-case code
improves performance by 3x easily.

v4: split into s16 and float code, 4-channel remapping
v3: fix remap_mono_to_stereo_c(), use assignment
v2: use consistent array addressing

on Intel Core i7-870 @ 2.93 GHz (GCC 4.6, 64-bit):

Checking special remap (float, mono->stereo)
func: 70392 usec (avg: 703.92, min = 583, max = 1879, stddev = 295.192).
orig: 193042 usec (avg: 1930.42, min = 1457, max = 2269, stddev = 89.9045).
Checking special remap (float, mono->4-channel)
func: 118408 usec (avg: 1184.08, min = 1151, max = 1454, stddev = 57.1244).
orig: 380074 usec (avg: 3800.74, min = 3740, max = 4180, stddev = 96.3389).
Checking special remap (s16, mono->stereo)
func: 60574 usec (avg: 605.74, min = 582, max = 659, stddev = 20.7681).
orig: 188262 usec (avg: 1882.62, min = 1804, max = 2167, stddev = 79.17).
Checking special remap (s16, mono->4-channel)
func: 120331 usec (avg: 1203.31, min = 1151, max = 1429, stddev = 55.2863).
orig: 376028 usec (avg: 3760.28, min = 3609, max = 4096, stddev = 122.043).
Checking special remap (float, stereo->mono)
func: 61408 usec (avg: 614.08, min = 580, max = 867, stddev = 50.933).
orig: 186484 usec (avg: 1864.84, min = 1808, max = 2121, stddev = 65.3967).
Checking special remap (float, 4-channel->mono)
func: 118101 usec (avg: 1181.01, min = 1157, max = 1383, stddev = 36.4474).
orig: 365191 usec (avg: 3651.91, min = 3540, max = 4083, stddev = 117.509).
Checking special remap (s16, stereo->mono)
func: 82908 usec (avg: 829.08, min = 795, max = 953, stddev = 33.3409).
orig: 182565 usec (avg: 1825.65, min = 1774, max = 2117, stddev = 65.5401).
Checking special remap (s16, 4-channel->mono)
func: 132025 usec (avg: 1320.25, min = 1284, max = 1509, stddev = 47.0133).
orig: 363347 usec (avg: 3633.47, min = 3560, max = 4012, stddev = 111.259).
on ARM Cortex-A8 (TI OMAP3 DM3730 @ 1GHz) (Linaro GCC 4.6):

Checking special remap (float, mono->stereo)
func: 1213562 usec (avg: 12135.6, min = 4669, max = 16266, stddev = 2067.64).
orig: 9251927 usec (avg: 92519.3, min = 87372, max = 134216, stddev = 5965.79).
Checking special remap (float, mono->4-channel)
func: 2479550 usec (avg: 24795.5, min = 7507, max = 29358, stddev = 2690.16).
orig: 13186133 usec (avg: 131861, min = 119843, max = 263855, stddev = 27309).
Checking special remap (s16, mono->stereo)
func: 471894 usec (avg: 4718.94, min = 4058, max = 9583, stddev = 1302.7).
orig: 1673826 usec (avg: 16738.3, min = 14679, max = 31342, stddev = 2271.67).
Checking special remap (s16, mono->4-channel)
func: 869508 usec (avg: 8695.08, min = 7019, max = 19165, stddev = 1866.94).
orig: 3317020 usec (avg: 33170.2, min = 29327, max = 47577, stddev = 2029.11).
Checking special remap (float, stereo->mono)
func: 4405182 usec (avg: 44051.8, min = 41443, max = 77912, stddev = 4160.54).
orig: 13245064 usec (avg: 132451, min = 125244, max = 182282, stddev = 8543.93).
Checking special remap (float, 4-channel->mono)
func: 8607974 usec (avg: 86079.7, min = 81909, max = 116608, stddev = 4311.52).
orig: 26326036 usec (avg: 263260, min = 255097, max = 312928, stddev = 10111.5).
Checking special remap (s16, stereo->mono)
func: 1209135 usec (avg: 12091.4, min = 10742, max = 16632, stddev = 1633.88).
orig: 3081515 usec (avg: 30815.2, min = 27008, max = 50537, stddev = 3124.35).
Checking special remap (s16, 4-channel->mono)
func: 1653868 usec (avg: 16538.7, min = 14648, max = 20721, stddev = 1834.52).
orig: 6017854 usec (avg: 60178.5, min = 56061, max = 89569, stddev = 4052.86).
benchmark code will be posted as follow-up patches

Signed-off-by: Peter Meerwald <pmeerw at pmeerw.net>
---
 src/pulsecore/remap.c | 128 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 128 insertions(+)

diff --git a/src/pulsecore/remap.c b/src/pulsecore/remap.c
index b2575c0..adff2a5 100644
--- a/src/pulsecore/remap.c
+++ b/src/pulsecore/remap.c
@@ -70,6 +70,114 @@ static void remap_mono_to_stereo_float32ne_c(pa_remap_t *m, float *dst, const fl
     }
 }
 
+static void remap_stereo_to_mono_s16ne_c(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
+    unsigned i;
+    /* Writes n samples to dst, each the mean of two consecutive src samples (2*n read); m is not used. */
+    for (i = n >> 2; i > 0; i--) { /* unrolled: four output samples per pass */
+        dst[0] = (src[0] + src[1])/2; /* operands promote to int before the add; '/' truncates toward zero */
+        dst[1] = (src[2] + src[3])/2;
+        dst[2] = (src[4] + src[5])/2;
+        dst[3] = (src[6] + src[7])/2;
+        src += 8;
+        dst += 4;
+    }
+    for (i = n & 3; i; i--) { /* tail: the remaining n % 4 output samples */
+        dst[0] = (src[0] + src[1])/2;
+        src += 2;
+        dst += 1;
+    }
+}
+
+static void remap_stereo_to_mono_float32ne_c(pa_remap_t *m, float *dst, const float *src, unsigned n) {
+    unsigned i;
+    /* Writes n samples to dst, each half the sum of two consecutive src samples (2*n read); m is not used. */
+    for (i = n >> 2; i > 0; i--) { /* unrolled: four output samples per pass */
+        dst[0] = (src[0] + src[1])*0.5f;
+        dst[1] = (src[2] + src[3])*0.5f;
+        dst[2] = (src[4] + src[5])*0.5f;
+        dst[3] = (src[6] + src[7])*0.5f;
+        src += 8;
+        dst += 4;
+    }
+    for (i = n & 3; i; i--) { /* tail: the remaining n % 4 output samples */
+        dst[0] = (src[0] + src[1])*0.5f;
+        src += 2;
+        dst += 1;
+    }
+}
+
+static void remap_mono_to_ch4_s16ne_c(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
+    unsigned i;
+    /* Reads n samples from src and stores each one four times in a row in dst (4*n written); m is not used. */
+    for (i = n >> 2; i; i--) { /* unrolled: four input samples per pass */
+        dst[0] = dst[1] = dst[2] = dst[3] = src[0];
+        dst[4] = dst[5] = dst[6] = dst[7] = src[1];
+        dst[8] = dst[9] = dst[10] = dst[11] = src[2];
+        dst[12] = dst[13] = dst[14] = dst[15] = src[3];
+        src += 4;
+        dst += 16;
+    }
+    for (i = n & 3; i; i--) { /* tail: the remaining n % 4 input samples */
+        dst[0] = dst[1] = dst[2] = dst[3] = src[0];
+        src++;
+        dst += 4;
+    }
+}
+
+static void remap_mono_to_ch4_float32ne_c(pa_remap_t *m, float *dst, const float *src, unsigned n) {
+    unsigned i;
+    /* Reads n samples from src and stores each one four times in a row in dst (4*n written); m is not used. */
+    for (i = n >> 2; i; i--) { /* unrolled: four input samples per pass */
+        dst[0] = dst[1] = dst[2] = dst[3] = src[0];
+        dst[4] = dst[5] = dst[6] = dst[7] = src[1];
+        dst[8] = dst[9] = dst[10] = dst[11] = src[2];
+        dst[12] = dst[13] = dst[14] = dst[15] = src[3];
+        src += 4;
+        dst += 16;
+    }
+    for (i = n & 3; i; i--) { /* tail: the remaining n % 4 input samples */
+        dst[0] = dst[1] = dst[2] = dst[3] = src[0];
+        src++;
+        dst += 4;
+    }
+}
+
+static void remap_ch4_to_mono_s16ne_c(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
+    unsigned i;
+    /* Writes n samples to dst, each the mean of four consecutive src samples (4*n read); m is not used. */
+    for (i = n >> 2; i > 0; i--) { /* unrolled: four output samples per pass */
+        dst[0] = (src[0] + src[1] + src[2] + src[3])/4; /* sum is formed in int after promotion; '/' truncates toward zero */
+        dst[1] = (src[4] + src[5] + src[6] + src[7])/4;
+        dst[2] = (src[8] + src[9] + src[10] + src[11])/4;
+        dst[3] = (src[12] + src[13] + src[14] + src[15])/4;
+        src += 16;
+        dst += 4;
+    }
+    for (i = n & 3; i; i--) { /* tail: the remaining n % 4 output samples */
+        dst[0] = (src[0] + src[1] + src[2] + src[3])/4;
+        src += 4;
+        dst += 1;
+    }
+}
+
+static void remap_ch4_to_mono_float32ne_c(pa_remap_t *m, float *dst, const float *src, unsigned n) {
+    unsigned i;
+    /* Writes n samples to dst, each a quarter of the sum of four consecutive src samples (4*n read); m is not used. */
+    for (i = n >> 2; i > 0; i--) { /* unrolled: four output samples per pass */
+        dst[0] = (src[0] + src[1] + src[2] + src[3])*0.25f;
+        dst[1] = (src[4] + src[5] + src[6] + src[7])*0.25f;
+        dst[2] = (src[8] + src[9] + src[10] + src[11])*0.25f;
+        dst[3] = (src[12] + src[13] + src[14] + src[15])*0.25f;
+        src += 16;
+        dst += 4;
+    }
+    for (i = n & 3; i; i--) { /* tail: the remaining n % 4 output samples */
+        dst[0] = (src[0] + src[1] + src[2] + src[3])*0.25f;
+        src += 4;
+        dst += 1;
+    }
+}
+
 static void remap_channels_matrix_s16ne_c(pa_remap_t *m, void *dst, const void *src, unsigned n) {
     unsigned oc, ic, i;
     unsigned n_ic, n_oc;
@@ -249,6 +357,26 @@ static void init_remap_c(pa_remap_t *m) {
         pa_log_info("Using mono to stereo remapping");
         pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_c,
             (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_c);
+    } else if (n_ic == 2 && n_oc == 1 &&
+            m->map_table_i[0][0] == 0x8000 && m->map_table_i[0][1] == 0x8000) {
+        /* Both inputs weighted 0x8000 -- presumably 0.5 in the table's fixed-point format (TODO confirm); consistent with the /2 and *0.5f averaging in the stereo-to-mono functions. */
+        pa_log_info("Using stereo to mono remapping");
+        pa_set_remap_func(m, (pa_do_remap_func_t) remap_stereo_to_mono_s16ne_c,
+            (pa_do_remap_func_t)remap_stereo_to_mono_float32ne_c);
+    } else if (n_ic == 1 && n_oc == 4 &&
+            m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000 &&
+            m->map_table_i[2][0] == 0x10000 && m->map_table_i[3][0] == 0x10000) {
+        /* Each of the four outputs takes the single input at weight 0x10000 -- presumably unity gain (TODO confirm); the mono-to-ch4 functions copy the sample unscaled. */
+        pa_log_info("Using mono to 4-channel remapping");
+        pa_set_remap_func(m, (pa_do_remap_func_t)remap_mono_to_ch4_s16ne_c,
+            (pa_do_remap_func_t) remap_mono_to_ch4_float32ne_c);
+    } else if (n_ic == 4 && n_oc == 1 &&
+            m->map_table_i[0][0] == 0x4000 && m->map_table_i[0][1] == 0x4000 &&
+            m->map_table_i[0][2] == 0x4000 && m->map_table_i[0][3] == 0x4000) {
+        /* All four inputs weighted 0x4000 -- presumably 0.25 (TODO confirm); consistent with the /4 and *0.25f averaging in the ch4-to-mono functions. */
+        pa_log_info("Using 4-channel to mono remapping");
+        pa_set_remap_func(m, (pa_do_remap_func_t) remap_ch4_to_mono_s16ne_c,
+            (pa_do_remap_func_t)remap_ch4_to_mono_float32ne_c);
     } else if (pa_setup_remap_arrange(m, arrange) && n_oc == 2) {
 
         pa_log_info("Using stereo arrange remapping");
-- 
1.7.9.5