Hello. I have noticed some strange behavior while writing SIMD code
with the provided SSE intrinsics. It looks like GCC is not able to
generate/optimize the same code for the first function (foo) as it
does for the second function (bar).
I was wondering how I can get the same generated code for the first
function (foo) without going to the trouble of defining and using an
auxiliary array, as the second function (bar) does.
Please consider the following piece of code:
/*******************************************************************************************************/
/*BEGIN*************************************************************************************************/
#include <xmmintrin.h>
// foo: for every row i in [0, size) and every k in 0..7, computes
//     c[i][k] = a[i][k] + d[i][k] * b[i][k]
// using the packed-float intrinsics _mm_mul_ps / _mm_add_ps.
//
// Parameters:
//   a, b, d - input arrays of rows of 8 __m128 vectors (read only)
//   c       - output array of rows of 8 __m128 vectors
//   size    - number of rows to process; 0 means the loop body never runs
//
// Each result is written directly to c[i][k] before the inputs for
// element k+1 are read (in source order).
// NOTE(review): nothing here tells the compiler that c does not overlap
// a, b or d, so presumably it must assume a store to c[i][k] can change a
// later input load and therefore keeps the load/store order - confirm.
// Restrict-qualifying the pointers is the usual thing to try.
void foo(const __m128 a[][8], const __m128 b[][8], const __m128 d[][8],
__m128 c[][8], unsigned int size)
{
for (unsigned int i = 0; i < size; i++)
{
// The row is unrolled by hand: one multiply-add and one store per element.
c[i][0] = _mm_add_ps(a[i][0], _mm_mul_ps(d[i][0], b[i][0]));
c[i][1] = _mm_add_ps(a[i][1], _mm_mul_ps(d[i][1], b[i][1]));
c[i][2] = _mm_add_ps(a[i][2], _mm_mul_ps(d[i][2], b[i][2]));
c[i][3] = _mm_add_ps(a[i][3], _mm_mul_ps(d[i][3], b[i][3]));
c[i][4] = _mm_add_ps(a[i][4], _mm_mul_ps(d[i][4], b[i][4]));
c[i][5] = _mm_add_ps(a[i][5], _mm_mul_ps(d[i][5], b[i][5]));
c[i][6] = _mm_add_ps(a[i][6], _mm_mul_ps(d[i][6], b[i][6]));
c[i][7] = _mm_add_ps(a[i][7], _mm_mul_ps(d[i][7], b[i][7]));
}
}
// bar: computes the same per-row result as the expression
//     c[i][k] = a[i][k] + d[i][k] * b[i][k]   for k in 0..7, i in [0, size)
// but in two phases per row: first all 8 results go into the local array
// cx, then cx is copied out to c[i][0..7].
//
// Parameters:
//   a, b, d - input arrays of rows of 8 __m128 vectors (read only)
//   c       - output array of rows of 8 __m128 vectors
//   size    - number of rows to process; 0 means the loop body never runs
//
// In source order, every read of a, b and d for row i happens before the
// first write to c for that row.
void bar(const __m128 a[][8], const __m128 b[][8], const __m128 d[][8],
__m128 c[][8], unsigned int size)
{
for (unsigned int i = 0; i < size; i++)
{
// Phase 1: compute the whole row into a local temporary.
__m128 cx[8];
cx[0] = _mm_add_ps(a[i][0], _mm_mul_ps(d[i][0], b[i][0]));
cx[1] = _mm_add_ps(a[i][1], _mm_mul_ps(d[i][1], b[i][1]));
cx[2] = _mm_add_ps(a[i][2], _mm_mul_ps(d[i][2], b[i][2]));
cx[3] = _mm_add_ps(a[i][3], _mm_mul_ps(d[i][3], b[i][3]));
cx[4] = _mm_add_ps(a[i][4], _mm_mul_ps(d[i][4], b[i][4]));
cx[5] = _mm_add_ps(a[i][5], _mm_mul_ps(d[i][5], b[i][5]));
cx[6] = _mm_add_ps(a[i][6], _mm_mul_ps(d[i][6], b[i][6]));
cx[7] = _mm_add_ps(a[i][7], _mm_mul_ps(d[i][7], b[i][7]));
// Phase 2: only now write the finished row to the output.
c[i][0] = cx[0];
c[i][1] = cx[1];
c[i][2] = cx[2];
c[i][3] = cx[3];
c[i][4] = cx[4];
c[i][5] = cx[5];
c[i][6] = cx[6];
c[i][7] = cx[7];
}
}
/*END***************************************************************************************************/
/*******************************************************************************************************/
This results in the following assembly:
/*******************************************************************************************************/
/*BEGIN*************************************************************************************************/
# Compiler output for foo (Intel syntax, operands are "dst, src").
# Register roles as used below; they match the System V AMD64 argument order
# of foo's parameters: rdi = a, rsi = b, rdx = d, rcx = c, r8d = size.
# eax is the loop counter i. xmm0 is the only vector register used.
foo(float __vector const (*) [8], float __vector const (*) [8], float
__vector const (*) [8], float __vector (*) [8], unsigned int):
# size == 0: skip the loop.
test r8d, r8d
je .L1
# i = 0
xor eax, eax
.L4:
# Loop body: one row per iteration (8 vectors x 16 bytes = 128 bytes per array).
# xmm0 = d[i][0]
movaps xmm0, XMMWORD PTR [rdx]
add eax, 1
# All four row pointers are advanced by 128 bytes up front, so the rest of
# the iteration addresses the current row with offsets -128 .. -16.
sub rsi, -128
sub rdx, -128
sub rdi, -128
# xmm0 *= b[i][0]
mulps xmm0, XMMWORD PTR [rsi-128]
sub rcx, -128
# xmm0 += a[i][0]
addps xmm0, XMMWORD PTR [rdi-128]
# c[i][0] is stored before d[i][1] is loaded; the same
# load / mul / add / store sequence repeats for each remaining element.
movaps XMMWORD PTR [rcx-128], xmm0
# element 1
movaps xmm0, XMMWORD PTR [rdx-112]
mulps xmm0, XMMWORD PTR [rsi-112]
addps xmm0, XMMWORD PTR [rdi-112]
movaps XMMWORD PTR [rcx-112], xmm0
# element 2
movaps xmm0, XMMWORD PTR [rdx-96]
mulps xmm0, XMMWORD PTR [rsi-96]
addps xmm0, XMMWORD PTR [rdi-96]
movaps XMMWORD PTR [rcx-96], xmm0
# element 3
movaps xmm0, XMMWORD PTR [rdx-80]
mulps xmm0, XMMWORD PTR [rsi-80]
addps xmm0, XMMWORD PTR [rdi-80]
movaps XMMWORD PTR [rcx-80], xmm0
# element 4
movaps xmm0, XMMWORD PTR [rdx-64]
mulps xmm0, XMMWORD PTR [rsi-64]
addps xmm0, XMMWORD PTR [rdi-64]
movaps XMMWORD PTR [rcx-64], xmm0
# element 5
movaps xmm0, XMMWORD PTR [rdx-48]
mulps xmm0, XMMWORD PTR [rsi-48]
addps xmm0, XMMWORD PTR [rdi-48]
movaps XMMWORD PTR [rcx-48], xmm0
# element 6
movaps xmm0, XMMWORD PTR [rdx-32]
mulps xmm0, XMMWORD PTR [rsi-32]
addps xmm0, XMMWORD PTR [rdi-32]
movaps XMMWORD PTR [rcx-32], xmm0
# element 7
movaps xmm0, XMMWORD PTR [rdx-16]
mulps xmm0, XMMWORD PTR [rsi-16]
addps xmm0, XMMWORD PTR [rdi-16]
movaps XMMWORD PTR [rcx-16], xmm0
# Loop until i == size.
cmp eax, r8d
jne .L4
.L1:
rep; ret
# Compiler output for bar (Intel syntax, operands are "dst, src").
# Register roles as used below; they match the System V AMD64 argument order
# of bar's parameters: rdi = a, rsi = b, rdx = d, rcx = c, r8d = size.
# eax is the loop counter i. xmm7 .. xmm0 hold elements 0 .. 7 of the row.
bar(float __vector const (*) [8], float __vector const (*) [8], float
__vector const (*) [8], float __vector (*) [8], unsigned int):
# size == 0: skip the loop.
test r8d, r8d
je .L6
# i = 0
xor eax, eax
.L9:
# Loop body: one row per iteration (8 vectors x 16 bytes = 128 bytes per array).
# xmm7 = d[i][0]
movaps xmm7, XMMWORD PTR [rdx]
add eax, 1
# All four row pointers are advanced by 128 bytes up front, so the rest of
# the iteration addresses the current row with offsets -128 .. -16.
sub rsi, -128
sub rdx, -128
sub rdi, -128
# Loads from d and multiplies by b are interleaved across eight registers.
movaps xmm6, XMMWORD PTR [rdx-112]
sub rcx, -128
mulps xmm7, XMMWORD PTR [rsi-128]
movaps xmm5, XMMWORD PTR [rdx-96]
mulps xmm6, XMMWORD PTR [rsi-112]
movaps xmm4, XMMWORD PTR [rdx-80]
mulps xmm5, XMMWORD PTR [rsi-96]
movaps xmm3, XMMWORD PTR [rdx-64]
mulps xmm4, XMMWORD PTR [rsi-80]
movaps xmm2, XMMWORD PTR [rdx-48]
mulps xmm3, XMMWORD PTR [rsi-64]
movaps xmm1, XMMWORD PTR [rdx-32]
mulps xmm2, XMMWORD PTR [rsi-48]
movaps xmm0, XMMWORD PTR [rdx-16]
mulps xmm1, XMMWORD PTR [rsi-32]
# Add the matching element of a to each product.
addps xmm7, XMMWORD PTR [rdi-128]
mulps xmm0, XMMWORD PTR [rsi-16]
addps xmm6, XMMWORD PTR [rdi-112]
addps xmm5, XMMWORD PTR [rdi-96]
addps xmm4, XMMWORD PTR [rdi-80]
addps xmm3, XMMWORD PTR [rdi-64]
addps xmm2, XMMWORD PTR [rdi-48]
addps xmm1, XMMWORD PTR [rdi-32]
addps xmm0, XMMWORD PTR [rdi-16]
# Every load, multiply and add for the row is done; the eight results are
# now stored to c[i][0..7]. The local array cx does not appear in memory here.
movaps XMMWORD PTR [rcx-128], xmm7
movaps XMMWORD PTR [rcx-112], xmm6
movaps XMMWORD PTR [rcx-96], xmm5
movaps XMMWORD PTR [rcx-80], xmm4
movaps XMMWORD PTR [rcx-64], xmm3
movaps XMMWORD PTR [rcx-48], xmm2
movaps XMMWORD PTR [rcx-32], xmm1
movaps XMMWORD PTR [rcx-16], xmm0
# Loop until i == size.
cmp eax, r8d
jne .L9
.L6:
rep; ret
/*END***************************************************************************************************/
/*******************************************************************************************************/
This was generated by GCC 4.8.1 on Linux at the -O2 optimization level
(http://gcc.godbolt.org).