SIMD code requiring auxiliary array for best optimization

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hello. I have noticed some strange behavior while writing SIMD code with the provided SSE intrinsics. It looks like GCC is not able to generate/optimize the first function (foo) into the same code that it produces for the second function (bar).

I was wondering how I can get the same generated code for the first function (foo) without going to the trouble of defining and using an auxiliary array, as the second function (bar) does.

Please consider the following piece of code:

/*******************************************************************************************************/
/*BEGIN*************************************************************************************************/

#include <xmmintrin.h>

/*
 * For every row i in [0, size) and every column j in [0, 8), computes
 *     c[i][j] = a[i][j] + d[i][j] * b[i][j]
 * using packed single-precision SSE arithmetic.
 *
 * Each result is written to c immediately after it is computed, i.e. before
 * the operands of the next column are read.
 */
void foo(const __m128 a[][8], const __m128 b[][8], const __m128 d[][8], __m128 c[][8], unsigned int size)
{
    for (unsigned int row = 0; row != size; ++row)
    {
        const __m128 *a_row = a[row];
        const __m128 *b_row = b[row];
        const __m128 *d_row = d[row];
        __m128 *c_row = c[row];

        for (unsigned int col = 0; col < 8; ++col)
        {
            const __m128 product = _mm_mul_ps(d_row[col], b_row[col]);

            /* Store right away; the next column is loaded after this write. */
            c_row[col] = _mm_add_ps(a_row[col], product);
        }
    }
}

/*
 * For every row i in [0, size) and every column j in [0, 8), computes
 *     c[i][j] = a[i][j] + d[i][j] * b[i][j]
 * using packed single-precision SSE arithmetic.
 *
 * All eight results of a row are first collected in a local array; only
 * after the whole row has been computed are they copied into c.
 */
void bar(const __m128 a[][8], const __m128 b[][8], const __m128 d[][8], __m128 c[][8], unsigned int size)
{
    for (unsigned int row = 0; row != size; ++row)
    {
        __m128 results[8];

        /* Phase 1: read the inputs and compute the full row into the local array. */
        for (unsigned int col = 0; col < 8; ++col)
        {
            const __m128 product = _mm_mul_ps(d[row][col], b[row][col]);

            results[col] = _mm_add_ps(a[row][col], product);
        }

        /* Phase 2: write the finished row to the output. */
        for (unsigned int col = 0; col < 8; ++col)
        {
            c[row][col] = results[col];
        }
    }
}

/*END***************************************************************************************************/
/*******************************************************************************************************/

Which results in this:

/*******************************************************************************************************/
/*BEGIN*************************************************************************************************/

foo(float __vector const (*) [8], float __vector const (*) [8], float __vector const (*) [8], float __vector (*) [8], unsigned int):
    test    r8d, r8d
    je    .L1
    xor    eax, eax
.L4:
    movaps    xmm0, XMMWORD PTR [rdx]
    add    eax, 1
    sub    rsi, -128
    sub    rdx, -128
    sub    rdi, -128
    mulps    xmm0, XMMWORD PTR [rsi-128]
    sub    rcx, -128
    addps    xmm0, XMMWORD PTR [rdi-128]
    movaps    XMMWORD PTR [rcx-128], xmm0
    movaps    xmm0, XMMWORD PTR [rdx-112]
    mulps    xmm0, XMMWORD PTR [rsi-112]
    addps    xmm0, XMMWORD PTR [rdi-112]
    movaps    XMMWORD PTR [rcx-112], xmm0
    movaps    xmm0, XMMWORD PTR [rdx-96]
    mulps    xmm0, XMMWORD PTR [rsi-96]
    addps    xmm0, XMMWORD PTR [rdi-96]
    movaps    XMMWORD PTR [rcx-96], xmm0
    movaps    xmm0, XMMWORD PTR [rdx-80]
    mulps    xmm0, XMMWORD PTR [rsi-80]
    addps    xmm0, XMMWORD PTR [rdi-80]
    movaps    XMMWORD PTR [rcx-80], xmm0
    movaps    xmm0, XMMWORD PTR [rdx-64]
    mulps    xmm0, XMMWORD PTR [rsi-64]
    addps    xmm0, XMMWORD PTR [rdi-64]
    movaps    XMMWORD PTR [rcx-64], xmm0
    movaps    xmm0, XMMWORD PTR [rdx-48]
    mulps    xmm0, XMMWORD PTR [rsi-48]
    addps    xmm0, XMMWORD PTR [rdi-48]
    movaps    XMMWORD PTR [rcx-48], xmm0
    movaps    xmm0, XMMWORD PTR [rdx-32]
    mulps    xmm0, XMMWORD PTR [rsi-32]
    addps    xmm0, XMMWORD PTR [rdi-32]
    movaps    XMMWORD PTR [rcx-32], xmm0
    movaps    xmm0, XMMWORD PTR [rdx-16]
    mulps    xmm0, XMMWORD PTR [rsi-16]
    addps    xmm0, XMMWORD PTR [rdi-16]
    movaps    XMMWORD PTR [rcx-16], xmm0
    cmp    eax, r8d
    jne    .L4
.L1:
    rep; ret
bar(float __vector const (*) [8], float __vector const (*) [8], float __vector const (*) [8], float __vector (*) [8], unsigned int):
    test    r8d, r8d
    je    .L6
    xor    eax, eax
.L9:
    movaps    xmm7, XMMWORD PTR [rdx]
    add    eax, 1
    sub    rsi, -128
    sub    rdx, -128
    sub    rdi, -128
    movaps    xmm6, XMMWORD PTR [rdx-112]
    sub    rcx, -128
    mulps    xmm7, XMMWORD PTR [rsi-128]
    movaps    xmm5, XMMWORD PTR [rdx-96]
    mulps    xmm6, XMMWORD PTR [rsi-112]
    movaps    xmm4, XMMWORD PTR [rdx-80]
    mulps    xmm5, XMMWORD PTR [rsi-96]
    movaps    xmm3, XMMWORD PTR [rdx-64]
    mulps    xmm4, XMMWORD PTR [rsi-80]
    movaps    xmm2, XMMWORD PTR [rdx-48]
    mulps    xmm3, XMMWORD PTR [rsi-64]
    movaps    xmm1, XMMWORD PTR [rdx-32]
    mulps    xmm2, XMMWORD PTR [rsi-48]
    movaps    xmm0, XMMWORD PTR [rdx-16]
    mulps    xmm1, XMMWORD PTR [rsi-32]
    addps    xmm7, XMMWORD PTR [rdi-128]
    mulps    xmm0, XMMWORD PTR [rsi-16]
    addps    xmm6, XMMWORD PTR [rdi-112]
    addps    xmm5, XMMWORD PTR [rdi-96]
    addps    xmm4, XMMWORD PTR [rdi-80]
    addps    xmm3, XMMWORD PTR [rdi-64]
    addps    xmm2, XMMWORD PTR [rdi-48]
    addps    xmm1, XMMWORD PTR [rdi-32]
    addps    xmm0, XMMWORD PTR [rdi-16]
    movaps    XMMWORD PTR [rcx-128], xmm7
    movaps    XMMWORD PTR [rcx-112], xmm6
    movaps    XMMWORD PTR [rcx-96], xmm5
    movaps    XMMWORD PTR [rcx-80], xmm4
    movaps    XMMWORD PTR [rcx-64], xmm3
    movaps    XMMWORD PTR [rcx-48], xmm2
    movaps    XMMWORD PTR [rcx-32], xmm1
    movaps    XMMWORD PTR [rcx-16], xmm0
    cmp    eax, r8d
    jne    .L9
.L6:
    rep; ret

/*END***************************************************************************************************/
/*******************************************************************************************************/

This was generated by GCC 4.8.1 on Linux at the -O2 optimization level, using the online compiler at http://gcc.godbolt.org.




[Index of Archives]     [Linux C Programming]     [Linux Kernel]     [eCos]     [Fedora Development]     [Fedora Announce]     [Autoconf]     [The DWARVES Debugging Tools]     [Yosemite Campsites]     [Yosemite News]     [Linux GCC]

  Powered by Linux