Initializing a vector to zero leads to less efficient assemblies than manually assigning a vector to zero?

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi all,

I tried to compile the following two code snippets with "--std=c++14 -mavx2 -O3" options:

    double tmp_values[4] = {0};

and

    double tmp_values[4];

    for (auto i = 0; i < 4; ++i) {
        tmp_values[i] = 0.0; 
    }

The first code snippet leads to

    vmovaps XMMWORD PTR [rsp], xmm0
    vmovaps XMMWORD PTR [rsp+16], xmm0

But the second leads to only

    vmovapd YMMWORD PTR [rsp], ymm0

That is, the first version (two 128-bit XMM stores) looks less efficient than the second (a single 256-bit YMM store). Am I missing something?

For the full code, see this godbolt link: https://godbolt.org/z/jonf72 , and I paste the full input and output below:

Input code

#include <cstring>

// Variant 1: zero the scratch buffer with aggregate initialization.
//
// Copies `count` doubles from `ptr` into a zeroed 4-element buffer and
// returns the sum of all four slots.
//
// No bounds check is done: `count` must be in [0, 4], otherwise the byte
// count passed to memcpy exceeds the size of `tmp_values`.
double loadu1(const void* ptr, int count) {

    // `= {0}` sets element 0 explicitly; the remaining elements are
    // value-initialized, so all four doubles start at 0.0.
    double tmp_values[4] = {0};

    std::memcpy(
        tmp_values,
        ptr,
        count * sizeof(double));
    return tmp_values[0] + tmp_values[1] + tmp_values[2] + tmp_values[3];
}


// Variant 2: zero the scratch buffer with an explicit element-wise loop.
//
// Copies `count` doubles from `ptr` into a zeroed 4-element buffer and
// returns the sum of all four slots.
//
// No bounds check is done: `count` must be in [0, 4], otherwise the byte
// count passed to memcpy exceeds the size of `tmp_values`.
double loadu2(const void* ptr, int count) {

    // Declared without an initializer; the loop below assigns every element.
    double tmp_values[4];

    // Assign 0.0 to each of the four slots before the copy.
    for (auto i = 0; i < 4; ++i) {
        tmp_values[i] = 0.0; 
    }

    std::memcpy(
        tmp_values,
        ptr,
        count * sizeof(double));
    return tmp_values[0] + tmp_values[1] + tmp_values[2] + tmp_values[3];
}


Output assemblies:

loadu1(void const*, int):
        sub     rsp, 40
        movsx   rdx, esi
        vpxor   xmm0, xmm0, xmm0
        mov     rsi, rdi
        sal     rdx, 3
        mov     rdi, rsp
        vmovaps XMMWORD PTR [rsp], xmm0
        vmovaps XMMWORD PTR [rsp+16], xmm0
        call    memcpy
        vmovsd  xmm0, QWORD PTR [rsp]
        vaddsd  xmm0, xmm0, QWORD PTR [rsp+8]
        vaddsd  xmm0, xmm0, QWORD PTR [rsp+16]
        vaddsd  xmm0, xmm0, QWORD PTR [rsp+24]
        add     rsp, 40
        ret
loadu2(void const*, int):
        push    rbp
        movsx   rdx, esi
        vxorpd  xmm0, xmm0, xmm0
        mov     rsi, rdi
        sal     rdx, 3
        mov     rbp, rsp
        and     rsp, -32
        sub     rsp, 32
        mov     rdi, rsp
        vmovapd YMMWORD PTR [rsp], ymm0
        vzeroupper
        call    memcpy
        vmovsd  xmm0, QWORD PTR [rsp]
        vaddsd  xmm0, xmm0, QWORD PTR [rsp+8]
        vaddsd  xmm0, xmm0, QWORD PTR [rsp+16]
        vaddsd  xmm0, xmm0, QWORD PTR [rsp+24]
        leave
        ret

Thanks!
Hong





[Index of Archives]     [Linux C Programming]     [Linux Kernel]     [eCos]     [Fedora Development]     [Fedora Announce]     [Autoconf]     [The DWARVES Debugging Tools]     [Yosemite Campsites]     [Yosemite News]     [Linux GCC]

  Powered by Linux