Hello,
I am trying to get GCC to generate MMX code, but have had no luck so far.
Here is the code I am compiling:
/* GCC vector-extension types; vector_size is given in bytes. */
/* v2si: 8-byte vector of int. */
typedef int v2si __attribute__ ((vector_size (8)));
/* v4si: 16-byte vector of int. */
typedef int v4si __attribute__ ((vector_size (16)));
/* Returns the element-wise sum of the two 8-byte int vectors a and b. */
v2si foo(v2si a, v2si b)
{
/* '+' on vector types is GCC's generic vector addition (one add per element). */
return a+b;
}
/* Returns the element-wise sum of the two 16-byte int vectors a and b. */
v4si bar(v4si a, v4si b)
{
/* Same generic vector '+' as in foo, applied to the 16-byte type. */
return a+b;
}
/* Entry point: calls neither foo nor bar and simply returns 0. */
int main()
{
return 0;
}
When compiling with GCC 4.5 using "gcc -mmmx -msse4.2 -O2 -S", the function bar
nicely uses SSE instructions, but the code generated for foo looks pretty bad.
I expected a single paddd instruction; here is what I get instead:
# Compiler output for foo (AT&T syntax, 32-bit x86).
# The two 64-bit inputs are read from %mm0 and %mm1 and the 64-bit result is
# left in %mm0 -- presumably a, b and the return value respectively (the
# listing itself does not show the calling convention; confirm against the ABI).
# Each input is spilled to the stack, split into two 32-bit halves, the halves
# are added with scalar integer instructions, and the two sums are reassembled
# through %xmm0 before being moved back into %mm0.
foo:
pushl %ebp                    # set up frame pointer
movl %esp, %ebp
subl $64, %esp                # reserve 64 bytes of stack
movq %mm0, -56(%ebp)          # spill first input (mm0) to the scratch slot at -56
movl %ebx, -12(%ebp)          # save ebx (reloaded below from -12)
movl -56(%ebp), %ebx          # ebx = low dword of first input
movl %esi, -8(%ebp)           # save esi (reloaded below from -8)
movl -52(%ebp), %esi          # esi = high dword of first input
movq %mm1, -56(%ebp)          # spill second input (mm1), reusing the same scratch slot
movl -56(%ebp), %ecx          # ecx = low dword of second input
movl %edi, -4(%ebp)           # save edi (reloaded below from -4)
movl -52(%ebp), %edi          # edi = high dword of second input
movl %ebx, -48(%ebp)          # copy of first input, low dword (not read again in this listing)
movl %esi, -44(%ebp)          # copy of first input, high dword (used by the addl below)
movl -8(%ebp), %esi           # restore esi
leal (%ecx,%ebx), %eax        # eax = low(second) + low(first)
movl -12(%ebp), %ebx          # restore ebx
movl %eax, -56(%ebp)          # store the low sum so movd can load it
movl %edi, -36(%ebp)          # copy of second input, high dword
movl -36(%ebp), %eax          # eax = high dword of second input (reloaded from the copy)
movd -56(%ebp), %xmm0         # xmm0 dword 0 = low sum
addl -44(%ebp), %eax          # eax = high(second) + high(first)
movl %ecx, -40(%ebp)          # copy of second input, low dword (not read again in this listing)
movl -4(%ebp), %edi           # restore edi
pinsrd $0x1, %eax, %xmm0      # xmm0 dword 1 = high sum
movq %xmm0, -64(%ebp)         # store the 64-bit result to the stack (not read again in this listing)
movdq2q %xmm0, %mm0           # mm0 = low 64 bits of xmm0, i.e. both sums
movl %ebp, %esp               # tear down frame
popl %ebp
ret
From the dumps produced by -fdump-tree-all, it seems that the veclower pass
splits the 64-bit vector operations into scalar ones.
Any idea what I am missing?
Thanks,
--
Erven.