Hi gcc-help, I have a question regarding an optimization example for the Sandy Bridge microarchitecture. In the Intel 64 and IA-32 Architectures Optimization Reference Manual [1] on page 183, sec 3.6.1.2, there is a piece about optimizing L1D cache latency with respect to utilizing the double load ports. In example 3-32, assembly from Intel's compiler is shown. I'm trying to reproduce this with Debian Wheezy's packaged gcc (gcc version 4.6.2 (Debian 4.6.2-12)). I get assembly which is very similar, but it's not taking advantage of the double load ports. Is there any way to turn this on in my gcc version, or is it not implemented yet? Code, b.c: #define BUFF_SIZE 1024 int main() { int buff[BUFF_SIZE]; int sum = 0; int i; for(i = 0; i < BUFF_SIZE; i++) { sum+=buff[i]; } return sum; } The closest-matching compile options I've found: gcc -march=corei7 -msse -O3 -funroll-loops -g -o b b.c (or, to really target Sandy Bridge, but disable the AVX encoding because that's not what I want: gcc -march=corei7-avx -mno-avx -msse -O3 -funroll-loops -g -o b b.c ) Assembly, in both cases: <snip> 00000000004003b0 <main>: #define BUFF_SIZE 1024 int main() { 4003b0: 48 81 ec 90 0f 00 00 sub $0xf90,%rsp 4003b7: 66 0f ef c0 pxor %xmm0,%xmm0 4003bb: 48 8d 44 24 88 lea -0x78(%rsp),%rax 4003c0: 48 8d 94 24 88 0f 00 lea 0xf88(%rsp),%rdx 4003c7: 00 4003c8: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1) 4003cf: 00 int buff[BUFF_SIZE]; int sum = 0; int i; for(i = 0; i < BUFF_SIZE; i++) { sum+=buff[i]; 4003d0: 66 0f fe 00 paddd (%rax),%xmm0 4003d4: 66 0f fe 40 10 paddd 0x10(%rax),%xmm0 4003d9: 66 0f fe 40 20 paddd 0x20(%rax),%xmm0 4003de: 66 0f fe 40 30 paddd 0x30(%rax),%xmm0 4003e3: 66 0f fe 40 40 paddd 0x40(%rax),%xmm0 4003e8: 66 0f fe 40 50 paddd 0x50(%rax),%xmm0 4003ed: 66 0f fe 40 60 paddd 0x60(%rax),%xmm0 4003f2: 66 0f fe 40 70 paddd 0x70(%rax),%xmm0 4003f7: 48 83 e8 80 sub $0xffffffffffffff80,%rax 4003fb: 48 39 d0 cmp %rdx,%rax 4003fe: 75 d0 jne 4003d0 <main+0x20> 400400: 66 0f 6f d0 movdqa %xmm0,%xmm2 }
return sum; } 400404: 48 81 c4 90 0f 00 00 add $0xf90,%rsp int buff[BUFF_SIZE]; int sum = 0; int i; for(i = 0; i < BUFF_SIZE; i++) { sum+=buff[i]; 40040b: 66 0f 73 da 08 psrldq $0x8,%xmm2 400410: 66 0f fe c2 paddd %xmm2,%xmm0 400414: 66 0f 6f c8 movdqa %xmm0,%xmm1 400418: 66 0f 73 d9 04 psrldq $0x4,%xmm1 } return sum; 40041d: 66 0f fe c1 paddd %xmm1,%xmm0 400421: 66 0f 7e c0 movd %xmm0,%eax } 400425: c3 retq 400426: 90 nop 400427: 90 nop <snip> Best regards, Martin [1] http://www.intel.com/content/dam/doc/manual/64-ia-32-architectures-optimization-manual.pdf