I am trying to tune the performance of hand-unrolled code. I was wondering which command-line options I should specify in order to get h[i+1] loaded before the store to g[i].

Code:

void foo(int * __restrict g, int * __restrict h)
{
  int i;
  for (i = 0; i < 4096; i+=2) {
    g[i] = h[i] + 10;
    g[i+1] = h[i+1] + 10;
  }
}

Command line:

gcc-4.0.2 -O3 loop.c -fargument-noalias-global -fstrict-aliasing -S loop.s

Assembly code of the loop body:

.L2:
        leal    0(,%ebx,4), %eax
        leal    (%eax,%esi), %ecx
        leal    (%edi,%eax), %eax
        movl    -8(%ecx), %edx    // = h[i]
        addl    $10, %edx         // + 10
        movl    %edx, -8(%eax)    // g[i] =
        movl    -4(%ecx), %edx    // = h[i+1]
        addl    $10, %edx         // + 10
        movl    %edx, -4(%eax)    // g[i+1] =
        addl    $2, %ebx
        cmpl    $4098, %ebx
        jne     .L2

Thanks,
-Ben