Hello all again,

I now have a question regarding prefetch instruction generation for x86_64. The option -fprefetch-loop-arrays generates prefetch instructions for non-vectorized code, but not for vectorized code. Is that the intended behaviour? Is there a way to get prefetching for the vectorized routines as well?

Thanks,
Thomas

Example (compiled with g++-4.5.0):

g++ -O3 -fcx-fortran-rules -fprefetch-loop-arrays -mtune=core2 -march=core2 -mssse3 -S -c ../test_loop.cpp

The code for a complex multiplication loop written this way:

#include <complex>
#include <cstddef>

// N = 2097152 elements, matching the cmpq $2097152 loop bound below
const std::size_t N = 2097152;

void f(std::complex<float> *a, std::complex<float> *b, std::complex<float> *r)
{
    for (std::size_t s = 0; s < N; s++)
        r[s] = a[s]*b[s];
}

is generated in two versions, one vectorized (.L3) and one not (.L5). Note the prefetcht0 in the scalar loop at .L5 and the absence of any prefetch in the vectorized loop at .L3:

.LFB1292:
        leaq    32(%rdi), %rcx
        leaq    32(%rdx), %rax
        cmpq    %rcx, %rdx
        seta    %r8b
        cmpq    %rdi, %rax
        setb    %cl
        orb     %r8b, %cl
        jne     .L9
.L2:
        movq    %rdx, %rax
        xorl    %edx, %edx
        .p2align 4,,10
        .p2align 3
.L5:
        movss   (%rdi), %xmm4
        movss   4(%rdi), %xmm3
        movss   (%rsi), %xmm2
        movss   4(%rsi), %xmm0
        prefetcht0      80(%rdi)
        movaps  %xmm4, %xmm1
        movaps  %xmm3, %xmm5
        mulss   %xmm2, %xmm1
        mulss   %xmm0, %xmm5
        mulss   %xmm3, %xmm2
        subss   %xmm5, %xmm1
        mulss   %xmm4, %xmm0
        movss   %xmm1, (%rax)
        addss   %xmm2, %xmm0
        incq    %rdx
        movss   %xmm0, 4(%rax)
        addq    $8, %rdi
        addq    $8, %rsi
        addq    $8, %rax
        cmpq    $2097152, %rdx
        jne     .L5
        rep
        ret
.L9:
        leaq    32(%rsi), %rcx
        cmpq    %rcx, %rdx
        seta    %cl
        cmpq    %rax, %rsi
        seta    %al
        orb     %al, %cl
        je      .L2
        movq    %rdx, %rax
        xorps   %xmm3, %xmm3
        leaq    16777216(%rdi), %rdx
        .p2align 4,,10
        .p2align 3
.L3:
        movaps  %xmm3, %xmm2
        movaps  %xmm3, %xmm1
        movlps  (%rdi), %xmm2
        movlps  16(%rdi), %xmm1
        movhps  8(%rdi), %xmm2
        movhps  24(%rdi), %xmm1
        movaps  %xmm2, %xmm0
        movaps  %xmm2, %xmm5
        shufps  $136, %xmm1, %xmm0
        shufps  $221, %xmm1, %xmm5
        movaps  %xmm3, %xmm2
        movaps  %xmm3, %xmm1
        movlps  (%rsi), %xmm2
        movlps  16(%rsi), %xmm1
        movhps  8(%rsi), %xmm2
        movhps  24(%rsi), %xmm1
        movaps  %xmm2, %xmm4
        movaps  %xmm5, %xmm6
        shufps  $221, %xmm1, %xmm2
        shufps  $136, %xmm1, %xmm4
        mulps   %xmm2, %xmm6
        movaps  %xmm0, %xmm1
        addq    $32, %rdi
        mulps   %xmm4, %xmm1
        mulps   %xmm2, %xmm0
        subps   %xmm6, %xmm1
        mulps   %xmm5, %xmm4
        movaps  %xmm1, %xmm2
        addps   %xmm4, %xmm0
        addq    $32, %rsi
        unpcklps        %xmm0, %xmm2
        unpckhps        %xmm0, %xmm1
        movlps  %xmm2, (%rax)
        movhps  %xmm2, 8(%rax)
        movlps  %xmm1, 16(%rax)
        movhps  %xmm1, 24(%rax)
        addq    $32, %rax
        cmpq    %rdx, %rdi
        jne     .L3
        rep
        ret
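
For reference, the only workaround I can think of so far is to issue the prefetches by hand with GCC's __builtin_prefetch instead of relying on -fprefetch-loop-arrays. A minimal sketch (untested; the distance of 8 elements, i.e. one 64-byte cache line ahead, is a guess that would need tuning, and I don't know whether the builtin in the loop body gets in the way of the vectorizer):

#include <complex>
#include <cstddef>

const std::size_t N = 2097152;

void f_prefetch(std::complex<float> *a, std::complex<float> *b,
                std::complex<float> *r)
{
    for (std::size_t s = 0; s < N; s++) {
        // prefetch for read (0) with high temporal locality (3), which
        // should map to prefetcht0; the hint is harmless even when it
        // runs past the end of the arrays
        __builtin_prefetch(&a[s + 8], 0, 3);
        __builtin_prefetch(&b[s + 8], 0, 3);
        r[s] = a[s] * b[s];
    }
}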
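
Alternatively I could hand-vectorize the loop with SSE intrinsics and place the prefetch myself, roughly what I had hoped the compiler would emit on its own. A sketch under several assumptions: 16-byte-aligned pointers, an element count that is a multiple of 2, the usual interleaved (re, im) layout of std::complex<float>, SSE3 for _mm_addsub_ps (implied by -mssse3), and again a guessed prefetch distance:

#include <cstddef>
#include <pmmintrin.h>  // SSE3 _mm_addsub_ps; also pulls in _mm_prefetch

// n is the number of complex values; a, b, r point at the interleaved
// float data, processed two complex numbers (one __m128) per iteration
void f_sse(const float *a, const float *b, float *r, std::size_t n)
{
    for (std::size_t s = 0; s < 2 * n; s += 4) {
        _mm_prefetch((const char *)(a + s + 16), _MM_HINT_T0);
        _mm_prefetch((const char *)(b + s + 16), _MM_HINT_T0);
        __m128 va = _mm_load_ps(a + s);                              // ar0 ai0 ar1 ai1
        __m128 vb = _mm_load_ps(b + s);                              // br0 bi0 br1 bi1
        __m128 re = _mm_shuffle_ps(va, va, _MM_SHUFFLE(2, 2, 0, 0)); // ar0 ar0 ar1 ar1
        __m128 im = _mm_shuffle_ps(va, va, _MM_SHUFFLE(3, 3, 1, 1)); // ai0 ai0 ai1 ai1
        __m128 sw = _mm_shuffle_ps(vb, vb, _MM_SHUFFLE(2, 3, 0, 1)); // bi0 br0 bi1 br1
        // (ar*br - ai*bi) + i*(ar*bi + ai*br): addsub subtracts in the
        // even (real) lanes and adds in the odd (imaginary) lanes
        _mm_store_ps(r + s, _mm_addsub_ps(_mm_mul_ps(re, vb),
                                          _mm_mul_ps(im, sw)));
    }
}

But of course I'd much rather have -fprefetch-loop-arrays do this for the vectorized loop automatically.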