After adding an SSE path to a C++ ray tracer program I noticed some parts of the program run at almost 2x the speed when I compile using the "-pg" profiling flag. Before these additions "-pg" produced a performance hit in the total program execution time, as expected. Clean code for vectorization is achieved through a C++ class that implements arithmetic operator overloading over the basic SSE unit __m128, as follows: class f32x4 { const f32x4& operator+=(const f32x4& v) { m = _mm_add_ps(m, v.m); return(*this); } /*... overload more operators ... */ union { __m128 m; float f[4]; }; }; Here is an example of a function that takes almost 2x less time to execute: /* C++ */ f32x4b pintersect4(f32x4& a, p3_f32x4& raydir, Vector3& cameyepos, p3_f32x4& hitpoint, f32x4& distance, p3_f32x4& normal) { f32x4 b = Dot(raydir, bpart); f32x4 D = b*b - a*c; // If none of the rays can intersect the sphere then stop f32x4b mask = D > 0.0f; if (ForWhich(mask) == 0) return mask; D=Sqrt(D); f32x4 t = -0.5f*((b+D)/a); // If sphere center is in front of camera surface mask = mask && t > 1.0f; if (ForWhich(mask) == 0) return mask; distance=a*t; // Should be sqrt(a) for real distance hitpoint=(t*raydir) + cameyepos; normal=(hitpoint - pos) / rad; return mask; } I had the same result with both GCC 4.3.2 and 4.4.1. The program is compiled with "-O3 -march=core2" and lowering the -O level doesn't affect the behaviour. The speed difference is measured by monitoring the x86 rdtsc() cycle count while the CPU frequency is kept constant. This is an x86_64 platform. I am seeking advice on what could cause this. Here is the assembly output of that function with profiling enabled. The main difference I can notice between this and the one without profiling is that different addresses are being used when loading the XMM registers. 
_ZN6Sphere11pintersect4ERN6veclib5f32x4ERN5n_std9cvalarrayIS1_Lm3EEERNS4_IfLm3EEES6_S2_S6_: .LFB4330: .cfi_startproc .cfi_personality 0x3,__gxx_personality_v0 pushq %rbp .cfi_def_cfa_offset 16 movq %rsp, %rbp .cfi_offset 6, -16 .cfi_def_cfa_register 6 subq $368, %rsp call mcount movss 184(%rdi), %xmm2 movss 188(%rdi), %xmm0 movss 192(%rdi), %xmm3 movss 8(%rdx), %xmm1 movss 4(%rdx), %xmm4 leaq 16(%rdx), %r10 mulss %xmm0, %xmm4 mulss %xmm3, %xmm1 movq 16(%rbp), %rax addss %xmm4, %xmm1 movss (%rdx), %xmm4 mulss %xmm2, %xmm4 addss %xmm4, %xmm1 movss %xmm1, -16(%rbp) movss 8(%r10), %xmm1 movss 4(%r10), %xmm4 mulss %xmm3, %xmm1 mulss %xmm0, %xmm4 leaq 32(%rdx), %r10 addss %xmm4, %xmm1 movss 16(%rdx), %xmm4 mulss %xmm2, %xmm4 addss %xmm4, %xmm1 movss %xmm1, -12(%rbp) movss 8(%r10), %xmm1 movss 4(%r10), %xmm4 mulss %xmm3, %xmm1 mulss %xmm0, %xmm4 leaq 48(%rdx), %r10 addss %xmm4, %xmm1 movss 32(%rdx), %xmm4 mulss %xmm2, %xmm4 addss %xmm4, %xmm1 movss %xmm1, -8(%rbp) mulss 8(%r10), %xmm3 mulss 4(%r10), %xmm0 mulss 48(%rdx), %xmm2 addss %xmm3, %xmm0 xorps %xmm3, %xmm3 addss %xmm2, %xmm0 movss %xmm0, -4(%rbp) xorps %xmm0, %xmm0 movaps (%rsi), %xmm4 movss 180(%rdi), %xmm2 movlps -16(%rbp), %xmm0 shufps $0, %xmm2, %xmm2 movhps -8(%rbp), %xmm0 mulps %xmm4, %xmm2 movaps %xmm0, %xmm1 mulps %xmm0, %xmm1 subps %xmm2, %xmm1 movaps %xmm3, %xmm2 cmpltps %xmm1, %xmm2 movlps %xmm2, -96(%rbp) movhps %xmm2, -88(%rbp) movq -96(%rbp), %r10 movq -88(%rbp), %rsi movd %r10, %xmm5 shufps $0xe4, %xmm3, %xmm5 movaps %xmm5, %xmm2 movd %rsi, %xmm5 movlhps %xmm5, %xmm2 movmskps %xmm2, %r11d testl %r11d, %r11d je .L98 sqrtps %xmm1, %xmm1 addps %xmm0, %xmm1 movaps .LC10(%rip), %xmm0 divps %xmm4, %xmm1 mulps .LC9(%rip), %xmm1 cmpltps %xmm1, %xmm0 movlps %xmm0, -144(%rbp) movhps %xmm0, -136(%rbp) movq -144(%rbp), %rsi movq %rsi, -48(%rbp) movq -136(%rbp), %rsi movq %rsi, -40(%rbp) andps -48(%rbp), %xmm2 movlps %xmm2, -192(%rbp) movhps %xmm2, -184(%rbp) movq -192(%rbp), %r10 movq -184(%rbp), %rsi movd 
%r10, %xmm2 movd %rsi, %xmm5 shufps $0xe4, %xmm3, %xmm2 movq %r10, -32(%rbp) movaps %xmm2, %xmm0 movq %rsi, -24(%rbp) movlhps %xmm5, %xmm0 movmskps %xmm0, %r11d testl %r11d, %r11d je .L98 mulps %xmm1, %xmm4 movaps %xmm4, (%r9) mulps (%rdx), %xmm1 movss (%rcx), %xmm0 leaq -256(%rbp), %rdx shufps $0, %xmm0, %xmm0 cmpq %rdx, %r8 addps %xmm0, %xmm1 je .L87 .L88: movaps %xmm1, (%r8) movaps %xmm3, %xmm0 movlps -240(%rbp), %xmm0 movhps -232(%rbp), %xmm0 movlps %xmm0, 16(%r8) movhps %xmm0, 24(%r8) movlps -224(%rbp), %xmm3 movhps -216(%rbp), %xmm3 movlps %xmm3, 32(%r8) movhps %xmm3, 40(%r8) .L87: .L90: .L92: movss 8(%rdi), %xmm0 movss 140(%rdi), %xmm2 shufps $0, %xmm0, %xmm0 shufps $0, %xmm2, %xmm2 subps %xmm0, %xmm1 leaq -352(%rbp), %rdx divps %xmm2, %xmm1 xorps %xmm0, %xmm0 movlps %xmm1, -352(%rbp) movlps -288(%rbp), %xmm0 movhps %xmm1, -344(%rbp) movhps -280(%rbp), %xmm0 cmpq %rdx, %rax divps %xmm2, %xmm0 movlps %xmm0, -336(%rbp) movhps %xmm0, -328(%rbp) xorps %xmm0, %xmm0 movlps -272(%rbp), %xmm0 movhps -264(%rbp), %xmm0 divps %xmm2, %xmm0 movlps %xmm0, -320(%rbp) movhps %xmm0, -312(%rbp) je .L94 .L95: xorps %xmm0, %xmm0 movlps -352(%rbp), %xmm0 movhps -344(%rbp), %xmm0 movlps %xmm0, (%rax) movhps %xmm0, 8(%rax) xorps %xmm0, %xmm0 movlps -336(%rbp), %xmm0 movhps -328(%rbp), %xmm0 movlps %xmm0, 16(%rax) movhps %xmm0, 24(%rax) xorps %xmm0, %xmm0 movlps -320(%rbp), %xmm0 movhps -312(%rbp), %xmm0 movlps %xmm0, 32(%rax) movhps %xmm0, 40(%rax) .L94: .L96: movq -32(%rbp), %rax movq %rax, -64(%rbp) movq -24(%rbp), %rax movq -64(%rbp), %xmm0 movq %rax, -56(%rbp) movd %rax, %xmm1 leave ret .p2align 4,,10 .p2align 3 .L98: movq %r10, -64(%rbp) movq %rsi, -56(%rbp) movd %r10, %xmm0 movd %rsi, %xmm1 leave ret .cfi_endproc And here is the same thing without profiling: _ZN6Sphere11pintersect4ERN6veclib5f32x4ERN5n_std9cvalarrayIS1_Lm3EEERNS4_IfLm3EEES6_S2_S6_: .LFB4330: .cfi_startproc .cfi_personality 0x3,__gxx_personality_v0 subq $256, %rsp .cfi_def_cfa_offset 264 leaq 16(%rdx), %r10 
movss 8(%rdx), %xmm1 movss 4(%rdx), %xmm4 movss 184(%rdi), %xmm2 movss 188(%rdi), %xmm0 movss 192(%rdi), %xmm3 mulss %xmm0, %xmm4 mulss %xmm3, %xmm1 movq 264(%rsp), %rax addss %xmm4, %xmm1 movss (%rdx), %xmm4 mulss %xmm2, %xmm4 addss %xmm4, %xmm1 movss %xmm1, 232(%rsp) movss 8(%r10), %xmm1 movss 4(%r10), %xmm4 mulss %xmm3, %xmm1 mulss %xmm0, %xmm4 leaq 32(%rdx), %r10 addss %xmm4, %xmm1 movss 16(%rdx), %xmm4 mulss %xmm2, %xmm4 addss %xmm4, %xmm1 movss %xmm1, 236(%rsp) movss 8(%r10), %xmm1 movss 4(%r10), %xmm4 mulss %xmm3, %xmm1 mulss %xmm0, %xmm4 leaq 48(%rdx), %r10 addss %xmm4, %xmm1 movss 32(%rdx), %xmm4 mulss %xmm2, %xmm4 addss %xmm4, %xmm1 movss %xmm1, 240(%rsp) mulss 8(%r10), %xmm3 mulss 4(%r10), %xmm0 mulss 48(%rdx), %xmm2 addss %xmm3, %xmm0 xorps %xmm3, %xmm3 addss %xmm2, %xmm0 movss 180(%rdi), %xmm2 movss %xmm0, 244(%rsp) shufps $0, %xmm2, %xmm2 movaps (%rsi), %xmm4 xorps %xmm0, %xmm0 mulps %xmm4, %xmm2 movlps 232(%rsp), %xmm0 movhps 240(%rsp), %xmm0 movaps %xmm0, %xmm1 mulps %xmm0, %xmm1 subps %xmm2, %xmm1 movaps %xmm3, %xmm2 cmpltps %xmm1, %xmm2 movlps %xmm2, 152(%rsp) movhps %xmm2, 160(%rsp) movq 152(%rsp), %r10 movq 160(%rsp), %rsi movd %r10, %xmm5 shufps $0xe4, %xmm3, %xmm5 movaps %xmm5, %xmm2 movd %rsi, %xmm5 movlhps %xmm5, %xmm2 movmskps %xmm2, %r11d testl %r11d, %r11d je .L98 sqrtps %xmm1, %xmm1 addps %xmm0, %xmm1 movaps .LC10(%rip), %xmm0 divps %xmm4, %xmm1 mulps .LC9(%rip), %xmm1 cmpltps %xmm1, %xmm0 movlps %xmm0, 104(%rsp) movhps %xmm0, 112(%rsp) movq 104(%rsp), %rsi movq %rsi, 200(%rsp) movq 112(%rsp), %rsi movq %rsi, 208(%rsp) andps 200(%rsp), %xmm2 movlps %xmm2, 56(%rsp) movhps %xmm2, 64(%rsp) movq 56(%rsp), %r10 movq 64(%rsp), %rsi movd %r10, %xmm2 movd %rsi, %xmm5 shufps $0xe4, %xmm3, %xmm2 movq %r10, 216(%rsp) movaps %xmm2, %xmm0 movq %rsi, 224(%rsp) movlhps %xmm5, %xmm0 movmskps %xmm0, %r11d testl %r11d, %r11d je .L98 mulps %xmm1, %xmm4 movaps %xmm4, (%r9) mulps (%rdx), %xmm1 movss (%rcx), %xmm0 leaq -8(%rsp), %rdx shufps $0, %xmm0, %xmm0 
cmpq %rdx, %r8 addps %xmm0, %xmm1 je .L87 .L88: movaps %xmm3, %xmm0 movaps %xmm1, (%r8) movlps 8(%rsp), %xmm0 movlps 24(%rsp), %xmm3 movhps 16(%rsp), %xmm0 movhps 32(%rsp), %xmm3 movlps %xmm0, 16(%r8) movhps %xmm0, 24(%r8) movlps %xmm3, 32(%r8) movhps %xmm3, 40(%r8) .L87: .L90: .L92: movss 8(%rdi), %xmm0 movss 140(%rdi), %xmm2 shufps $0, %xmm0, %xmm0 shufps $0, %xmm2, %xmm2 subps %xmm0, %xmm1 leaq -104(%rsp), %rdx divps %xmm2, %xmm1 xorps %xmm0, %xmm0 movlps %xmm1, -104(%rsp) movlps -40(%rsp), %xmm0 movhps %xmm1, -96(%rsp) movhps -32(%rsp), %xmm0 cmpq %rdx, %rax divps %xmm2, %xmm0 movlps %xmm0, -88(%rsp) movhps %xmm0, -80(%rsp) xorps %xmm0, %xmm0 movlps -24(%rsp), %xmm0 movhps -16(%rsp), %xmm0 divps %xmm2, %xmm0 movlps %xmm0, -72(%rsp) movhps %xmm0, -64(%rsp) je .L94 .L95: xorps %xmm0, %xmm0 movlps -104(%rsp), %xmm0 movhps -96(%rsp), %xmm0 movlps %xmm0, (%rax) movhps %xmm0, 8(%rax) xorps %xmm0, %xmm0 movlps -88(%rsp), %xmm0 movhps -80(%rsp), %xmm0 movlps %xmm0, 16(%rax) movhps %xmm0, 24(%rax) xorps %xmm0, %xmm0 movlps -72(%rsp), %xmm0 movhps -64(%rsp), %xmm0 movlps %xmm0, 32(%rax) movhps %xmm0, 40(%rax) .L94: movq 216(%rsp), %rax movq %rax, 184(%rsp) movq 224(%rsp), %rax movq 184(%rsp), %xmm0 movq %rax, 192(%rsp) movd %rax, %xmm1 addq $256, %rsp ret .p2align 4,,10 .p2align 3 .L98: .L96: movq %r10, 184(%rsp) movq %rsi, 192(%rsp) movd %r10, %xmm0 movd %rsi, %xmm1 addq $256, %rsp ret .cfi_endproc -- Tuomas Tonteri