GCC gives major speed up with "-pg" flag for SIMD code

After adding an SSE path to a C++ ray tracer program, I noticed that some
parts of the program run almost 2x faster when I compile with the "-pg"
profiling flag. Before these additions, "-pg" produced a performance
hit in the total program execution time, as expected.

Readable vectorized code is achieved through a C++ class that
overloads the arithmetic operators on the basic SSE type
__m128, as follows:

class f32x4
  const f32x4&operator+=(const f32x4&v)
     m = _mm_add_ps(m, v.m);

 /*... overload more operators ... */

    __m128 m;
    float  f[4];

Here is an example of a function that takes almost 2x less time to
execute when the program is compiled with "-pg":

/* C++ */

f32x4b pintersect4(f32x4& a, p3_f32x4& raydir, Vector3& cameyepos,
p3_f32x4& hitpoint, f32x4& distance, p3_f32x4& normal) { f32x4 b =
Dot(raydir, bpart); f32x4 D = b*b - a*c;

// If none of the rays can intersect the sphere then stop
f32x4b mask = D > 0.0f;

if (ForWhich(mask) == 0)
        return mask;


f32x4 t = -0.5f*((b+D)/a);

// If sphere center is in front of camera surface
mask = mask && t > 1.0f;

if (ForWhich(mask) == 0)
        return mask;

distance=a*t; // Should be sqrt(a) for real distance
hitpoint=(t*raydir) + cameyepos;
normal=(hitpoint - pos) / rad;

return mask;

I had the same result with both GCC 4.3.2 and 4.4.1. The program is
compiled with "-O3 -march=core2", and lowering the -O level doesn't
affect the behaviour. The speed difference is measured by monitoring
the x86 rdtsc() cycle count, and the CPU frequency is kept constant.
This is an x86_64 platform.

I am seeking advice on what could cause this.

Here is the assembly output of that function with profiling enabled.
The main difference I can see between this and the one without profiling
is that different addresses are being used when loading the XMM
registers:

	.cfi_personality 0x3,__gxx_personality_v0
	pushq	%rbp
	.cfi_def_cfa_offset 16
	movq	%rsp, %rbp
	.cfi_offset 6, -16
	.cfi_def_cfa_register 6
	subq	$368, %rsp
	call	mcount
	movss	184(%rdi), %xmm2
	movss	188(%rdi), %xmm0
	movss	192(%rdi), %xmm3
	movss	8(%rdx), %xmm1
	movss	4(%rdx), %xmm4
	leaq	16(%rdx), %r10
	mulss	%xmm0, %xmm4
	mulss	%xmm3, %xmm1
	movq	16(%rbp), %rax
	addss	%xmm4, %xmm1
	movss	(%rdx), %xmm4
	mulss	%xmm2, %xmm4
	addss	%xmm4, %xmm1
	movss	%xmm1, -16(%rbp)
	movss	8(%r10), %xmm1
	movss	4(%r10), %xmm4
	mulss	%xmm3, %xmm1
	mulss	%xmm0, %xmm4
	leaq	32(%rdx), %r10
	addss	%xmm4, %xmm1
	movss	16(%rdx), %xmm4
	mulss	%xmm2, %xmm4
	addss	%xmm4, %xmm1
	movss	%xmm1, -12(%rbp)
	movss	8(%r10), %xmm1
	movss	4(%r10), %xmm4
	mulss	%xmm3, %xmm1
	mulss	%xmm0, %xmm4
	leaq	48(%rdx), %r10
	addss	%xmm4, %xmm1
	movss	32(%rdx), %xmm4
	mulss	%xmm2, %xmm4
	addss	%xmm4, %xmm1
	movss	%xmm1, -8(%rbp)
	mulss	8(%r10), %xmm3
	mulss	4(%r10), %xmm0
	mulss	48(%rdx), %xmm2
	addss	%xmm3, %xmm0
	xorps	%xmm3, %xmm3
	addss	%xmm2, %xmm0
	movss	%xmm0, -4(%rbp)
	xorps	%xmm0, %xmm0
	movaps	(%rsi), %xmm4
	movss	180(%rdi), %xmm2
	movlps	-16(%rbp), %xmm0
	shufps	$0, %xmm2, %xmm2
	movhps	-8(%rbp), %xmm0
	mulps	%xmm4, %xmm2
	movaps	%xmm0, %xmm1
	mulps	%xmm0, %xmm1
	subps	%xmm2, %xmm1
	movaps	%xmm3, %xmm2
	cmpltps	%xmm1, %xmm2
	movlps	%xmm2, -96(%rbp)
	movhps	%xmm2, -88(%rbp)
	movq	-96(%rbp), %r10
	movq	-88(%rbp), %rsi
	movd	%r10, %xmm5
	shufps	$0xe4, %xmm3, %xmm5
	movaps	%xmm5, %xmm2
	movd	%rsi, %xmm5
	movlhps	%xmm5, %xmm2
	movmskps	%xmm2, %r11d
	testl	%r11d, %r11d
	je	.L98
	sqrtps	%xmm1, %xmm1
	addps	%xmm0, %xmm1
	movaps	.LC10(%rip), %xmm0
	divps	%xmm4, %xmm1
	mulps	.LC9(%rip), %xmm1
	cmpltps	%xmm1, %xmm0
	movlps	%xmm0, -144(%rbp)
	movhps	%xmm0, -136(%rbp)
	movq	-144(%rbp), %rsi
	movq	%rsi, -48(%rbp)
	movq	-136(%rbp), %rsi
	movq	%rsi, -40(%rbp)
	andps	-48(%rbp), %xmm2
	movlps	%xmm2, -192(%rbp)
	movhps	%xmm2, -184(%rbp)
	movq	-192(%rbp), %r10
	movq	-184(%rbp), %rsi
	movd	%r10, %xmm2
	movd	%rsi, %xmm5
	shufps	$0xe4, %xmm3, %xmm2
	movq	%r10, -32(%rbp)
	movaps	%xmm2, %xmm0
	movq	%rsi, -24(%rbp)
	movlhps	%xmm5, %xmm0
	movmskps	%xmm0, %r11d
	testl	%r11d, %r11d
	je	.L98
	mulps	%xmm1, %xmm4
	movaps	%xmm4, (%r9)
	mulps	(%rdx), %xmm1
	movss	(%rcx), %xmm0
	leaq	-256(%rbp), %rdx
	shufps	$0, %xmm0, %xmm0
	cmpq	%rdx, %r8
	addps	%xmm0, %xmm1
	je	.L87
	movaps	%xmm1, (%r8)
	movaps	%xmm3, %xmm0
	movlps	-240(%rbp), %xmm0
	movhps	-232(%rbp), %xmm0
	movlps	%xmm0, 16(%r8)
	movhps	%xmm0, 24(%r8)
	movlps	-224(%rbp), %xmm3
	movhps	-216(%rbp), %xmm3
	movlps	%xmm3, 32(%r8)
	movhps	%xmm3, 40(%r8)
	movss	8(%rdi), %xmm0
	movss	140(%rdi), %xmm2
	shufps	$0, %xmm0, %xmm0
	shufps	$0, %xmm2, %xmm2
	subps	%xmm0, %xmm1
	leaq	-352(%rbp), %rdx
	divps	%xmm2, %xmm1
	xorps	%xmm0, %xmm0
	movlps	%xmm1, -352(%rbp)
	movlps	-288(%rbp), %xmm0
	movhps	%xmm1, -344(%rbp)
	movhps	-280(%rbp), %xmm0
	cmpq	%rdx, %rax
	divps	%xmm2, %xmm0
	movlps	%xmm0, -336(%rbp)
	movhps	%xmm0, -328(%rbp)
	xorps	%xmm0, %xmm0
	movlps	-272(%rbp), %xmm0
	movhps	-264(%rbp), %xmm0
	divps	%xmm2, %xmm0
	movlps	%xmm0, -320(%rbp)
	movhps	%xmm0, -312(%rbp)
	je	.L94
	xorps	%xmm0, %xmm0
	movlps	-352(%rbp), %xmm0
	movhps	-344(%rbp), %xmm0
	movlps	%xmm0, (%rax)
	movhps	%xmm0, 8(%rax)
	xorps	%xmm0, %xmm0
	movlps	-336(%rbp), %xmm0
	movhps	-328(%rbp), %xmm0
	movlps	%xmm0, 16(%rax)
	movhps	%xmm0, 24(%rax)
	xorps	%xmm0, %xmm0
	movlps	-320(%rbp), %xmm0
	movhps	-312(%rbp), %xmm0
	movlps	%xmm0, 32(%rax)
	movhps	%xmm0, 40(%rax)
	movq	-32(%rbp), %rax
	movq	%rax, -64(%rbp)
	movq	-24(%rbp), %rax
	movq	-64(%rbp), %xmm0
	movq	%rax, -56(%rbp)
	movd	%rax, %xmm1
	.p2align 4,,10
	.p2align 3
	movq	%r10, -64(%rbp)
	movq	%rsi, -56(%rbp)
	movd	%r10, %xmm0
	movd	%rsi, %xmm1

And here is the same thing without profiling:

	.cfi_personality 0x3,__gxx_personality_v0
	subq	$256, %rsp
	.cfi_def_cfa_offset 264
	leaq	16(%rdx), %r10
	movss	8(%rdx), %xmm1
	movss	4(%rdx), %xmm4
	movss	184(%rdi), %xmm2
	movss	188(%rdi), %xmm0
	movss	192(%rdi), %xmm3
	mulss	%xmm0, %xmm4
	mulss	%xmm3, %xmm1
	movq	264(%rsp), %rax
	addss	%xmm4, %xmm1
	movss	(%rdx), %xmm4
	mulss	%xmm2, %xmm4
	addss	%xmm4, %xmm1
	movss	%xmm1, 232(%rsp)
	movss	8(%r10), %xmm1
	movss	4(%r10), %xmm4
	mulss	%xmm3, %xmm1
	mulss	%xmm0, %xmm4
	leaq	32(%rdx), %r10
	addss	%xmm4, %xmm1
	movss	16(%rdx), %xmm4
	mulss	%xmm2, %xmm4
	addss	%xmm4, %xmm1
	movss	%xmm1, 236(%rsp)
	movss	8(%r10), %xmm1
	movss	4(%r10), %xmm4
	mulss	%xmm3, %xmm1
	mulss	%xmm0, %xmm4
	leaq	48(%rdx), %r10
	addss	%xmm4, %xmm1
	movss	32(%rdx), %xmm4
	mulss	%xmm2, %xmm4
	addss	%xmm4, %xmm1
	movss	%xmm1, 240(%rsp)
	mulss	8(%r10), %xmm3
	mulss	4(%r10), %xmm0
	mulss	48(%rdx), %xmm2
	addss	%xmm3, %xmm0
	xorps	%xmm3, %xmm3
	addss	%xmm2, %xmm0
	movss	180(%rdi), %xmm2
	movss	%xmm0, 244(%rsp)
	shufps	$0, %xmm2, %xmm2
	movaps	(%rsi), %xmm4
	xorps	%xmm0, %xmm0
	mulps	%xmm4, %xmm2
	movlps	232(%rsp), %xmm0
	movhps	240(%rsp), %xmm0
	movaps	%xmm0, %xmm1
	mulps	%xmm0, %xmm1
	subps	%xmm2, %xmm1
	movaps	%xmm3, %xmm2
	cmpltps	%xmm1, %xmm2
	movlps	%xmm2, 152(%rsp)
	movhps	%xmm2, 160(%rsp)
	movq	152(%rsp), %r10
	movq	160(%rsp), %rsi
	movd	%r10, %xmm5
	shufps	$0xe4, %xmm3, %xmm5
	movaps	%xmm5, %xmm2
	movd	%rsi, %xmm5
	movlhps	%xmm5, %xmm2
	movmskps	%xmm2, %r11d
	testl	%r11d, %r11d
	je	.L98
	sqrtps	%xmm1, %xmm1
	addps	%xmm0, %xmm1
	movaps	.LC10(%rip), %xmm0
	divps	%xmm4, %xmm1
	mulps	.LC9(%rip), %xmm1
	cmpltps	%xmm1, %xmm0
	movlps	%xmm0, 104(%rsp)
	movhps	%xmm0, 112(%rsp)
	movq	104(%rsp), %rsi
	movq	%rsi, 200(%rsp)
	movq	112(%rsp), %rsi
	movq	%rsi, 208(%rsp)
	andps	200(%rsp), %xmm2
	movlps	%xmm2, 56(%rsp)
	movhps	%xmm2, 64(%rsp)
	movq	56(%rsp), %r10
	movq	64(%rsp), %rsi
	movd	%r10, %xmm2
	movd	%rsi, %xmm5
	shufps	$0xe4, %xmm3, %xmm2
	movq	%r10, 216(%rsp)
	movaps	%xmm2, %xmm0
	movq	%rsi, 224(%rsp)
	movlhps	%xmm5, %xmm0
	movmskps	%xmm0, %r11d
	testl	%r11d, %r11d
	je	.L98
	mulps	%xmm1, %xmm4
	movaps	%xmm4, (%r9)
	mulps	(%rdx), %xmm1
	movss	(%rcx), %xmm0
	leaq	-8(%rsp), %rdx
	shufps	$0, %xmm0, %xmm0
	cmpq	%rdx, %r8
	addps	%xmm0, %xmm1
	je	.L87
	movaps	%xmm3, %xmm0
	movaps	%xmm1, (%r8)
	movlps	8(%rsp), %xmm0
	movlps	24(%rsp), %xmm3
	movhps	16(%rsp), %xmm0
	movhps	32(%rsp), %xmm3
	movlps	%xmm0, 16(%r8)
	movhps	%xmm0, 24(%r8)
	movlps	%xmm3, 32(%r8)
	movhps	%xmm3, 40(%r8)
	movss	8(%rdi), %xmm0
	movss	140(%rdi), %xmm2
	shufps	$0, %xmm0, %xmm0
	shufps	$0, %xmm2, %xmm2
	subps	%xmm0, %xmm1
	leaq	-104(%rsp), %rdx
	divps	%xmm2, %xmm1
	xorps	%xmm0, %xmm0
	movlps	%xmm1, -104(%rsp)
	movlps	-40(%rsp), %xmm0
	movhps	%xmm1, -96(%rsp)
	movhps	-32(%rsp), %xmm0
	cmpq	%rdx, %rax
	divps	%xmm2, %xmm0
	movlps	%xmm0, -88(%rsp)
	movhps	%xmm0, -80(%rsp)
	xorps	%xmm0, %xmm0
	movlps	-24(%rsp), %xmm0
	movhps	-16(%rsp), %xmm0
	divps	%xmm2, %xmm0
	movlps	%xmm0, -72(%rsp)
	movhps	%xmm0, -64(%rsp)
	je	.L94
	xorps	%xmm0, %xmm0
	movlps	-104(%rsp), %xmm0
	movhps	-96(%rsp), %xmm0
	movlps	%xmm0, (%rax)
	movhps	%xmm0, 8(%rax)
	xorps	%xmm0, %xmm0
	movlps	-88(%rsp), %xmm0
	movhps	-80(%rsp), %xmm0
	movlps	%xmm0, 16(%rax)
	movhps	%xmm0, 24(%rax)
	xorps	%xmm0, %xmm0
	movlps	-72(%rsp), %xmm0
	movhps	-64(%rsp), %xmm0
	movlps	%xmm0, 32(%rax)
	movhps	%xmm0, 40(%rax)
	movq	216(%rsp), %rax
	movq	%rax, 184(%rsp)
	movq	224(%rsp), %rax
	movq	184(%rsp), %xmm0
	movq	%rax, 192(%rsp)
	movd	%rax, %xmm1
	addq	$256, %rsp
	.p2align 4,,10
	.p2align 3
	movq	%r10, 184(%rsp)
	movq	%rsi, 192(%rsp)
	movd	%r10, %xmm0
	movd	%rsi, %xmm1
	addq	$256, %rsp

Tuomas Tonteri

