SSE SIMD enhanced code 4x slower than regular code

Boris Hollas <borish@xxxxxx> · Tue, 17 Jan 2012 23:34:12 -0800 (PST)

Hello,

I have a function iter1 that iterates a sequence of complex numbers. I
redesigned this function, using SSE intrinsics such as _mm_mul_pd, to obtain
iter0. Nonetheless, iter0 is 4x slower than iter1:

iter0 (with SSE intrinsics):
$ gcc -O -march=core2 t.c && time ./a.out
257829745

real	0m7.912s
user	0m7.908s
sys	0m0.000s

iter1 (w/o SSE intrinsics):
$ gcc -O -march=core2 t.c && time ./a.out
257829745

real	0m2.075s
user	0m2.076s
sys	0m0.000s

The size of a.out ist 7.1K in both cases. I use gcc version 4.4.5 and the
CPU is an Intel Core 2 Duo.

The code is below. iter0 and iter1 give the same numerical results.

#include <pmmintrin.h>
#include <stdio.h>
#define sqr(x) ((x)*(x))

typedef union {
  __m128d m;
  double v[2]; // v[0] low, v[1] up
} v2df;

int iter0(v2df z, v2df c, int n, int bound) {
  v2df z2, z2r, z2r_addsub, z_;
  z2.m = _mm_mul_pd(z.m, z.m);  // z_re^2, z_im^2
  z2r.v[1] = z2.v[0];
  z2r.v[0] = z2.v[1];
  z2r_addsub.m = _mm_addsub_pd(z2r.m, z2.m); // z_re^2 + z_im^2, z_re^2 -
z_im^2

  if(z2r_addsub.v[1] > 4.0 || n == bound) return n;
  else {
	z_.v[1] = z2r_addsub.v[0];
	z_.v[0] = 2.0 * z.v[1] * z.v[0];
	z_.m = _mm_add_pd(z_.m, c.m); // z_re^2 - z_im^2 + c_re, 2 * z_re * z_im +
c_im
	return iter0(z_, c, n+1, bound);
  }
}

int iter1(double z_re, double z_im, double c_re, double c_im, int n, int
bound) {
  double zre2 = sqr(z_re);
  double zim2 = sqr(z_im);

  if(zre2 + zim2 > 4.0 || n == bound) return n;
  else return iter1(zre2 - zim2 + c_re, 2.0 * z_re * z_im + c_im, c_re,
c_im, n+1, bound);
}

#define sse

int main() {
  v2df z, c;
  long n = 0;
  z.v[1] = 0.0; z.v[0] = 0.0;

  for(c.v[1] = -2.0; c.v[1] < 1.0; c.v[1] += 3.0/1000.0) {
    for(c.v[0] = -1.0; c.v[0] < 1.0; c.v[0] += 2.0/1000.0) {
#ifdef sse
  	  n += iter0(z, c, 0, 1000);
#else
	  n += iter1(0.0, 0.0, c.v[1], c.v[0], 0, 1000);
#endif
    }
  }
  printf("%ld\n", n);
  return 0;
}

-- 
View this message in context: http://old.nabble.com/SSE-SIMD-enhanced-code-4x-slower-than-regular-code-tp33159404p33159404.html
Sent from the gcc - Help mailing list archive at Nabble.com.