/* In my quest to learn more about SSE programming on the x86, I decided to
   implement one of the benchmarks from the "Great Computer Language Shootout"
   ( http://shootout.alioth.debian.org/ ) using gcc's vector extensions...

       http://gcc.gnu.org/onlinedocs/gcc-4.0.0/gcc/Vector-Extensions.html

   ...Specifically, the "partialsums" test looked like it might be ripe for
   some speedups using the packed doubles of SSE2...

       http://shootout.alioth.debian.org/gp4/benchmark.php?test=partialsums

   ...The code below (see also http://sleepingsquirrel.org/cpp/psum_sse.c )
   works, but I'm looking for some advice about selecting the right set of
   compiler options to generate the assembly code I think should be possible.
   For example, at the -O1 level of optimization (with gcc-3.4.2), I get some
   assembly output like...

       #APP
           fsincos;
       #NO_APP
           sqrtpd  -200(%ebp), %xmm0
           movapd  -152(%ebp), %xmm3
           divpd   %xmm0, %xmm3
           ...

   ...(grab the whole thing at: http://sleepingsquirrel.org/cpp/psum_sse-O1.s )
   which looks nice, like I thought it would, with "fsincos" right before the
   "sqrtpd", the theory being that they could execute in parallel. But one
   thing I noticed was that the XMM register optimization wasn't all that
   good, there were more memory accesses and none of XMM4-7 were used. Turning
   the optimization level up to -O2 and -O3 does a much better job of register
   allocation (using all XMM registers), but the code starts to look like
   ( http://sleepingsquirrel.org/cpp/psum_sse-O2.s )...

       #APP
           fsincos;
       #NO_APP
           fstpl   -464(%ebp)
           fxch    %st(1)
           movapd  %xmm2, %xmm1
           addpd   %xmm0, %xmm6
           ...

   ...with the "fstpl" instruction right after "fsincos". I assume that this
   causes us to wait for "fsincos" to finish executing before we can continue,
   and so we can't take advantage of the inherent parallelism (maybe some x87
   wizard can confirm this?). Anyway, on my machine (1.8GHz Celeron), -O1 gives
   the fastest program (sadly, it isn't as fast as I had initially hoped, I
   guess the SIMD hardware isn't that parallel after all).

So I'm wondering, is there a combination of compiler flags that would get me the nice register allocation without mangling the code much? Other optimization hints/suggestions would be appreciated. (Also feel free to cuss me out if I'm micro-optimize something that's better left to the compiler.) Thanks, Greg Buchholz */ // Vectorized partialsums program for the // "Great Computer Language Shootout" // http://shootout.alioth.debian.org/gp4/benchmark.php?test=partialsums&lang=all // // Trying to figure out how to get the best combination of compiler // options for gcc. // // example compiler options: // -O1 -ffast-math -mfpmath=sse,387 -msse2 // -- vs. -- // -O3 -ffast-math -mfpmath=sse,387 -msse2 #include<math.h> #include<stdio.h> #include<stdlib.h> typedef double v2df __attribute__ ((vector_size (16))); v2df make_vec(double a, double b) { v2df v; double *tmp; tmp = (double *)&v; *(tmp) = a; *(tmp+1) = b; return v; } double sum_vec(v2df x) { double *tmp = (double *)&x; return *(tmp) + *(tmp+1); } void sincos_x87_inline(double x,double *s,double *c); extern __inline__ void sincos_x87_inline(double x,double *s,double *c) { __asm__ ("fsincos;" : "=t" (*c), "=u" (*s) : "0" (x) : "st(7)"); } int main(int argc, char* argv[]) { double twoThrd = 0, Flint = 0, Cookson = 0; v2df Harmonic, zeta, poly, alt, Gregory, sqrts; v2df zero, one, two, init, m_one, kv, av; double k, k3, s, c, rt; int n; n = atoi(argv[1]); zero = make_vec( 0.0, 0.0); one = make_vec( 1.0, 1.0); two = make_vec( 2.0, 2.0); m_one = make_vec(-1.0, -1.0); init = make_vec( 1.0, 2.0); av = make_vec( 1.0, -1.0); Harmonic = zeta = poly = alt = Gregory = sqrts = zero; for (k=0.0; k<=n; ++k) twoThrd += pow(2.0/3.0, k); for (k=1.0,kv=init; k<=n; kv+=two, ++k) { k3 = k*k*k; sincos_x87_inline(k,&s,&c); Harmonic+= one / kv; Cookson += 1.0/(k3 * c*c); Flint += 1.0/(k3 * s*s); ++k; k3 = k*k*k; sincos_x87_inline(k,&s,&c); sqrts += one / __builtin_ia32_sqrtpd(kv); Cookson += 1.0/(k3 * c*c); Flint += 1.0/(k3 * s*s); } for 
(k=1.0,kv=init; *(double *)(&kv)<=n; kv+=two) { poly += one /(kv*(kv+one)); zeta += one /(kv*kv); alt += av / kv; Gregory += av /(two*kv - one); } #define show(name,num) printf("%.9f\t%s\n",num,name) show("(2/3)^k", twoThrd); show("k^-0.5", sum_vec(sqrts)); show("1/k(k+1)", sum_vec(poly)); show("Flint Hills", Flint); show("Cookson Hills", Cookson); show("Harmonic", sum_vec(Harmonic)); show("Riemann Zeta",sum_vec(zeta)); show("Alternating Harmonic",sum_vec(alt)); show("Gregory", sum_vec(Gregory)); return 0; }