Optimization flags, register allocation, parallelism

/*
    In my quest to learn more about SSE programming on the x86, I
decided to implement one of the benchmarks from the "Great Computer
Language Shootout" ( http://shootout.alioth.debian.org/ ) using gcc's
vector extensions...

    http://gcc.gnu.org/onlinedocs/gcc-4.0.0/gcc/Vector-Extensions.html

...Specifically, the "partialsums" test looked like it might be ripe for
some speedups using the packed doubles of SSE2...

    http://shootout.alioth.debian.org/gp4/benchmark.php?test=partialsums

...The code below (see also http://sleepingsquirrel.org/cpp/psum_sse.c )
works, but I'm looking for some advice about selecting the right set of
compiler options to generate the assembly code I think should be
possible.  For example, at the -O1 level of optimization (with
gcc-3.4.2), I get some assembly output like...

#APP
    fsincos;
#NO_APP
    sqrtpd  -200(%ebp), %xmm0
    movapd  -152(%ebp), %xmm3
    divpd   %xmm0, %xmm3
    ...

...(grab the whole thing at: http://sleepingsquirrel.org/cpp/psum_sse-O1.s )
which looks nice, like I thought it would, with "fsincos" right before
the "sqrtpd", the theory being that the two could execute in parallel.
But one thing I noticed was that the XMM register allocation wasn't all
that good: there were more memory accesses than necessary, and none of
XMM4-7 were used.  Turning the optimization level up to -O2 or -O3 does
a much better job of register allocation (using all eight XMM
registers), but the code starts to look like
( http://sleepingsquirrel.org/cpp/psum_sse-O2.s )...

#APP
    fsincos;
#NO_APP
    fstpl   -464(%ebp)
    fxch    %st(1)
    movapd  %xmm2, %xmm1
    addpd   %xmm0, %xmm6
    ...

...with the "fstpl" instruction right after "fsincos".  I assume that
this forces us to wait for "fsincos" to finish executing before we can
continue, so we can't take advantage of the inherent parallelism
(maybe some x87 wizard can confirm this?).  Anyway, on my machine
(1.8GHz Celeron), -O1 gives the fastest program (sadly, it isn't as
fast as I had initially hoped; I guess the SIMD hardware isn't that
parallel after all).  So I'm wondering: is there a combination of
compiler flags that would get me the nice register allocation without
mangling the code much?  Other optimization hints/suggestions would be
appreciated.  (Also feel free to cuss me out if I'm micro-optimizing
something that's better left to the compiler.)

Thanks, 

Greg Buchholz

*/

// Vectorized partialsums program for the 
// "Great Computer Language Shootout"
//  http://shootout.alioth.debian.org/gp4/benchmark.php?test=partialsums&lang=all
//
// Trying to figure out how to get the best combination of compiler
// options for gcc.
//
// example compiler options: 
//  -O1 -ffast-math -mfpmath=sse,387 -msse2
// -- vs. --
//  -O3 -ffast-math -mfpmath=sse,387 -msse2

#include<math.h>
#include<stdio.h>
#include<stdlib.h>
typedef double v2df __attribute__ ((vector_size (16)));

v2df make_vec(double a, double b)
{
    /* Build a packed-double vector by writing its two elements
       through a double pointer: element 0 = a, element 1 = b. */
    v2df v;
    double *tmp = (double *)&v;
    *(tmp)   = a;
    *(tmp+1) = b;
    return v;
}

double sum_vec(v2df x)
{
    /* Horizontal add: return the sum of the vector's two elements. */
    double *tmp = (double *)&x;
    return *(tmp) + *(tmp+1);
}

void sincos_x87_inline(double x, double *s, double *c);
extern __inline__ void sincos_x87_inline(double x, double *s, double *c)
{
    /* fsincos replaces st(0) with sin(x) and then pushes cos(x), so
       afterwards st(0) = cos ("=t") and st(1) = sin ("=u"); the push
       is why st(7) is listed as clobbered. */
    __asm__ ("fsincos;" : "=t" (*c), "=u" (*s) : "0" (x) : "st(7)");
}
            
int main(int argc, char* argv[])
{
    double  twoThrd = 0, Flint = 0, Cookson = 0;
    v2df    Harmonic, zeta, poly, alt, Gregory, sqrts;
    v2df    zero, one, two, init, m_one, kv, av;
    
    double  k, k3, s, c, rt;
    int n;  n = atoi(argv[1]);

    zero  = make_vec( 0.0,  0.0);  one   = make_vec( 1.0,  1.0);
    two   = make_vec( 2.0,  2.0);  m_one = make_vec(-1.0, -1.0);
    init  = make_vec( 1.0,  2.0);  av    = make_vec( 1.0, -1.0);
    
    Harmonic = zeta = poly = alt = Gregory = sqrts = zero; 

    for (k=0.0; k<=n; ++k)
        twoThrd += pow(2.0/3.0, k);

    for (k=1.0, kv=init; k<=n; kv+=two, ++k)
    {
        k3 = k*k*k;
        sincos_x87_inline(k, &s, &c);
        Harmonic += one / kv;
        Cookson  += 1.0/(k3 * c*c);
        Flint    += 1.0/(k3 * s*s);

        ++k;
        k3 = k*k*k;
        sincos_x87_inline(k, &s, &c);
        sqrts    += one / __builtin_ia32_sqrtpd(kv);
        Cookson  += 1.0/(k3 * c*c);
        Flint    += 1.0/(k3 * s*s);
    }
    for (k=1.0,kv=init;  *(double *)(&kv)<=n; kv+=two)
    {
        poly    += one /(kv*(kv+one)); 
        zeta    += one /(kv*kv);
        alt     +=  av / kv;  
        Gregory +=  av /(two*kv - one);
    }
    
    
#define show(name,num) printf("%.9f\t%s\n",num,name)
    show("(2/3)^k",           twoThrd); show("k^-0.5",      sum_vec(sqrts));
    show("1/k(k+1)",    sum_vec(poly)); show("Flint Hills", Flint);
    show("Cookson Hills",     Cookson); show("Harmonic", sum_vec(Harmonic));  
    show("Riemann Zeta",sum_vec(zeta)); show("Alternating Harmonic",sum_vec(alt));
    show("Gregory",  sum_vec(Gregory));
    
    return 0;
}
