Performance regression between 4.7.2 and 4.8.1 for shootout's nbody.c

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hello Everyone,

I found an odd performance regression going from GCC 4.7.2 to 4.8.1, and I
wanted to make sure that it is not a problem on my end.  Specifically,
if I compile the nbody.c code from

  http://benchmarksgame.alioth.debian.org/u32/program.php?test=nbody&lang=gcc&id=4

(also attached) with a command like

  gcc -O3 -fomit-frame-pointer -march=native -mfpmath=sse -msse3 nbody_c.c -o nbody_c -lm

then run "/usr/bin/time nbody_c 50000000" 4 times, I get the results (elapsed time)

4.7.2: 4.42 4.44 4.42 4.43
4.8.1: 4.71 4.71 4.70 4.71

So 4.8.1 is about 6-7% slower than 4.7.2 (roughly 4.71 vs. 4.43).  Has anyone else seen this result?

I am getting my compilers from Debian.

Thanks,
Walter Landry
wlandry@xxxxxxxxxxx
/* The Computer Language Benchmarks Game
   http://benchmarksgame.alioth.debian.org/

   contributed by Mark C. Lewis
   modified slightly by Chad Whipkey
   converted from java to c++,added sse support, by Branimir Maksimovic
   converted from c++ to c, by Alexey Medvedchikov 
*/

#include <errno.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <immintrin.h>

/* Double-precision literal for pi. */
#define PI 3.141592653589793
/* 4*pi^2: the mass given to the sun entry of solar_bodies; every other
   body's mass is written as a multiple of it, and offset_momentum divides
   by it. */
#define SOLAR_MASS ( 4 * PI * PI )
/* Factor applied to every initial velocity component in solar_bodies
   (presumably converting per-day velocities to per-year -- confirm against
   the benchmark description). */
#define DAYS_PER_YEAR 365.24

/* State of one simulated body.  The member order is part of the layout:
   eight doubles, with `fill` sitting between the position and the velocity. */
struct body {
   double x[3];   /* position components, advanced by dt * v each step */
   double fill;   /* never read or written by the code in this file;
                     presumably padding so that v starts on a 4-double
                     boundary -- confirm before removing */
   double v[3];   /* velocity components */
   double mass;   /* mass, in the same units as SOLAR_MASS */
};

/* Initial state of the simulated system: sun, jupiter, saturn, uranus and
   neptune, in that order.  Each entry supplies a position (.x), a velocity
   (.v, every component scaled by DAYS_PER_YEAR) and a mass (.mass, written
   as a multiple of SOLAR_MASS); the `fill` member is not named and is
   therefore zero-initialized.  The array is deliberately not const: main
   passes it to offset_momentum and bodies_advance, which modify it in place.
   Entry 0 must stay the sun, because offset_momentum adjusts bodies[0]. */
static struct body solar_bodies[] = {
   /* sun */
   {
      .x = { 0., 0., 0. },
      .v = { 0., 0., 0. },
      .mass = SOLAR_MASS
   },
   /* jupiter */
   {
      .x = { 4.84143144246472090e+00,
         -1.16032004402742839e+00,
         -1.03622044471123109e-01 },
      .v = { 1.66007664274403694e-03 * DAYS_PER_YEAR,
         7.69901118419740425e-03 * DAYS_PER_YEAR,
         -6.90460016972063023e-05 * DAYS_PER_YEAR },
      .mass = 9.54791938424326609e-04 * SOLAR_MASS
   },
   /* saturn */
   {
      .x = { 8.34336671824457987e+00,
         4.12479856412430479e+00,
         -4.03523417114321381e-01 },
      .v = { -2.76742510726862411e-03 * DAYS_PER_YEAR,
         4.99852801234917238e-03 * DAYS_PER_YEAR,
         2.30417297573763929e-05 * DAYS_PER_YEAR },
      .mass = 2.85885980666130812e-04 * SOLAR_MASS
   },
   /* uranus */
   {
      .x = { 1.28943695621391310e+01,
         -1.51111514016986312e+01,
         -2.23307578892655734e-01 },
      .v = { 2.96460137564761618e-03 * DAYS_PER_YEAR,
         2.37847173959480950e-03 * DAYS_PER_YEAR,
         -2.96589568540237556e-05 * DAYS_PER_YEAR },
      .mass = 4.36624404335156298e-05 * SOLAR_MASS
   },
   /* neptune */
   {
      .x = { 1.53796971148509165e+01,
         -2.59193146099879641e+01,
         1.79258772950371181e-01 },
      .v = { 2.68067772490389322e-03 * DAYS_PER_YEAR,
         1.62824170038242295e-03 * DAYS_PER_YEAR,
         -9.51592254519715870e-05 * DAYS_PER_YEAR },
      .mass = 5.15138902046611451e-05 * SOLAR_MASS
   }
};

/* Number of entries in solar_bodies, derived from the array itself so it
   follows the initializer list automatically. */
static const int BODIES_SIZE =
   (int)(sizeof solar_bodies / sizeof *solar_bodies);

void offset_momentum(struct body *bodies, unsigned int nbodies)
{
   unsigned int i, k;
   for (i = 0; i < nbodies; ++i)
      for (k = 0; k < 3; ++k)
         bodies[0].v[k] -= bodies[i].v[k] * bodies[i].mass
            / SOLAR_MASS;
}

/*
 * Advance the system by one time step dt, updating velocities and then
 * positions in place.
 *
 *   1. Store the separation vector of every unordered pair (i < j).
 *   2. Two pairs at a time, compute mag = dt / d^3 with SSE2: a
 *      single-precision reciprocal-square-root estimate of 1/d is refined by
 *      two Newton-Raphson iterations, then multiplied by dt / d^2.
 *   3. Apply the velocity change of each pair to both of its bodies.
 *   4. Move every body by dt * v.
 *
 * bodies   array of at least nbodies entries, modified in place.
 * nbodies  number of bodies; with fewer than two there are no pairs and only
 *          step 4 runs.
 * dt       time step.
 *
 * The scratch buffers have static storage, so the function is not reentrant.
 * They hold at most MAX_PAIRS pairs (45 bodies); a larger request prints a
 * diagnostic and aborts instead of overrunning them.
 */
void bodies_advance(struct body *bodies, unsigned int nbodies, double dt)
{
   enum { MAX_PAIRS = 1000 };
   static struct {
      double dx[3], fill;
   } r[MAX_PAIRS];
   static __attribute__((aligned(16))) double mag[MAX_PAIRS];
   unsigned int i, j, k, m;

   /* With nbodies == 0 the expression nbodies - 1 would wrap around, so the
      pair phases are skipped entirely when there are no pairs. */
   if (nbodies >= 2) {
      /* 64-bit product: cannot wrap for any unsigned int nbodies. */
      const unsigned long long wanted =
         (unsigned long long)nbodies * (nbodies - 1) / 2;
      unsigned int N;

      if (wanted > MAX_PAIRS) {
         fprintf(stderr,
            "bodies_advance: %u bodies need %llu pair slots, only %d available\n",
            nbodies, wanted, (int)MAX_PAIRS);
         abort();
      }
      N = (unsigned int)wanted;

      for (k = 0, i = 0; i < nbodies - 1; ++i)
         for (j = i + 1; j < nbodies; ++j, ++k)
            for (m = 0; m < 3; ++m)
               r[k].dx[m] = bodies[i].x[m] - bodies[j].x[m];

      /* The SIMD loop consumes pairs two at a time.  For an odd count give
         the spare lane a harmless unit vector so it never computes on stale
         or zero data (which would produce inf/NaN).  N is odd here, hence
         N <= MAX_PAIRS - 1 and r[N] / mag[N] are in bounds. */
      if (N % 2 != 0) {
         r[N].dx[0] = 1.0;
         r[N].dx[1] = 0.0;
         r[N].dx[2] = 0.0;
      }

      for (i = 0; i < N; i += 2) {
         __m128d dx[3], dsquared, half_dsquared, distance, dmag;

         /* Low lane = pair i, high lane = pair i + 1.  _mm_set_pd builds the
            vector from scratch, so no uninitialized register is read. */
         for (m = 0; m < 3; ++m)
            dx[m] = _mm_set_pd(r[i + 1].dx[m], r[i].dx[m]);

         dsquared = _mm_add_pd(
            _mm_add_pd(_mm_mul_pd(dx[0], dx[0]), _mm_mul_pd(dx[1], dx[1])),
            _mm_mul_pd(dx[2], dx[2]));

         /* Despite its name, `distance` holds the reciprocal 1/d. */
         distance = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(dsquared)));

         /* Newton-Raphson for 1/sqrt(a): y <- 1.5*y - (0.5*a*y) * (y*y). */
         half_dsquared = _mm_mul_pd(_mm_set1_pd(0.5), dsquared);
         for (j = 0; j < 2; ++j)
            distance = _mm_sub_pd(
               _mm_mul_pd(distance, _mm_set1_pd(1.5)),
               _mm_mul_pd(_mm_mul_pd(half_dsquared, distance),
                          _mm_mul_pd(distance, distance)));

         /* (dt / d^2) * (1/d) = dt / d^3 */
         dmag = _mm_mul_pd(_mm_div_pd(_mm_set1_pd(dt), dsquared), distance);

         /* i is even and mag is 16-byte aligned, so the aligned store is valid. */
         _mm_store_pd(&mag[i], dmag);
      }

      for (i = 0, k = 0; i < nbodies - 1; ++i)
         for (j = i + 1; j < nbodies; ++j, ++k)
            for (m = 0; m < 3; ++m) {
               bodies[i].v[m] -= r[k].dx[m] * bodies[j].mass
                  * mag[k];
               bodies[j].v[m] += r[k].dx[m] * bodies[i].mass
                  * mag[k];
            }
   }

   for (i = 0; i < nbodies; ++i)
      for (m = 0; m < 3; ++m)
         bodies[i].x[m] += dt * bodies[i].v[m];
}

double bodies_energy(struct body *bodies, unsigned int nbodies) {
   double dx[3], distance, e = 0.0;
   unsigned int i, j, k;

   for (i=0; i < nbodies; ++i) {
      e += bodies[i].mass * ( bodies[i].v[0] * bodies[i].v[0]
         + bodies[i].v[1] * bodies[i].v[1]
         + bodies[i].v[2] * bodies[i].v[2] ) / 2.;

      for (j=i+1; j < nbodies; ++j) {
         for (k = 0; k < 3; ++k)
            dx[k] = bodies[i].x[k] - bodies[j].x[k];

         distance = sqrt(dx[0] * dx[0] + dx[1] * dx[1] 
            + dx[2] * dx[2]);
         e -= (bodies[i].mass * bodies[j].mass) / distance;
      }
   }
   return e;
}

/*
 * Entry point: nbody <steps>
 *
 * Offsets the sun's momentum, prints the system energy, advances the system
 * <steps> times with a time step of 0.01, and prints the energy again.
 * A step count of zero or less performs no steps.
 *
 * Returns 0 on success; prints a diagnostic to stderr and returns
 * EXIT_FAILURE when the argument is missing, is not a complete decimal
 * integer, or does not fit in an int.
 */
int main(int argc, char** argv)
{
   char *end;
   long steps;
   int i, n;

   /* argv[1] is NULL when no argument is given; check before parsing. */
   if (argc < 2) {
      fprintf(stderr, "usage: %s <steps>\n", argc > 0 ? argv[0] : "nbody");
      return EXIT_FAILURE;
   }

   /* strtol, unlike atoi, reports malformed and out-of-range input. */
   errno = 0;
   steps = strtol(argv[1], &end, 10);
   if (end == argv[1] || *end != '\0' || errno == ERANGE
       || steps < INT_MIN || steps > INT_MAX) {
      fprintf(stderr, "invalid step count: '%s'\n", argv[1]);
      return EXIT_FAILURE;
   }
   n = (int)steps;

   offset_momentum(solar_bodies, BODIES_SIZE);
   printf("%.9f\n", bodies_energy(solar_bodies, BODIES_SIZE));
   for (i = 0; i < n; ++i)
      bodies_advance(solar_bodies, BODIES_SIZE, 0.01);
   printf("%.9f\n", bodies_energy(solar_bodies, BODIES_SIZE));
   return 0;
}

[Index of Archives]     [Linux C Programming]     [Linux Kernel]     [eCos]     [Fedora Development]     [Fedora Announce]     [Autoconf]     [The DWARVES Debugging Tools]     [Yosemite Campsites]     [Yosemite News]     [Linux GCC]

  Powered by Linux