Re: [Gimp-developer] Solaris 64bit compile

Daniel Egger <egger@xxxxxxx> · 06 Sep 2001 17:20:18 +0200

Am 06 Sep 2001 10:19:28 +0200 schrieb Mattias Engdegård:

> This is completely portable, and should be a good deal faster than
> conditionally adding each component separately, at least on modern
> superscalar machines with expensive unpredicted branches. And benchmarks
> confirm this

I like it. I did some benchmarking with a few routines with different
compilers on ppc and i686 and here are the results:

egger@sonja:~/test > time ./testmat 
Time needed for padd_sat_4x8 in clocks: 5550000
Time needed for padd_sat_4x8_and in clocks: 6640000
Time needed for padd_sat_4x8_norm in clocks: 7090000

real	0m21.046s
user	0m18.680s
sys	0m0.650s

Options to compile:
/opt/experimental/bin/gcc -O3  -fssa  -save-temps test.c -o testmat 

egger@sonja:~/test > time ./testmat 
Time needed for padd_sat_4x8 in clocks: 5550000
Time needed for padd_sat_4x8_and in clocks: 5840000
Time needed for padd_sat_4x8_norm in clocks: 7840000
Time needed for padd_sat_4x8_vec in clocks: 1780000

real	0m21.477s
user	0m20.520s
sys	0m0.530s

Same machine but gcc-2.95.3 with Altivec support.
Options to compile:
/opt/gcc-altivec/bin/gcc -O3  -mcpu=7400 -fvec -save-temps test.c -o
testmat

egger@alex:~ > time ./testmat 
Time needed for padd_sat_4x8 in clocks: 8830000
Time needed for padd_sat_4x8_and in clocks: 10730000
Time needed for padd_sat_4x8_norm in clocks: 11010000

real	0m30.614s
user	0m30.370s
sys	0m0.210s

This machine is a Duron-800 with 1GB RAM. I've no idea why it performs
so poorly compared to the G4.

The compiler was gcc 2.95.3 with -march=i686 and -mcpu=i686; however, the
compiler didn't use the conditional move instructions from the higher
Pentium CPUs, which should have sped up the _norm case considerably, as
they make it possible to do the same without branches.

The source is attached, feel free to study it and provide faster code.
At the moment it is pretty clear that Mattias' code is pretty efficient
and compiles equally well with several compilers on several
architectures.

Servus,
       Daniel
#include <stdio.h>
#include <time.h>

#include <glib-1.2/glib.h>

/*
 * Benchmark buffers: one output array and two input arrays of
 * 20,000,000 packed 32-bit words each (3 * 80 MB = 240 MB in total).
 *
 * They have static storage duration and nothing in this file writes to
 * source1/source2, so every routine is timed on all-zero input.
 *
 * The GCC `aligned (16)` attribute puts each array on a 16-byte
 * boundary -- presumably for the AltiVec vec_ld/vec_st path; confirm
 * against the target's alignment requirements.
 */
static guint32 dest[20000000] __attribute__ ((aligned (16)));
static guint32 source1[20000000] __attribute__ ((aligned (16)));
static guint32 source2[20000000] __attribute__ ((aligned (16)));

/*
 * padd_sat_4x8:
 * @dest: receives the packed result
 * @pa:   first packed operand
 * @pb:   second packed operand
 *
 * Adds the four 8-bit fields packed in *pa and *pb field by field and
 * clamps every field to 255 on overflow.  Only whole-word arithmetic is
 * used: no branches and no per-byte memory access, so the result does
 * not depend on host byte order.
 *
 * Declared `static inline`: under C99 and later a plain `inline`
 * definition provides no external definition, so any call the compiler
 * chooses not to inline would fail at link time.
 */
static inline void
padd_sat_4x8 (guint32 *dest, guint32 *pa, guint32 *pb)
{
  guint32 a = *pa, b = *pb;
  guint32 ta, tb, tm, q, u, m;

  /* Isolate the top bit of every field; these are the bits whose
   * addition could carry into the neighbouring field. */
  ta = a & 0x80808080;
  tb = b & 0x80808080;

  /* Sum of the low 7 bits of every field.  Each partial sum is at most
   * 0xfe, so bit 7 of a field in q is the carry out of its low 7 bits
   * and nothing leaks into the next field. */
  q = a + b - (ta + tb);

  /* A field overflows when both top bits are set, or when one top bit
   * is set and the low 7 bits produced a carry. */
  tm = ta | tb;
  u = (ta & tb) | (q & tm);

  /* u holds 0x80 in every overflowing field; widen each of those bits
   * to a full 0xff byte: (0x80 << 1) - (0x80 >> 7) == 0x100 - 1. */
  m = (u << 1) - (u >> 7);

  /* q + tm - u adds the top bit back only into fields that did not
   * overflow (so no inter-field carry can occur); overflowing fields
   * are then forced to 0xff by the mask. */
  *dest = ((q + tm - u) | m);
}

/*
 * padd_sat_4x8_norm:
 * @dest: receives the packed result
 * @pa:   first packed operand
 * @pb:   second packed operand
 *
 * Straightforward reference version: reads the four bytes of each
 * operand individually, adds them in 16-bit precision and clamps each
 * sum to 255 with a conditional.  Bytes are read and written at the
 * same memory offsets, so the result word matches the word-wide
 * variants on any byte order.
 *
 * Declared `static inline`: under C99 and later a plain `inline`
 * definition provides no external definition, so any call the compiler
 * chooses not to inline would fail at link time.
 */
static inline void
padd_sat_4x8_norm (guint32 *dest, guint32 *pa, guint32 *pb)
{
  guint8 *newdest = (guint8 *) dest;
  guint16 dr, dg, db, da;

  /* Load all eight source bytes before writing anything, so the
   * routine also works when dest aliases pa or pb. */
  guint8 r1 = *((guint8 *) (pa) + 0);
  guint8 g1 = *((guint8 *) (pa) + 1);
  guint8 b1 = *((guint8 *) (pa) + 2);
  guint8 a1 = *((guint8 *) (pa) + 3);

  guint8 r2 = *((guint8 *) (pb) + 0);
  guint8 g2 = *((guint8 *) (pb) + 1);
  guint8 b2 = *((guint8 *) (pb) + 2);
  guint8 a2 = *((guint8 *) (pb) + 3);

  /* 16-bit sums: large enough to hold 255 + 255 without wrapping. */
  dr = r1 + r2;
  dg = g1 + g2;
  db = b1 + b2;
  da = a1 + a2;

  /* Clamp each channel to the 8-bit maximum. */
  newdest[0] = dr > 255 ? 255 : dr;
  newdest[1] = dg > 255 ? 255 : dg;
  newdest[2] = db > 255 ? 255 : db;
  newdest[3] = da > 255 ? 255 : da;
}

/*
 * padd_sat_4x8_and:
 * @dest: receives the packed result
 * @pa:   first packed operand
 * @pb:   second packed operand
 *
 * Branch-free variant that extracts each 8-bit field with shift and
 * mask, adds the fields, and saturates with an arithmetic mask instead
 * of a comparison.
 *
 * Fixes relative to the first version of this routine:
 *  - `x & 0xff + y & 0xff` parses as `x & (0xff + y) & 0xff` because
 *    `+` binds tighter than `&`; the fields were never actually summed
 *    and could never saturate.  Each operand is now parenthesized.
 *  - Fields were extracted by shift (value order) but stored through a
 *    byte pointer (memory order), which reversed the result on
 *    little-endian hosts.  The result is now assembled as a word with
 *    the same shifts, so it is byte-order independent.
 *  - Removed the unused `scratch` local.
 *  - `static inline`: a plain `inline` definition has no external
 *    definition under C99 and later and can fail to link.
 */
static inline void
padd_sat_4x8_and (guint32 *dest, guint32 *pa, guint32 *pb)
{
  guint32 s1 = *pa, s2 = *pb;
  guint32 sum, result = 0;
  int shift;

  for (shift = 0; shift < 32; shift += 8)
    {
      sum = ((s1 >> shift) & 0xff) + ((s2 >> shift) & 0xff);

      /* sum is at most 0x1fe, so (sum >> 8) is the carry: 1 on
       * overflow, 0 otherwise.  Unsigned 0 - carry is all-ones or
       * zero, which forces the field to 0xff exactly when it
       * overflowed. */
      sum |= (guint32) 0 - (sum >> 8);

      result |= (sum & 0xff) << shift;
    }

  *dest = result;
}

#ifdef __VEC__
/*
 * padd_sat_4x8_vec:
 * @dest: receives the packed results
 * @pa:   first block of packed operands
 * @pb:   second block of packed operands
 *
 * AltiVec variant: loads one `vector unsigned char` from each operand,
 * adds them with the saturating vec_adds, and stores the result.  One
 * call therefore handles a whole vector's worth of words, which is why
 * the caller in main() advances by 4 elements per call.
 *
 * NOTE(review): vec_ld/vec_st are used with offset 0 on the raw
 * pointers, so the pointers are assumed to be 16-byte aligned --
 * confirm for any caller other than the aligned benchmark arrays.
 *
 * The locals are named va/vb/vsum so they no longer shadow the
 * file-scope source1/source2 arrays.  `static inline` is used because a
 * plain `inline` definition has no external definition under C99 and
 * later and can fail to link.
 */
static inline void
padd_sat_4x8_vec (guint32 *dest, guint32 *pa, guint32 *pb)
{
  vector unsigned char va, vb, vsum;

  va = vec_ld (0, (unsigned char *) pa);
  vb = vec_ld (0, (unsigned char *) pb);
  vsum = vec_adds (va, vb);
  vec_st (vsum, 0, (unsigned char *) dest);
}
#endif

/* Number of elements in each benchmark array, derived from the array
 * itself so it cannot drift from the declaration. */
#define BENCH_ELEMENTS (sizeof dest / sizeof dest[0])

/* How many times each routine sweeps the whole array. */
#define BENCH_PASSES 10

/*
 * RUN_BENCHMARK:
 * @fn:   routine to time; called as fn (dest + i, source1 + i, source2 + i)
 * @step: number of array elements consumed per call
 *
 * Sweeps the global arrays BENCH_PASSES times with @fn and prints the
 * elapsed clock() ticks.  This is a macro rather than a function taking
 * a function pointer so that each routine can still be inlined into its
 * own loop, which is what is being measured.
 */
#define RUN_BENCHMARK(fn, step)                                         \
  do                                                                    \
    {                                                                   \
      size_t bench_i_;                                                  \
      int bench_pass_;                                                  \
      clock_t bench_start_ = clock ();                                  \
                                                                        \
      for (bench_pass_ = 0; bench_pass_ < BENCH_PASSES; bench_pass_++)  \
        {                                                               \
          for (bench_i_ = 0; bench_i_ < BENCH_ELEMENTS;                 \
               bench_i_ += (step))                                      \
            {                                                           \
              fn (dest + bench_i_, source1 + bench_i_,                  \
                  source2 + bench_i_);                                  \
            }                                                           \
        }                                                               \
                                                                        \
      /* clock_t may be wider than int; print it as long. */            \
      printf ("Time needed for " #fn " in clocks: %ld\n",               \
              (long) (clock () - bench_start_));                        \
    }                                                                   \
  while (0)

/*
 * Times every saturating-add variant over the global arrays and prints
 * one line per variant.  Returns 0.
 *
 * NOTE(review): the source arrays are never filled in this file, so all
 * variants are timed on all-zero input (no field ever saturates).
 */
int
main (void)
{
  RUN_BENCHMARK (padd_sat_4x8, 1);
  RUN_BENCHMARK (padd_sat_4x8_and, 1);
  RUN_BENCHMARK (padd_sat_4x8_norm, 1);

#ifdef __VEC__
  /* The vector routine consumes four words per call. */
  RUN_BENCHMARK (padd_sat_4x8_vec, 4);
#endif

  return 0;
}