Advice about using SIMD extensions

Richard Beare <Richard.Beare@xxxxxxxx> · Thu, 24 Feb 2005 11:19:04 +1100

Hi Everyone,

This is probably a common query, but I haven't managed to find any hints 
about what I'm doing wrong.

I'm trying to use the SIMD extensions to accelerate array arithmetic. My 
test code is below. I'm running gcc-3.3.3 on a pentium 4 3GHz, running 
Fedora Core 2.

My problem is that the SIMD code seems to be running slower than the 
optimized standard code. In fact, if I turn on both optimization and the 
CPU flag (-mcpu=pentium4), I get a huge slowdown.

I can confirm with objdump that faddp instructions are being generated 
at least some of the time.

I've experimented with a few different compilers (only stable versions) 
but not achieved any consistent speed up.

I'd have thought that this was the simplest example to accelerate.

Am I doing something obviously wrong at the C level? Is there a particular 
compiler version that is known to do this sort of thing well?

I would appreciate any advice.

Here is the log of some test runs:
============================================================
Standardized arithmetic

19.41user 0.01system 0:19.44elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+60minor)pagefaults 0swaps

Standard with optimization

cc -DDONORMAL -O2    -c -o vectrials.o vectrials.c
cc -static  vectrials.o   -o vectrials

Standardized arithmetic

5.48user 0.00system 0:05.49elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+60minor)pagefaults 0swaps
----------------------------
Vectorized without optimization

cc -DDOVEC -mcpu=pentium4    -c -o vectrials.o vectrials.c
cc -static  vectrials.o   -o vectrials

Vectorized arithmetic

9.02user 0.00system 0:09.03elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+60minor)pagefaults 0swaps

Vectorized with optimization

cc -DDOVEC -O2 -mcpu=pentium4    -c -o vectrials.o vectrials.c
cc -static  vectrials.o   -o vectrials

Vectorized arithmetic

35.89user 0.03system 0:36.17elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+58minor)pagefaults 0swaps
============================================================
And here is the code

#define _XOPEN_SOURCE 600
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>

#define LEN 1000

#define THISTYPE float

/* typedef v8qi myvec; */
typedef int myvec __attribute__ ((mode(V4SF)));

#define myvecSize (sizeof(myvec)/sizeof(THISTYPE))

/**********************************************/

/*
 * Allocate `size` bytes with posix_memalign(), aligned to
 * 2*sizeof(double) bytes.
 *
 * Returns the allocated block on success.  On any failure a diagnostic is
 * written to stderr and NULL is returned.  Blocks obtained from
 * posix_memalign() are released with free().
 */
void * myalloc(size_t size)
{
  /* alignment should be on 16 byte boundaries (sizeof(double) == 8)! */
  const size_t align=2*sizeof(double);
  void *res=NULL;
  int status;

  status = posix_memalign(&res, align, size);
  if (status == 0) {
    return res;
  }

  /* Every non-zero status is a failure; never hand back `res` in that case. */
  switch (status) {
  case EINVAL:
    fprintf(stderr, "Alignment parameter no good\n");
    break;
  case ENOMEM:
    fprintf(stderr, "Insufficient memory\n");
    break;
  default:
    fprintf(stderr, "posix_memalign failed (status %d)\n", status);
    break;
  }
  return NULL;
}

/**********************************************/

/*
 * Vector-typed addition: stores in1[k] + in2[k] into out[k] for each of
 * the first `len` myvec elements, in ascending order.  Does nothing when
 * len <= 0.
 */
void f1(myvec *in1,myvec *in2, myvec *out, int len)
{
  myvec *lhs = in1;
  myvec *rhs = in2;
  myvec *dst = out;
  int remaining;

  /* Walk the three arrays in lock-step, one myvec per iteration. */
  for (remaining = len; remaining > 0; remaining--) {
    *dst++ = *lhs++ + *rhs++;
  }
}

/**********************************************/

/*
 * Scalar reference version of the addition: stores in1[k] + in2[k] into
 * out[k] for each of the first `len` THISTYPE elements, in ascending
 * order.  Does nothing when len <= 0.
 */
void f2(THISTYPE *in1, THISTYPE *in2, THISTYPE *out, int len)
{
  int idx = 0;

  while (idx < len) {
    out[idx] = in1[idx] + in2[idx];
    ++idx;
  }
}

/**********************************************/
/*
 * Fill the first `len` elements of I1 with 34.0 and of I2 with 354.0.
 * Does nothing when len <= 0.
 */
void init(THISTYPE *I1, THISTYPE *I2, int len)
{
  const THISTYPE first_fill = 34.0;
  const THISTYPE second_fill = 354.0;
  int k = 0;

  while (k < len) {
    I1[k] = first_fill;
    I2[k] = second_fill;
    ++k;
  }
}

/*
 * Print the first and last of the `len` elements of OO to stderr.
 * Reads OO[0] and OO[len-1], so the caller must pass len >= 1.
 */
void check(THISTYPE *OO, int len)
{
  const THISTYPE first = OO[0];
  const THISTYPE last = OO[len-1];

  fprintf(stderr, "First=%f, Last=%f\n", first, last);
}

#define TESTS 1000000

/*
 * Benchmark driver.
 *
 * Allocates three aligned buffers of LEN myvec elements, fills the two
 * inputs through their THISTYPE views, then runs the addition TESTS times:
 * with f1 when DOVEC is defined and/or with f2 when DONORMAL is defined.
 * Finally prints the first and last output element.
 *
 * Returns 0 on success, EXIT_FAILURE if any allocation fails.
 *
 * NOTE: if neither DOVEC nor DONORMAL is defined, `output` is never
 * written and check() prints whatever the allocator returned.
 */
int main(void)
{
  myvec *input1, *input2, *output;
  THISTYPE *I1, *I2, *OO;
  int tt;
  int rc = EXIT_FAILURE;

  input1 = (myvec *)myalloc(LEN * sizeof(myvec));
  input2 = (myvec *)myalloc(LEN * sizeof(myvec));
  output = (myvec *)myalloc(LEN * sizeof(myvec));

  /* myalloc has already reported the reason on stderr. */
  if (input1 == NULL || input2 == NULL || output == NULL) {
    goto done;
  }

  /* Scalar views of the same memory, used by init/f2/check. */
  I1 = (THISTYPE *)input1;
  I2 = (THISTYPE *)input2;
  OO = (THISTYPE *)output;

  init(I1, I2, LEN*sizeof(myvec)/sizeof(THISTYPE));

#ifdef DOVEC
  /* the vectorized one */
  fprintf(stderr, "Vectorized arithmetic\n");
  for (tt=0;tt<TESTS;tt++) {
    f1(input1, input2, output, LEN);
  }
#endif

#ifdef DONORMAL
  fprintf(stderr, "Standardized arithmetic\n");
  for (tt=0;tt<TESTS;tt++) {
    f2(I1, I2, OO, LEN * sizeof(myvec)/sizeof(THISTYPE));
  }
#endif
  check(OO, LEN * sizeof(myvec)/sizeof(THISTYPE));
  rc = 0;

done:
  /* free(NULL) is a no-op, so this is safe after a partial allocation. */
  free(output);
  free(input2);
  free(input1);
  return rc;
}

--
Richard Beare, CSIRO Mathematical & Information Sciences
Locked Bag 17, North Ryde, NSW 1670, Australia
Phone: +61-2-93253221 (GMT+~10hrs)  Fax: +61-2-93253200

Richard.Beare@xxxxxxxx