This is probably a common query, but I haven't managed to find any hints about what I'm doing wrong.
I'm trying to use the SIMD extensions to accelerate array arithmetic. My test code is below. I'm running gcc-3.3.3 on a 3 GHz Pentium 4 under Fedora Core 2.
My problem is that the SIMD code seems to run slower than the optimized standard code. In fact, if I turn on both optimization (-O2) and the CPU flag (-mcpu=pentium4), I get a huge slowdown.
I can confirm with objdump that faddp instructions are being generated at least some of the time.
I've experimented with a few different compilers (only stable versions) but not achieved any consistent speed up.
I'd have thought that this was the simplest example to accelerate.
Am I doing something obviously wrong at the C level? Is there a particular compiler version that is known to do this sort of thing well?
I would appreciate any advice.
Here is the log of some test runs:
============================================================
Standardized arithmetic
19.41user 0.01system 0:19.44elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs (0major+60minor)pagefaults 0swaps
Standard with optimization
cc -DDONORMAL -O2 -c -o vectrials.o vectrials.c
cc -static vectrials.o -o vectrials
Standardized arithmetic
5.48user 0.00system 0:05.49elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs (0major+60minor)pagefaults 0swaps
----------------------------
Vectorized without optimization
cc -DDOVEC -mcpu=pentium4 -c -o vectrials.o vectrials.c
cc -static vectrials.o -o vectrials
Vectorized arithmetic
9.02user 0.00system 0:09.03elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs (0major+60minor)pagefaults 0swaps
Vectorized with optimization
cc -DDOVEC -O2 -mcpu=pentium4 -c -o vectrials.o vectrials.c
cc -static vectrials.o -o vectrials
Vectorized arithmetic
35.89user 0.03system 0:36.17elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs (0major+58minor)pagefaults 0swaps
============================================================
And here is the code:
#define _XOPEN_SOURCE 600 #include <errno.h> #include <stdlib.h> #include <stdio.h>
/* Number of myvec elements held in each benchmark buffer. */
#define LEN 1000

/* Scalar element type used by the non-vectorised code path. */
#define THISTYPE float

/*
 * 16-byte SIMD vector whose lanes are THISTYPE (four floats).
 *
 * This was previously spelled `typedef int myvec __attribute__((mode(V4SF)))`.
 * Attaching a float vector mode to an `int` base type is the obsolete way of
 * requesting a vector type; the vector_size attribute on the real element
 * type is the documented replacement and yields the same 16-byte layout.
 */
typedef THISTYPE myvec __attribute__ ((vector_size (16)));

/* Number of THISTYPE lanes in one myvec. */
#define myvecSize (sizeof(myvec)/sizeof(THISTYPE))
/**********************************************/
/*
 * Allocate `size` bytes via posix_memalign(), aligned to 2*sizeof(double)
 * bytes (intended to give the 16-byte alignment the vector type needs).
 *
 * Returns the allocated pointer. If posix_memalign() reports EINVAL or
 * ENOMEM, a diagnostic is written to stderr and NULL is returned.
 */
void *myalloc(size_t size)
{
    void *block = NULL;
    const int rc = posix_memalign(&block, 2 * sizeof(double), size);

    if (rc == EINVAL) {
        fprintf(stderr, "Alignment parameter no good\n");
        return NULL;
    }
    if (rc == ENOMEM) {
        fprintf(stderr, "Insufficient memory\n");
        return NULL;
    }
    return block;
}
/**********************************************/
/*
 * Vector-typed addition kernel: out[k] = in1[k] + in2[k] for k in [0, len).
 * `len` counts myvec elements, not individual scalars. Elements are
 * processed in ascending index order.
 */
void f1(myvec *in1, myvec *in2, myvec *out, int len)
{
    int k = 0;

    while (k < len) {
        out[k] = in1[k] + in2[k];
        ++k;
    }
}
/**********************************************/
/*
 * Scalar addition kernel: out[k] = in1[k] + in2[k] for k in [0, len).
 * `len` counts THISTYPE elements. Elements are processed in ascending
 * order; a non-positive `len` does nothing.
 */
void f2(THISTYPE *in1, THISTYPE *in2, THISTYPE *out, int len)
{
    THISTYPE *lhs = in1;
    THISTYPE *rhs = in2;
    THISTYPE *dst = out;
    int remaining;

    for (remaining = len; remaining > 0; --remaining) {
        *dst = *lhs + *rhs;
        ++dst;
        ++lhs;
        ++rhs;
    }
}
/**********************************************/
/*
 * Fill the two input arrays with constant test data: every one of the
 * first `len` elements of I1 becomes 34.0 and of I2 becomes 354.0.
 */
void init(THISTYPE *I1, THISTYPE *I2, int len)
{
    int k = 0;

    while (k < len) {
        I1[k] = 34.0;
        I2[k] = 354.0;
        ++k;
    }
}
/*
 * Report the first and last of the `len` values in OO on stderr.
 * OO[len-1] is read unconditionally, so `len` must be at least 1.
 */
void check(THISTYPE *OO, int len)
{
    const THISTYPE head = OO[0];
    const THISTYPE tail = OO[len - 1];

    fprintf(stderr, "First=%f, Last=%f\n", head, tail);
}
#define TESTS 1000000
int main() { myvec *input1, *input2, *output; THISTYPE *I1, *I2, *OO; int tt;
/* fprintf(stderr, "(%d, %d, %d)\n", sizeof(double), sizeof(void *), sizeof(myvec)); */
input1 = (myvec *)myalloc(LEN * sizeof(myvec)); input2 = (myvec *)myalloc(LEN * sizeof(myvec)); output = (myvec *)myalloc(LEN * sizeof(myvec));
I1 = (THISTYPE *)input1; I2 = (THISTYPE *)input2; OO = (THISTYPE *)output;
init(I1, I2, LEN*sizeof(myvec)/sizeof(THISTYPE));
#ifdef DOVEC /* the vectorized one */ fprintf(stderr, "Vectorized arithmetic\n"); for (tt=0;tt<TESTS;tt++) { f1(input1, input2, output, LEN); } #endif
#ifdef DONORMAL fprintf(stderr, "Standardized arithmetic\n"); for (tt=0;tt<TESTS;tt++) { f2(I1, I2, OO, LEN * sizeof(myvec)/sizeof(THISTYPE)); } #endif check(OO, LEN * sizeof(myvec)/sizeof(THISTYPE)); return 0; }
-- Richard Beare, CSIRO Mathematical & Information Sciences Locked Bag 17, North Ryde, NSW 1670, Australia Phone: +61-2-93253221 (GMT+~10hrs) Fax: +61-2-93253200
Richard.Beare@xxxxxxxx