Hi Richard - With this kind of example you should definitely get about a fourfold speedup. One of your issues may be that gcc doesn't seem (I haven't confirmed this with anyone) to perform instruction scheduling well on vector types. I have also seen similar slowdowns with xmmintrin.h code when I code things naively. My advice: try writing the code out longhand using the xmm intrinsics, interleaving loads and arithmetic, and see if you get a speedup. Can anyone confirm whether gcc does sub-optimal instruction scheduling for vector types? Brian On Thu, 24 Feb 2005 11:19:04 +1100, Richard Beare <Richard.Beare@xxxxxxxx> wrote: > Hi Everyone, > This is probably a common query, but I haven't managed to find any hints > about what I'm doing wrong. > > I'm trying to use the SIMD extensions to accelerate array arithmetic. My > test code is below. I'm running gcc-3.3.3 on a pentium 4 3GHz, running > Fedora Core 2. > > My problem is that the SIMD code seems to be running slower than the > optimized standard code. In fact if I turn on the optimization and cpu > flag then I get a huge slowdown. > I can confirm with objdump that faddp instructions are being generated > at least some of the time. > > I've experimented with a few different compilers (only stable versions) > but not achieved any consistent speed up. > > I'd have thought that this was the simplest example to accelerate. > > Am I doing something obvious wrong at the C level? Is there a particular > compiler version that is known to do this sort of thing well? > > I would appreciate any advice. 
> > Here is the log of some test runs: > ============================================================ > Standardized arithmetic > > 19.41user 0.01system 0:19.44elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k > 0inputs+0outputs (0major+60minor)pagefaults 0swaps > > Standard with optimization > > cc -DDONORMAL -O2 -c -o vectrials.o vectrials.c > cc -static vectrials.o -o vectrials > > Standardized arithmetic > > 5.48user 0.00system 0:05.49elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k > 0inputs+0outputs (0major+60minor)pagefaults 0swaps > ---------------------------- > Vectorized without optimization > > cc -DDOVEC -mcpu=pentium4 -c -o vectrials.o vectrials.c > cc -static vectrials.o -o vectrials > > Vectorized arithmetic > > 9.02user 0.00system 0:09.03elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k > 0inputs+0outputs (0major+60minor)pagefaults 0swaps > > Vectorized with optimization > > cc -DDOVEC -O2 -mcpu=pentium4 -c -o vectrials.o vectrials.c > cc -static vectrials.o -o vectrials > > Vectorized arithmetic > > 35.89user 0.03system 0:36.17elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k > 0inputs+0outputs (0major+58minor)pagefaults 0swaps > ============================================================ > And here is the code > > #define _XOPEN_SOURCE 600 > #include <errno.h> > #include <stdlib.h> > #include <stdio.h> > > #define LEN 1000 > > #define THISTYPE float > > /* typedef v8qi myvec; */ > typedef int myvec __attribute__ ((mode(V4SF))); > > #define myvecSize (sizeof(myvec)/sizeof(THISTYPE)) > > /**********************************************/ > > void * myalloc(size_t size) > { > /* alignement should be on 16 byte boundaries! 
*/ > const size_t align=2*sizeof(double); > void *res=NULL; > int status; > > status = posix_memalign(&res, align, size); > switch (status) { > case EINVAL: > fprintf(stderr, "Alignment parameter no good\n"); > return NULL; > break; > case ENOMEM: > fprintf(stderr, "Insufficient memory\n"); > return NULL; > default: > return res; > } > } > > /**********************************************/ > > void f1(myvec *in1,myvec *in2, myvec *out, int len) > { > int i; > /* fprintf(stderr, "Vectorised length =%d\n", len); */ > > for (i=0;i<len;i++) { > out[i] = in1[i] + in2[i]; > } > > } > > /**********************************************/ > > void f2(THISTYPE *in1, THISTYPE *in2, THISTYPE *out, int len) > { > int i; > /* fprintf(stderr, "Standard length =%d\n", len); */ > for (i=0;i<len;i++) { > out[i] = in1[i] + in2[i]; > } > } > > /**********************************************/ > void init(THISTYPE *I1, THISTYPE *I2, int len) > { > int i; > > for (i=0;i<len;i++) { > I1[i] = 34.0; > I2[i] = 354.0; > } > > } > > void check(THISTYPE *OO, int len) > { > fprintf(stderr, "First=%f, Last=%f\n", OO[0], OO[len-1]); > } > > #define TESTS 1000000 > > int main() > { > myvec *input1, *input2, *output; > THISTYPE *I1, *I2, *OO; > int tt; > > /* fprintf(stderr, "(%d, %d, %d)\n", sizeof(double), sizeof(void *), > sizeof(myvec)); */ > > input1 = (myvec *)myalloc(LEN * sizeof(myvec)); > input2 = (myvec *)myalloc(LEN * sizeof(myvec)); > output = (myvec *)myalloc(LEN * sizeof(myvec)); > > I1 = (THISTYPE *)input1; > I2 = (THISTYPE *)input2; > OO = (THISTYPE *)output; > > init(I1, I2, LEN*sizeof(myvec)/sizeof(THISTYPE)); > > #ifdef DOVEC > /* the vectorized one */ > fprintf(stderr, "Vectorized arithmetic\n"); > for (tt=0;tt<TESTS;tt++) { > f1(input1, input2, output, LEN); > } > #endif > > #ifdef DONORMAL > fprintf(stderr, "Standardized arithmetic\n"); > for (tt=0;tt<TESTS;tt++) { > f2(I1, I2, OO, LEN * sizeof(myvec)/sizeof(THISTYPE)); > } > #endif > check(OO, LEN * 
sizeof(myvec)/sizeof(THISTYPE)); > return 0; > } > > -- > Richard Beare, CSIRO Mathematical & Information Sciences > Locked Bag 17, North Ryde, NSW 1670, Australia > Phone: +61-2-93253221 (GMT+~10hrs) Fax: +61-2-93253200 > > Richard.Beare@xxxxxxxx >