<sent to ian taylor in error>:
You are missing commas between the operands. movq %0, %%mm1
thanks for that.
Instead of using inline assembler I recommend that you use the functions in mmintrin.h.
i've written a program (appended below) that sums a list of 32 bit integers. there are 3 functions that do exactly the same thing (sum the list) and they get very different timings. i have a script RUNME.sh: $ sh RUNME.sh --- expect: 199990000 impl: C (SISD) 199990000 real 0m0.604s user 0m0.580s sys 0m0.004s --- expect: 199990000 impl: ASM (SIMD) 199990000 real 0m0.377s user 0m0.360s sys 0m0.008s --- expect: 199990000 impl: MMINTRIN (SIMD) 199990000 real 0m1.235s user 0m1.228s sys 0m0.004s so mmintr functions are slower than plain C and my assembly function is faster than the others. am i using mmintr correctly? jack $ cat RUNME.sh #!/bin/sh repeats=4000 # number of times to repeat the test vectorsize=10000 # size of the vector in 32 bit ints gcc -O -mmmx v.c -o v for which in 0 1 2; do time ./v $repeats $vectorsize $which; done $ cat v.c #include <string.h> #include <stdio.h> #include <stdlib.h> #include <assert.h> #include <mmintrin.h> typedef long I;typedef unsigned long J; typedef char C; #define IZ sizeof(I) #define W 2 simd_mmintrin(n, is) I *is; { __v2si q,r; I i; _m_empty(); q=_m_from_int(0); for (i=0; i < n; i+=W) { memcpy(&r,is+i,IZ*W); q=_m_paddd(q,r); } I*qq=(I*)&q; return qq[0]+qq[1]; } simd_asm(n, is) I *is; { I i,*r=malloc(IZ*W*8); asm("emms"); asm("pxor %mm0,%mm0"); for (i=0; i < n; i+=W) { asm("movq %0,%%mm1\n\t" "paddd %%mm1,%%mm0" : :"m"(is[i]) ); } asm("movq %%mm0,%0":"=m"(*(__m64*)r)); return r[0]+r[1]; } sisd(n, is) I *is; { I i = 0, j = 0; for (i = 0; i < n; i++) j += is[i]; return j; } main(c, v) C **v; { I n=atol(v[1]), z=atol(v[2]), m=atol(v[3]); I result, *is=malloc(IZ*(z*=2)), i; int(*fs[])()={sisd,simd_asm,simd_mmintrin,0}; C*ss[]={"C (SISD)","ASM (SIMD)","MMINTRIN (SIMD)"}; for(i=0;i<z;i++)is[i]=i; printf("\n\n---\nexpect: %d\n",(z)*(z-1)/2); printf("impl: %s\n",ss[m]); while (n--) result=fs[m](z, is); printf("%d\n",result); }