Ian wrote:
For mmintrin.h functions, use __m64, not __v2si. Why the memcpy? Use _mm_set_pi32(is[i], is[i + 1]). Don't extract the values by taking the address of q. Instead do something like this: union { long ai[2]; __m64 m } u; u.m = q; return u.ai[0] + u.ai[1]; Ian
followed this and now the mmintrin version runs even slower! my RUNME.sh stays the same, and my v.c is now: #include <string.h> #include <stdio.h> #include <stdlib.h> #include <assert.h> #include <mmintrin.h> typedef long I;typedef unsigned long J; typedef char C; #define IZ sizeof(I) #define W 2 simd_mmintrin(n, is) I *is; { __m64 q,r; I i; _m_empty(); q=_m_from_int(0); for (i=0; i < n; i+=W) { r=_mm_set_pi32(is[i],is[i+1]); q=_m_paddd(q,r); } union {long a[2];__m64 m;}u; u.m=q; return u.a[0]+u.a[1]; } simd_asm(n, is) I *is; { I i,*r=malloc(IZ*W*8); asm("emms"); asm("pxor %mm0,%mm0"); for (i=0; i < n; i+=W) { asm("movq %0,%%mm1\n\t" "paddd %%mm1,%%mm0" : :"m"(is[i]) ); } asm("movq %%mm0,%0":"=m"(*(__m64*)r)); return r[0]+r[1]; } sisd(n, is) I *is; { I i = 0, j = 0; for (i = 0; i < n; i++) j += is[i]; return j; } main(c, v) C **v; { I n=atol(v[1]), z=atol(v[2]), m=atol(v[3]); I result, *is=malloc(IZ*(z*=2)), i; int(*fs[])()={sisd,simd_asm,simd_mmintrin,0}; C*ss[]={"C (SISD)","ASM (SIMD)","MMINTRIN (SIMD)"}; for(i=0;i<z;i++)is[i]=i; printf("\n\n---\nexpect: %d\n",(z)*(z-1)/2); printf("impl: %s\n",ss[m]); while (n--) result=fs[m](z, is); printf("%d\n",result); }