Re: inline asm mmx: how to movq from memory to %mmX

"Jack Andrews" <effbiae@xxxxxxxxx> · Sat, 21 Apr 2007 19:15:14 +1000

Ian wrote:
For mmintrin.h functions, use __m64, not __v2si.

Why the memcpy?  Use _mm_set_pi32(is[i], is[i + 1]).

Don't extract the values by taking the address of q.  Instead do
something like this:
   union { long ai[2]; __m64 m } u;
   u.m = q;
   return u.ai[0] + u.ai[1];

Ian

followed this and now the mmintrin version runs even slower!

my RUNME.sh stays the same, and my v.c is now:

#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <mmintrin.h>

typedef long I;typedef unsigned long J;
typedef char C;
#define IZ sizeof(I)
#define W 2

simd_mmintrin(n, is)
I *is;
{   __m64 q,r;
   I i;
   _m_empty();
   q=_m_from_int(0);
   for (i=0; i < n; i+=W) {
       r=_mm_set_pi32(is[i],is[i+1]);
       q=_m_paddd(q,r);
   }
   union {long a[2];__m64 m;}u;
   u.m=q;
   return u.a[0]+u.a[1];
}

simd_asm(n, is)
I *is;
{   I i,*r=malloc(IZ*W*8);
   asm("emms");
   asm("pxor %mm0,%mm0");
   for (i=0; i < n; i+=W) {
       asm("movq %0,%%mm1\n\t"
           "paddd %%mm1,%%mm0"
           :
           :"m"(is[i])           );
   }
   asm("movq %%mm0,%0":"=m"(*(__m64*)r));
   return r[0]+r[1];
}

sisd(n, is)
I *is;
{
   I i = 0, j = 0;
   for (i = 0; i < n; i++)
	j += is[i];
   return j;
}

main(c, v)
C **v;
{
   I n=atol(v[1]), z=atol(v[2]), m=atol(v[3]);
   I result, *is=malloc(IZ*(z*=2)), i;
   int(*fs[])()={sisd,simd_asm,simd_mmintrin,0};
   C*ss[]={"C (SISD)","ASM (SIMD)","MMINTRIN (SIMD)"};
   for(i=0;i<z;i++)is[i]=i;
   printf("\n\n---\nexpect: %d\n",(z)*(z-1)/2);
   printf("impl: %s\n",ss[m]);
   while (n--)
	result=fs[m](z, is);
   printf("%d\n",result);
}