Re: inline asm mmx: how to movq from memory to %mmX

"Jack Andrews" <effbiae@xxxxxxxxx> · Fri, 20 Apr 2007 07:50:21 +1000

<sent to ian taylor in error>:

You are missing commas between the operands.
    movq %0, %%mm1

thanks for that.

Instead of using inline assembler I recommend that you use the
functions in mmintrin.h.

i've written a program (appended below) that sums a list of 32 bit
integers.  there are 3 functions that do exactly the same thing (sum
the list) and they get very different timings.

i have a script RUNME.sh:

$ sh RUNME.sh
---
expect: 199990000
impl: C (SISD)
199990000

real    0m0.604s
user    0m0.580s
sys     0m0.004s

---
expect: 199990000
impl: ASM (SIMD)
199990000

real    0m0.377s
user    0m0.360s
sys     0m0.008s

---
expect: 199990000
impl: MMINTRIN (SIMD)
199990000

real    0m1.235s
user    0m1.228s
sys     0m0.004s

so mmintr functions are slower than plain C and my assembly function
is faster than the others.  am i using mmintr correctly?

jack

$ cat RUNME.sh
#!/bin/sh
repeats=4000        # number of times to repeat the test
vectorsize=10000   # size of the vector in 32 bit ints
gcc -O -mmmx v.c -o v
for which in 0 1 2; do time ./v $repeats $vectorsize $which; done

$ cat v.c
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <mmintrin.h>

typedef long I;typedef unsigned long J;
typedef char C;
#define IZ sizeof(I)
#define W 2

simd_mmintrin(n, is)
I *is;
{   __v2si q,r;
  I i;
  _m_empty();
  q=_m_from_int(0);
  for (i=0; i < n; i+=W) {
      memcpy(&r,is+i,IZ*W);
      q=_m_paddd(q,r);
  }
  I*qq=(I*)&q;
  return qq[0]+qq[1];
}

simd_asm(n, is)
I *is;
{   I i,*r=malloc(IZ*W*8);
  asm("emms");
  asm("pxor %mm0,%mm0");
  for (i=0; i < n; i+=W) {
      asm("movq %0,%%mm1\n\t"
          "paddd %%mm1,%%mm0"
          :
          :"m"(is[i])           );
  }
  asm("movq %%mm0,%0":"=m"(*(__m64*)r));
  return r[0]+r[1];
}

sisd(n, is)
I *is;
{
  I i = 0, j = 0;
  for (i = 0; i < n; i++)
      j += is[i];
  return j;
}

main(c, v)
C **v;
{
  I n=atol(v[1]), z=atol(v[2]), m=atol(v[3]);
  I result, *is=malloc(IZ*(z*=2)), i;
  int(*fs[])()={sisd,simd_asm,simd_mmintrin,0};
  C*ss[]={"C (SISD)","ASM (SIMD)","MMINTRIN (SIMD)"};
  for(i=0;i<z;i++)is[i]=i;
  printf("\n\n---\nexpect: %d\n",(z)*(z-1)/2);
  printf("impl: %s\n",ss[m]);
  while (n--)
      result=fs[m](z, is);
  printf("%d\n",result);
}