Hi - I think this might be a little simpler: void test_add_long1(long * result, long * a, long * b, long size) { __m128i *A = (__m128i*)a; __m128i *B = (__m128i*)b; __m128i *end = A + size/2; __m128i *R = (__m128i*)result; for(; A < end; ++A, ++B, ++R) { *R = _mm_add_epi64(*A, *B); } } though I believe if you let the compiler know about the alignment of result, a, and b, it will properly optimize this: void test_add_long2(long * result, long * a, long * b, long size) { long *end = a + size; for(; a < end; ++a, ++b, ++result) { *result = *a + *b; } } Brian On Thu, Mar 13, 2008 at 5:12 PM, JP Fournier <jape41@xxxxxxxxx> wrote: > Brian Dessent wrote: > > > > You're violating the C aliasing rules. You can't store through a casted > > pointer like that. You also don't have to do the load/store, the > > compiler knows what you want when you use a union instead: > > > > union { __m128i v; long l[2]; } a, b, c; > > > > a.l[0] = a.l[1] = 1; > > b.l[0] = b.l[1] = 1; > > > > c.v = _mm_add_epi8 (a.v, b.v); > > printf("c0=%ld c1=%ld\n", c.l[0], c.l[1]); > > Many Thanks Brian. My little program now behaves better: > > bash-3.1$ gcc -O2 -msse2 -o sse2 sse2-1.c > > bash-3.1$ ./sse2 > c0=2 c1=2 > > > #include <stdio.h> > #include <emmintrin.h> > > void test_int() { > > union { __m128i v; long l[2]; } a, b, c; > > a.l[0] = a.l[1] = 1; > b.l[0] = b.l[1] = 1; > c.l[0] = c.l[1] = 0; > > c.v = _mm_add_epi8( a.v, b.v ); > printf("c0=%ld c1=%ld\n", c.l[0], c.l[1] ); > } > > int main( int count, char ** args ) { > test_int(); > return 0; > } > > > > > > > There's an even more natural way to do this though using gcc's built-in > > vector extensions without any of the Intel mmintrin.h stuff. 
This way > > will result in code that will vectorize to altivec, sse2, spu, whatever > > the machine supports, it's not hardware specific: > > > > typedef int v4si __attribute__ ((vector_size (16))); > > > > v4si a = { 1, 2, 3, 4 }, b = { 5, 6, 7, 8 }, c; > > > > c = a + b; > > > > You can use all the normal C operators like + and * as if they were > > scalars but they will be compiled using the corresponding SIMD > > instructions. See > > <http://gcc.gnu.org/onlinedocs/gcc/Vector-Extensions.html> for more. If > > you want access to the individual parts you can again use the union, > > My thinking is that I'd like to try to be compiler independent, so by using > the intel intrinsics I figure I should be able to get gcc and the intel > compiler to work as a start. > > What I am _really_ trying to do is implement the addition of > elements of two arrays. > > Is there a more efficient way of doing this than this way?: > > > > #include <stdio.h> > #include <emmintrin.h> > > void test_add_long(long * result, long * a, long * b, long size) { > union { __m128i v; long l[2]; } temp1, temp2, temp3; > int index=0; > > for( index=0; index < size; index+=2 ) { > temp1.l[0] = a[index]; > temp1.l[1] = a[index+1]; > temp2.l[0] = b[index]; > temp2.l[1] = b[index + 1]; > > temp3.v = _mm_add_epi8( temp1.v, temp2.v ); > result[index] = temp3.l[0]; > result[index+1] = temp3.l[1]; > > printf("c0=%ld c1=%ld\n", result[index], result[index+1] ); > > } > } > > int main( int count, char ** args ) { > // array of 4 8 byte ints > long a[] = { 1, 2, 3, 4}; > long b[] = { 1, 2, 3, 4}; > long result[] = {0,0,0,0}; > > test_add_long(result, a, b, 4); > > return 0; > } > > > > > > >