I have attached a short test project that demonstrates what I am doing. I time this simply with the time command, i.e.

$ time ./mul_SSE 100000000
real 0m1.037s
user 0m1.036s
sys 0m0.001s

$ time ./mul_SSE4_1 100000000
real 0m2.006s
user 0m2.003s
sys 0m0.002s

Note that I have prepared the A matrix for SSE a little bit by "dilating" the elements into A = { A11, A11, A11, A11, A12, A12, ... }, while for SSE4.1 I am calling the multiply with the transpose of B. As these matrices are really small, they should be completely in L1, so the movaps operation should have pretty low latency. Since the SSE version uses 4 times more data for A than the SSE4.1 version, I am surprised that, despite the larger number of data movements, the SSE version still beats the SSE4.1 version. But maybe I am just not coding this very intelligently.

Any suggestions would be very welcome,

Thanks already,

nick

On 03/12/11 01:20, Frederic Marmond wrote:
> Hello Nicolas,
>
> Yes, it's the right place :)
> could you please paste your code as well as your benchmark context ?
>
> Fred
>
> 2011/3/11 Nicolas Bock <nicolasbock@xxxxxxxxx
> <mailto:nicolasbock@xxxxxxxxx>>
>
> Hello list,
>
> I am writing an assembly function that multiplies 2 4x4 single precision
> matrices. I wrote 2 versions, one using SSE the other using SSE4.1. What
> surprised me is that the SSE4.1 version fails to beat the SSE version,
> it is in fact slightly slower.
>
> Is this the right place to ask for help? If anyone is interested I can
> post some code which would maybe clarify the situation a bit.
>
> If this is not the right place, please ignore me...
>
> nick
>
>
# Builds the two benchmark executables:
#   mul_SSE     - main.c compiled with -DSSE    + matrix_multiply_SSE.S
#   mul_SSE4_1  - main.c compiled with -DSSE4_1 + matrix_multiply_SSE4_1.S

CC = gcc

# Debug build: swap the two CFLAGS lines.
#CFLAGS = -O0 -g
CFLAGS = -O2 -ffast-math

PROGRAMS = mul_SSE mul_SSE4_1

# Neither "all" nor "clean" names a file, so both are phony; otherwise a stray
# file called "all" or "clean" would silently disable the target.
.PHONY: all clean

all : $(PROGRAMS)

mul_SSE : main_SSE.o matrix_multiply_SSE.o
	$(CC) -o $@ $^

mul_SSE4_1 : main_SSE4_1.o matrix_multiply_SSE4_1.o
	$(CC) -o $@ $^

# Remove the objects and the linked executables so the next build starts clean.
clean:
	rm -f *.o $(PROGRAMS)

# main.c is compiled twice, once per kernel, selected by a preprocessor macro.
main_SSE.o : main.c
	$(CC) $(CFLAGS) -DSSE -c -o $@ $<

main_SSE4_1.o : main.c
	$(CC) $(CFLAGS) -DSSE4_1 -c -o $@ $<

%.o : %.c
	$(CC) $(CFLAGS) -c -o $@ $<

# .S (capital S) sources are run through the C preprocessor before assembly.
%.o : %.S
	$(CC) $(CFLAGS) -c -o $@ $<
/*
 * Benchmark driver for the hand-written 4x4 single-precision matrix product
 * kernels.  Compile with -DSSE or -DSSE4_1 to select which kernel is called.
 *
 * Usage: <program> [iterations]
 *
 * The kernel is asked to perform C += A * B `iterations` times (default 1).
 */

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Fill the matrices with rand() values instead of the sequence 0..15. */
#define RANDOM_MATRIX

/* Define PRINT_DEBUG to print the resulting C matrix after the kernel ran. */
/* #define PRINT_DEBUG */

#if defined(SSE)
void
matrix_multiply_SSE (const unsigned int N, float *A, float *B, float *C);
#elif defined(SSE4_1)
void
matrix_multiply_SSE4_1 (const unsigned int N, float *A, float *B, float *C);
#endif

/*
 * Parse a non-negative decimal iteration count.
 *
 * text  - NUL-terminated string to parse.
 * count - receives the parsed value on success; untouched on failure.
 *
 * Returns 0 on success, -1 if text is empty, has trailing garbage, is
 * negative, or does not fit into an unsigned int.
 */
static int
parse_iteration_count (const char *text, unsigned int *count)
{
  char *end = NULL;
  long value;

  errno = 0;
  value = strtol(text, &end, 10);

  if (end == text || *end != '\0' || errno == ERANGE
      || value < 0 || (unsigned long) value > UINT_MAX)
    {
      return -1;
    }

  *count = (unsigned int) value;
  return 0;
}

int
main (int argc, char **argv)
{
  /* The kernels load whole rows with movaps, which needs 16-byte alignment;
   * 64 bytes additionally puts every matrix at the start of a cache line. */
  float __attribute__ ((aligned (64))) A[4][4];
  float __attribute__ ((aligned (64))) A_dilated[4][4][4];
  float __attribute__ ((aligned (64))) B[4][4];
  float __attribute__ ((aligned (64))) B_transpose[4][4];
  float __attribute__ ((aligned (64))) C[4][4];

  int i, j, k;

  unsigned int max_N = 1;

  /* Parse command line. */
  if (argc == 2)
    {
      if (parse_iteration_count(argv[1], &max_N) != 0)
        {
          fprintf(stderr, "%s: invalid iteration count '%s'\n",
                  argv[0], argv[1]);
          return EXIT_FAILURE;
        }
    }

  /* Fill the matrices and derive the layouts the two kernels expect. */
  for (i = 0; i < 4; i++)
    {
      for (j = 0; j < 4; j++)
        {
#ifndef RANDOM_MATRIX
          A[i][j] = i*4+j;
          B[i][j] = i*4+j;
          C[i][j] = i*4+j;
#else
          A[i][j] = rand()/(float) RAND_MAX;
          B[i][j] = rand()/(float) RAND_MAX;
          C[i][j] = rand()/(float) RAND_MAX;
#endif

          /* SSE4.1 kernel: dot products of rows of A with rows of B^T. */
          B_transpose[j][i] = B[i][j];

          /* SSE kernel: every element of A replicated into all four lanes. */
          for (k = 0; k < 4; k++)
            {
              A_dilated[i][j][k] = A[i][j];
            }
        }
    }

#ifdef SSE
  matrix_multiply_SSE(max_N, (float*) &A_dilated[0][0], (float*) &B[0][0],
                      (float*) &C[0][0]);
#elif defined(SSE4_1)
  matrix_multiply_SSE4_1(max_N, (float*) &A[0][0], (float*) &B_transpose[0][0],
                         (float*) &C[0][0]);
#endif

#ifdef PRINT_DEBUG
  for (i = 0; i < 4; i++)
    {
      for (j = 0; j < 4; j++)
        {
          printf(" %f", C[i][j]);
        }
      printf("\n");
    }
#endif

  return 0;
}
/*
 * C API:
 *
 * void
 * matrix_multiply_SSE (const unsigned int N, float *A, float *B, float *C);
 *
 * Performs C += A * B on 4x4 single-precision matrices, N times in a row.
 *
 *   N - number of repetitions; nothing is done when N is 0.
 *   A - "dilated" left operand: 16 groups of 4 floats, group (4*r + k) holds
 *       element A(r,k) replicated in all four lanes (64 floats in total).
 *   B - right operand, 4 rows of 4 floats.
 *   C - accumulator, 4 rows of 4 floats, updated in place.
 *
 * All three pointers are accessed with movaps and therefore must be 16-byte
 * aligned.  Clobbers %rax and %xmm0-%xmm15; touches no callee-saved register.
 */

#define N %rdi
#define A %rsi
#define B %rdx
#define C %rcx

#define i %rax

        .text

        .align 256
        .global matrix_multiply_SSE
        .type matrix_multiply_SSE, @function

matrix_multiply_SSE:

        /* N is a 32-bit unsigned int; the ABI leaves the upper half of the
         * 64-bit argument register undefined.  Zero-extend it before it is
         * used as a 64-bit loop bound. */
        mov %edi, %edi

        /* %rax is a caller-saved scratch register, no need to preserve it. */
        xor i, i
        test N, N
        jz end_loop

start_loop:

        /* Load C (accumulators) and B (reused by every row of the result). */
        movaps 0x00(C), %xmm0
        movaps 0x10(C), %xmm1
        movaps 0x20(C), %xmm2
        movaps 0x30(C), %xmm3

        movaps 0x00(B), %xmm4
        movaps 0x10(B), %xmm5
        movaps 0x20(B), %xmm6
        movaps 0x30(B), %xmm7

        /* Calculate C(1,:).  Row r of the result is
         *   C(r,:) += A(r,1)*B(1,:) + A(r,2)*B(2,:) + A(r,3)*B(3,:) + A(r,4)*B(4,:)
         * The loads of the next broadcast A elements are interleaved with the
         * multiplies and adds of the current ones. */
        movaps 0x000(A), %xmm8
        movaps 0x010(A), %xmm9
        movaps 0x020(A), %xmm10
        mulps %xmm4, %xmm8
        mulps %xmm5, %xmm9
        addps %xmm8, %xmm0
        movaps 0x030(A), %xmm11
        mulps %xmm6, %xmm10
        addps %xmm9, %xmm0
        movaps 0x040(A), %xmm12
        mulps %xmm7, %xmm11
        addps %xmm10, %xmm0
        movaps 0x050(A), %xmm13
        mulps %xmm4, %xmm12
        addps %xmm11, %xmm0

        /* Calculate C(2,:). */
        movaps 0x060(A), %xmm14
        mulps %xmm5, %xmm13
        addps %xmm12, %xmm1
        movaps 0x070(A), %xmm15
        mulps %xmm6, %xmm14
        addps %xmm13, %xmm1
        movaps 0x080(A), %xmm8
        mulps %xmm7, %xmm15
        addps %xmm14, %xmm1
        movaps 0x090(A), %xmm9
        mulps %xmm4, %xmm8
        addps %xmm15, %xmm1

        /* Calculate C(3,:). */
        movaps 0x0a0(A), %xmm10
        mulps %xmm5, %xmm9
        addps %xmm8, %xmm2
        movaps 0x0b0(A), %xmm11
        mulps %xmm6, %xmm10
        addps %xmm9, %xmm2
        movaps 0x0c0(A), %xmm12
        mulps %xmm7, %xmm11
        addps %xmm10, %xmm2
        movaps 0x0d0(A), %xmm13
        mulps %xmm4, %xmm12
        addps %xmm11, %xmm2

        /* Calculate C(4,:). */
        movaps 0x0e0(A), %xmm14
        mulps %xmm5, %xmm13
        addps %xmm12, %xmm3
        movaps 0x0f0(A), %xmm15
        mulps %xmm6, %xmm14
        addps %xmm13, %xmm3
        mulps %xmm7, %xmm15
        addps %xmm14, %xmm3
        addps %xmm15, %xmm3

        /* Write C back. */
        movaps %xmm0, 0x00(C)
        movaps %xmm1, 0x10(C)
        movaps %xmm2, 0x20(C)
        movaps %xmm3, 0x30(C)

        inc i
        cmp N, i
        jb start_loop

end_loop:

        ret

        .size matrix_multiply_SSE, .-matrix_multiply_SSE

        /* Mark the object as not needing an executable stack. */
        .section .note.GNU-stack,"",@progbits
/*
 * C API:
 *
 * void
 * matrix_multiply_SSE4_1 (const unsigned int N, float *A, float *B, float *C);
 *
 * Performs C += A * B on 4x4 single-precision matrices, N times in a row,
 * using the SSE4.1 dot-product instruction.
 *
 *   N - number of repetitions; nothing is done when N is 0.
 *   A - left operand, 4 rows of 4 floats.
 *   B - TRANSPOSE of the right operand, 4 rows of 4 floats, so that row j of
 *       this array is column j of the mathematical B.
 *   C - accumulator, 4 rows of 4 floats, updated in place.
 *
 * All three pointers are accessed with movaps and therefore must be 16-byte
 * aligned.  Clobbers %rax and %xmm0-%xmm13; touches no callee-saved register.
 */

#define N %rdi
#define A %rsi
#define B %rdx
#define C %rcx

#define i %rax

        .text

        .align 256
        .global matrix_multiply_SSE4_1
        .type matrix_multiply_SSE4_1, @function

matrix_multiply_SSE4_1:

        /* N is a 32-bit unsigned int; the ABI leaves the upper half of the
         * 64-bit argument register undefined.  Zero-extend it before it is
         * used as a 64-bit loop bound. */
        mov %edi, %edi

        /* %rax is a caller-saved scratch register, no need to preserve it. */
        xor i, i
        test N, N
        jz end_loop

start_loop:

        /* Load C (accumulators), the rows of B^T and the first two rows of A. */
        movaps 0x00(C), %xmm0
        movaps 0x10(C), %xmm1
        movaps 0x20(C), %xmm2
        movaps 0x30(C), %xmm3

        movaps 0x00(B), %xmm4
        movaps 0x10(B), %xmm5
        movaps 0x20(B), %xmm6
        movaps 0x30(B), %xmm7

        movaps 0x00(A), %xmm8
        movaps 0x10(A), %xmm9

        /* Calculate C(1,:).  Each dpps multiplies all four lanes (high nibble
         * 0xf) and stores the sum in a single lane chosen by the low nibble,
         * zeroing the others; the blendps chain then gathers the four sums
         * into one register. */
        movaps %xmm4, %xmm10
        dpps $0xf1, %xmm8, %xmm10
        movaps %xmm5, %xmm11
        dpps $0xf2, %xmm8, %xmm11
        movaps %xmm6, %xmm12
        dpps $0xf4, %xmm8, %xmm12
        movaps %xmm7, %xmm13
        dpps $0xf8, %xmm8, %xmm13
        blendps $0x01, %xmm10, %xmm11
        blendps $0x03, %xmm11, %xmm12
        blendps $0x07, %xmm12, %xmm13
        addps %xmm13, %xmm0

        /* Row 1 of A is no longer needed; reuse its register for row 3. */
        movaps 0x20(A), %xmm8

        /* Calculate C(2,:). */
        movaps %xmm4, %xmm10
        dpps $0xf1, %xmm9, %xmm10
        movaps %xmm5, %xmm11
        dpps $0xf2, %xmm9, %xmm11
        movaps %xmm6, %xmm12
        dpps $0xf4, %xmm9, %xmm12
        movaps %xmm7, %xmm13
        dpps $0xf8, %xmm9, %xmm13
        blendps $0x01, %xmm10, %xmm11
        blendps $0x03, %xmm11, %xmm12
        blendps $0x07, %xmm12, %xmm13
        addps %xmm13, %xmm1

        /* Row 2 of A is no longer needed; reuse its register for row 4. */
        movaps 0x30(A), %xmm9

        /* Calculate C(3,:). */
        movaps %xmm4, %xmm10
        dpps $0xf1, %xmm8, %xmm10
        movaps %xmm5, %xmm11
        dpps $0xf2, %xmm8, %xmm11
        movaps %xmm6, %xmm12
        dpps $0xf4, %xmm8, %xmm12
        movaps %xmm7, %xmm13
        dpps $0xf8, %xmm8, %xmm13
        blendps $0x01, %xmm10, %xmm11
        blendps $0x03, %xmm11, %xmm12
        blendps $0x07, %xmm12, %xmm13
        addps %xmm13, %xmm2

        /* Calculate C(4,:). */
        movaps %xmm4, %xmm10
        dpps $0xf1, %xmm9, %xmm10
        movaps %xmm5, %xmm11
        dpps $0xf2, %xmm9, %xmm11
        movaps %xmm6, %xmm12
        dpps $0xf4, %xmm9, %xmm12
        movaps %xmm7, %xmm13
        dpps $0xf8, %xmm9, %xmm13
        blendps $0x01, %xmm10, %xmm11
        blendps $0x03, %xmm11, %xmm12
        blendps $0x07, %xmm12, %xmm13
        addps %xmm13, %xmm3

        /* Write C back. */
        movaps %xmm0, 0x00(C)
        movaps %xmm1, 0x10(C)
        movaps %xmm2, 0x20(C)
        movaps %xmm3, 0x30(C)

        inc i
        cmp N, i
        jb start_loop

end_loop:

        ret

        .size matrix_multiply_SSE4_1, .-matrix_multiply_SSE4_1

        /* Mark the object as not needing an executable stack. */
        .section .note.GNU-stack,"",@progbits
Attachment:
signature.asc
Description: OpenPGP digital signature
- References:
- 4x4 single-precision matrix product with SSE
- From: Nicolas Bock
- 4x4 single-precision matrix product with SSE
- Prev by Date: Re: 4x4 single-precision matrix product with SSE
- Next by Date: Fwd: 4x4 single-precision matrix product with SSE
- Previous by thread: Re: 4x4 single-precision matrix product with SSE
- Next by thread: Fwd: 4x4 single-precision matrix product with SSE
- Index(es):
![]() |