Hi !
I have a simple matrix multiplication code:
#include <stdio.h>
#include <time.h>
#include <malloc.h>
main ( int argc, char *argv[] )
{
int i, j, k;
clock_t t1, t2;
double elapsed_time;
int N = 10;
float *a, *b, *c;
if ( argc > 1 )
N = atoi ( argv [ 1 ] );
a = (float *) malloc ( N * N * sizeof ( float ) );
b = (float *) malloc ( N * N * sizeof ( float ) );
c = (float *) malloc ( N * N * sizeof ( float ) );
for ( i = 0; i < N; i++ )
for ( j = 0; j < N; j++ ) {
a [ i * N + j ] = 0.99;
b [ i * N + j ] = 0.33;
c [ i * N + j ] = 0.0;
}
t1 = clock();
for ( i = 0; i < N; i++ )
for ( j = 0; j < N; j++ )
for ( k = 0; k < N; k++ )
c [ i * N + j ] += a [ i * N + k ] * b [ k * N + j ];
t2 = clock();
elapsed_time = ( (double) ( t2 - t1 ) ) / CLOCKS_PER_SEC;
printf ( "%f\n", c [ (N - 1) * N + ( N - 1 ) ] );
printf ( "N = %d Elapsed time = %lf\n", N, elapsed_time );
}
I compile it as
> gcc -O3 ...
and then trying it for different N ( Intel Xeon, 3.00 GHz, 8 Gb memory)
Here are the results:
N Time (secs.)
----------------------------------
500 0.25
512 0.86
520 0.29
1000 4.46
1024 12.5
1100 6.48
1500 20.42
1536 30.43
1600 21.04
2000 46.75
2048 446.61 ( !!! )
2100 59.80
So for N multiple of 512 there is very strong drop of performance.
The question is - why and how to avoid it ?
In fact, there is a cache friendly solution, namely, to interchange the
inner loops as:
for ( i = 0; i < N; i++ )
for ( k = 0; k < N; k++ )
for ( j = 0; j < N; j++ )
c [ i * N + j ] += a [ i * N + k ] * b [ k * N + j ];
but a question remains -
why an unoptimized code works fine, say, for N = 2100 ,
but doesn't work for N = 2048, or, in general, for N multiply of 512?
What is a magic number ?
Thanks,
Yury.