Hello, I tried to optimize a small parallel program that is parallelized with OpenMP. The program itself performs operations on arrays, and the optimization I did was to reduce cache misses. At first, I compiled the program with GCC under Linux with -O3 enabled. I ran the program on a quad-core machine, but the running time wasn't very stable. I thought this was because my program was being interrupted by other processes, or because its threads were being rescheduled onto other cores. So I ran the program with each thread attached to a CPU and with the highest real-time priority in Linux. But the running time still varied from 60 to 90 milliseconds. I couldn't find any reason to explain why the running time could be so different, but I wasn't really surprised by the result until I used the Intel C compiler. After I compiled the code with Intel's compiler, the running time was always about 40 ms. The performance improvement isn't surprising to me, but I don't know why the running time no longer varies. At first, I thought it might be caused by cache misses. However, when I profiled the program with AMD CodeAnalyst, I didn't see many cache misses in either executable. Since I'm doing optimization, I hope to find out the reason. Can anyone tell me what could cause this difference in running time? I have also attached the program in case someone would like to take a look. Thank you, Zheng Da
#include <stdio.h> #include <stdlib.h> #include <string.h> #include <math.h> #include "Aplc.h" #include "Apl3lib.h" #define HEAPSIZE 1048576 #define BVBFLE32 512 #define IVBFLE 16384 #define EVBFLE 16384 #define CVBFLE 8192 #define minn -7.237005577332262e75 #define QDCT 1e-13 #define QDRL 16807 #define QDPP 7 int c2[2]; int lf1= 0x80000000; int bvbf[BVBFLE32]; char cvbf[CVBFLE]; int r0, r1, r2, r8, t, n, i, j, new, time1, time2, th; int w0, w1, w2, w3, w4, w5, w6, u0, u1, u2, u3, u4, u5, u6; int q0, q1, q2, q3, q4, q5, q6, v0, v1, v2, v3, c6; int l0, l1, l2, l3, l4, l5, l6, m0, m1, m2, m3, m4, m5, m6; int qw, qi, sw, si, s0, s1, s2, s3, s4, s5, s6, freti; double d, fretf, epsln; char fretc; int r15= HEAPSIZE; int rseed= QDRL, initf; unsigned int tl,ttl,wa,wb; unsigned int * twp; int * v4= bvbf; char * v7= cvbf; int * fretip, * v5; double * fretfp, * v6; char * fretcp, * lop, * rop, * cad0, * heap; int lshp[8], rshp[8], cad[50], dl[7], tdl[7], vdl[7]; int * p0, * p1, * lo0, * lo1, * ro0,* ro1,* t0,* t1,* p10,* p11,*p12; int * lo13, * lo10,* ro13, * ro10, * lo11, * ro11,* lo12,* ro12; int * id0, * id1, * id2, * id3, * id4, * id5, * id6; double * p2,* t2,* lo2,* ro2,* p20,* p21,* p22,* lo24,* ro24,* lo25; double * lo20,* ro20,* lo21,* ro21,* lo22,* ro22,* lo23,* ro23,* ro25; char *ro30, * p3, *t3, * lo3, * ro3, *pTarg, *pSrc, *pBuff0, *pBuff1; STOFM2 v49; STOFM3 v50; STOFM2 v51; STOFM3 v52; STOFM2 v53; STOFM3 v54; STOFM3 v28; main(argc,argv) int argc; char * argv[]; { char * lparm, * rparm; char ltype, rtype; int g19; int lleng= 1; FILE *fpl, *fpr, *fopen(); extern int apl_sec(); init(); initf= 1; heap=amalloc(HEAPSIZE); rt_prio(); #pragma omp parallel { printf("There are %d threads\n", omp_get_num_threads ()); if (omp_get_num_threads() > 1) thread_attach_cpu(omp_get_thread_num()); } v5=amalloc(65536); v6=amalloc(13107200); { while (--argc > 0) if (argc>0) {lparm = *argv; *argv = *argv + 1;} else rparm = *argv; } if( argv[1] == NULL ) fpl= a_fopen("MORGAN.LEF","r"); 
else { sprintf(cvbf, "%s.LEF", argv[1]); fpl= a_fopen(cvbf, "r"); } fscanf(fpl,"%c %d",<ype,&lshp[0]); if( ltype!= 'I') { printf("The type of the left arguement is mismatched.\n"); exit(99);} if( lshp[0]!= 0) { printf("The rank of the left arguement is mismatched.\n"); exit(99);} for( i = 1; i < 1 + lshp[0]; ++i) { fscanf(fpl," %d",&lshp[i]); lleng *= lshp[i];} while (fgetc(fpl) != '\n') {} fscanf(fpl,"%d",&g19); lleng = 1; if( argv[1] == NULL ) fpr= a_fopen("MORGAN.RIG","r"); else { sprintf(cvbf, "%s.RIG", argv[1]); fpr= a_fopen(cvbf, "r"); } fscanf(fpr,"%c %d",&rtype,&rshp[0]); if( rtype!= 'E') { printf("The type of the right arguement is mismatched.\n"); exit(99);} if( rshp[0]!= 3) { printf("The rank of the right arguement is mismatched.\n"); exit(99);} for( i = 1; i < 1 + rshp[0]; ++i) { fscanf(fpr," %d",&rshp[i]); lleng *= rshp[i];} while (fgetc(fpr) != '\n') {} rshp[0]= r0 = lleng;inchp2; rop= &heap[r15]; p2 = (double *) rop; // for(i = 0; i < lleng; ++i) { // fscanf(fpr,"%lf",p2); // ++p2; // } time1 = apl_sec(); MORGAN2 (g19,ro2); time2 = apl_sec(); fprintf(stderr, "\n execution time in ms %d\n",time2-time1); /* OUTPUT generates the following code */ // p2 = fretfp; // w0 = cad[3]; // w1= cad[1]/w0; // for (u0=0; u0<w1; u0++) { // if(w0<11) for(v1=0;v1<w0;v1++){a_prtD0(*p2);++p2;} // else for (v1=0;v1<w0;v1++) // {a_prtD0(*p2); ++p2; if ((v1%10)==9) printf("\n");} // putchar('\n'); // } // putchar('\n'); free(heap); free(v5); free(v6); system("pause"); exit(0); } /* CODE SEGMENT FOR FUNCTION MORGAN2 */ MORGAN2 (v19,p20) int v19; double* p20; { int num = 0; int cache_start[num_threads]; int max_cache_num; int start_time; int cache_idx; char v8, v11; int oldr15, v9, v12, v13, v17; double v10, v14, v15, v16; STOFM3 v18; STOFM3 v21; STOFM3 v22; STOFM3 v31; STOFM3 v32; STOFM3 v34; STOFM3 v35; STOFM3 v36; STOFM3 v37; STOFM3 v38; STOFM3 v39; STOFM3 v40; STOFM3 v41; STOFM3 v42; STOFM3 v43; STOFM3 v44; STOFM3 v45; STOFM3 v46; STOFM3 v47; STOFM3 v48; STOFM3 v20; 
v18.maxl = -1; v49.maxl= -1; v50.maxl= -1; v51.maxl= -1; v52.maxl= -1; v53.maxl= -1; v54.maxl= -1; v28.maxl = -1; v21.maxl = -1; v22.maxl = -1; v31.maxl = -1; v32.maxl = -1; v34.maxl = -1; v35.maxl = -1; v36.maxl = -1; v37.maxl = -1; v38.maxl = -1; v39.maxl = -1; v40.maxl = -1; v41.maxl = -1; v42.maxl = -1; v43.maxl = -1; v44.maxl = -1; v45.maxl = -1; v46.maxl = -1; v47.maxl = -1; v48.maxl = -1; memset(cache_start, 0, sizeof (cache_start)); r0= rshp[0]; for (v1=0; v1<3; v1++) v20.dims[v1+0]=rshp[v1+1]; if (initf == 1) { v20.valp= rop; v20.reall= v20.maxl= r0; initf= 0; } else { v20.reall= v20.maxl= r0; inchp2; v20.valp= &heap[r15]; ro2= (double *) v20.valp; for (v1=0; v1<r0; v1++) ro2[v1]= p20[v1]; } new=0; /****************** LINE 1 ******************/ /* INDEXV generates the following code */ lo2 = (double *) v20.valp; dl[2]= t = 1; for (u0=0; u0<2; u0++) t= dl[1-u0]= t*v20.dims[2-u0]; cad[2]= v20.dims[1]; cad[3]= v20.dims[2]; n=0; n += dl[0]*(1-1); tdl[0] =v20.dims[2]; r0= tdl[0]*v20.dims[1]; cad[1]=r0; INIT_STARTP(v21); if (new=r0>v21.maxl) { v21.maxl=r0; inchp2;cad0= &heap[r15]; } else cad0= v21.valp; p2 = (double *) cad0; if (r0==0) goto l2; for (u0=0; u0<v20.dims[1]; u0++) { w0 = dl[1]*u0; q0 = tdl[0]*u0; for (u1=0; u1<v20.dims[2]; u1++) p2[q0+u1] = lo2[n+w0+dl[2]*u1]; } l2: for (v1=0; v1<2; v1++) v21.dims[v1+0]= cad[v1+2]; v21.valp = cad0; v21.reall= cad[1]; v21.curr_rows = NULL; /****************** LINE 2 ******************/ /* INDEXV generates the following code */ lo2 = (double *) v20.valp; dl[2]= t = 1; for (u0=0; u0<2; u0++) t= dl[1-u0]= t*v20.dims[2-u0]; cad[2]= v20.dims[1]; cad[3]= v20.dims[2]; n=0; n += dl[0]*(2-1); tdl[0] =v20.dims[2]; r0= tdl[0]*v20.dims[1]; cad[1]=r0; INIT_STARTP(v22); if (new=r0>v22.maxl) { v22.maxl=r0; inchp2;cad0= &heap[r15]; } else cad0= v22.valp; p2 = (double *) cad0; if (r0==0) goto l3; for (u0=0; u0<v20.dims[1]; u0++) { w0 = dl[1]*u0; q0 = tdl[0]*u0; for (u1=0; u1<v20.dims[2]; u1++) p2[q0+u1] = lo2[n+w0+dl[2]*u1]; } l3: 
for (v1=0; v1<2; v1++) v22.dims[v1+0]= cad[v1+2]; v22.valp = cad0; v22.reall= cad[1]; v22.curr_rows = NULL; max_cache_num = 200; // TODO int init = 0; int local_first = 1; start_time = apl_sec(); #pragma omp parallel for default(shared) private(cache_idx, v1, v2, d) firstprivate(local_first) for(cache_idx = 0; cache_idx < max_cache_num; cache_idx++) { int slice; STOFM3 v23; STOFM3 v24; STOFM3 v25; STOFM3 v26; STOFM3 v27; STOFM3 v33; v23.maxl = -1; v24.maxl = -1; v25.maxl = -1; v26.maxl = -1; v27.maxl = -1; v33.maxl = -1; if (local_first) { //#pragma omp critical // { // cache_start[omp_get_thread_num()] = cache_idx; // } //#ifdef DEBUG // fprintf (stderr, "thread %d starts at %d\n", omp_get_thread_num(), cache_idx); //#endif } if (cache_idx && init == 0) { while (init == 0) { #pragma omp flush(init) } } /****************** LINE 3 ******************/ /****************** LINE 4 ******************/ if (cache_idx == 0) { /* GENSCAN generates the following code */ p2 = (double *) v28.valp; INIT_STARTP(v28); for (v1=0; v1<2; v1++) v28.orig_dims[v1+0] = v28.dims[v1+0]= v21.dims[v1+0]; r0 = v21.reall; cad[1]=r0; if (new=r0>v28.maxl) { v28.maxl=r0; inchp2;v28.valp= &heap[r15]; } v28.reall=r0; /* 3 instructions genarated by INCHEAPP */ if (r0==0) goto l4; } FORAXIS0_CACHE(v28) { v2 = 0; d = 0; FORAXIS1(v28) DVAL(v28)=d=d+DVAL(v21); // SET_ROW(v28, v1); } l4: if (cache_idx == 0) v9 = -v19; /* code for - */ /***** PFCATENA *****/ if (cache_idx == 0) { r0 = cad[1]= 1+1; cad[1] =r0 =2; v5[0] = 0; v5[1]= v9; /* PFDROP generates the following code */ ro2 = (double *) v28.valp; dl[1]= t = 1; for (u0=0; u0<1; u0++) t= dl[0-u0]= t*v28.dims[1-u0]; r0=t=1; cad[3] = max(0, v28.dims[1]-abs(v5[1])); t=tdl[0] = t*cad[3] ; cad[2] = max(0,v28.dims[0]-abs(v5[0])); INIT_STARTP(v31); v31.source = &v28; if (v5[0] > 0) v31.startp[0] = v5[0]; else v31.startp[0] = 0; if (v5[1] > 0) v31.startp[1] = v5[1]; else v31.startp[1] = 0; v31.dims[0] = cad[2]; v31.dims[1] = cad[3]; memcpy (v31.orig_dims, 
v28.orig_dims, sizeof (v28.orig_dims)); v31.valp = v28.valp; v31.reall= v31.dims[0] * v31.dims[1]; } /***** PFCATENA *****/ if (cache_idx == 0) v9 = v19-1; /* code for - */ /***** PFCATENA *****/ if (cache_idx == 0) { r0 = cad[1]= 1+1; cad[1] =r0 =2; v5[0] = 0; v5[1]= v9; /* PFDROP generates the following code */ ro2 = (double *) v28.valp; dl[1]= t = 1; for (u0=0; u0<1; u0++) t= dl[0-u0]= t*v28.dims[1-u0]; r0=t=1; cad[3] = max(0, v28.dims[1]-abs(v5[1])); t=tdl[0] = t*cad[3]; cad[2] = max(0,v28.dims[0]-abs(v5[0])); INIT_STARTP(v46); v46.source = &v28; if (v5[0] > 0) v46.startp[0] = v5[0]; else v46.startp[0] = 0; if (v5[1] > 0) v46.startp[1] = v5[1]; else v46.startp[1] = 0; v46.dims[0] = cad[2]; v46.dims[1] = cad[3]; memcpy (v46.orig_dims, v28.orig_dims, sizeof (v28.orig_dims)); v46.valp = v28.valp; v46.reall= v46.dims[0] * v46.dims[1]; } if (local_first) { r0=v46.reall; INIT_STARTP(v23); for (v1=0; v1<2; v1++) v23.dims[v1+0]=v46.dims[v1+0]; memcpy(v23.orig_dims, v23.dims, sizeof(v23.dims)); cad[1]=r0; if (new=r0>v23.maxl) { v23.maxl=r0; inchp2;v23.valp= &heap[r15]; } v23.reall=r0; /* 3 instructions genarated by INCHEAPP */ } FORAXIS0_CACHE(v23) /* code for - */ { // while (!ROW_AVAIL(&v46, v1)); // while (!ROW_AVAIL(&v31, v1)); FORAXIS1(v23) { if (v2 == 0) SLICE_DVAL(v23) = DROPPED_DVAL(v46); else SLICE_DVAL(v23) = DROPPED_DVAL(v46) - CD_DVAL(v31, -1); } // SET_ROW(v23, v1); } /****************** LINE 5 ******************/ /* GENSCAN generates the following code */ if (cache_idx == 0) { p2 = (double *) v28.valp; ro2 = (double *) v22.valp; INIT_STARTP(v28); for (v1=0; v1<2; v1++) v28.orig_dims[v1+0] = v28.dims[v1+0]= v22.dims[v1+0]; r0 = v22.reall; cad[1]=r0; if (new=r0>v28.maxl) { v28.maxl=r0; inchp2;v28.valp= &heap[r15]; } v28.reall=r0; /* 3 instructions genarated by INCHEAPP */ p2 = (double *) v28.valp; if (r0==0) goto l5; r0=v22.dims[1]; r1=v22.dims[0]; } FORAXIS0_CACHE(v28) { v2 = 0; d = 0; FORAXIS1(v28) DVAL(v28)=d=d+DVAL(v22); } l5: if (cache_idx == 0) v9 = 
-v19; /* code for - */ /***** PFCATENA *****/ if (cache_idx == 0) { r0 = cad[1]= 1+1; cad[1] =r0 =2; v5[0] = 0; v5[1]= v9; /* PFDROP generates the following code */ ro2 = (double *) v28.valp; dl[1]= t = 1; for (u0=0; u0<1; u0++) t= dl[0-u0]= t*v28.dims[1-u0]; r0=t=1; cad[3] = max(0, v28.dims[1]-abs(v5[1])); t=tdl[0] = t*cad[3]; cad[2] = max(0,v28.dims[0]-abs(v5[0])); INIT_STARTP(v32); if (v5[0] > 0) v32.startp[0] = v5[0]; else v32.startp[0] = 0; if (v5[1] > 0) v32.startp[1] = v5[1]; else v32.startp[1] = 0; v32.dims[0] = cad[2]; v32.dims[1] = cad[3]; memcpy (v32.orig_dims, v28.orig_dims, sizeof (v28.orig_dims)); v32.valp = v28.valp; v32.reall= v32.dims[0] * v32.dims[1]; } /***** PFCATENA *****/ if (cache_idx == 0) v9 = v19-1; /* code for - */ /***** PFCATENA *****/ if (cache_idx == 0) { r0 = cad[1]= 1+1; cad[1] =r0 =2; v5[0] = 0; v5[1]= v9; /* PFDROP generates the following code */ ro2 = (double *) v28.valp; dl[1]= t = 1; for (u0=0; u0<1; u0++) t= dl[0-u0]= t*v28.dims[1-u0]; r0=t=1; cad[3] = max(0, v28.dims[1]-abs(v5[1])); t=tdl[0] = t*cad[3]; cad[2] = max(0,v28.dims[0]-abs(v5[0])); INIT_STARTP(v47); if (v5[0] > 0) v47.startp[0] = v5[0]; else v47.startp[0] = 0; if (v5[1] > 0) v47.startp[1] = v5[1]; else v47.startp[1] = 0; v47.dims[0] = cad[2]; v47.dims[1] = cad[3]; memcpy (v47.orig_dims, v28.orig_dims, sizeof (v28.orig_dims)); v47.valp = v28.valp; v47.reall= v47.dims[0] * v47.dims[1]; } if (local_first) { r0=v47.reall; INIT_STARTP(v25); for (v1=0; v1<2; v1++) v25.dims[v1+0]=v47.dims[v1+0]; memcpy(v25.orig_dims, v25.dims, sizeof(v25.dims)); cad[1]=r0; if (new=r0>v25.maxl) { v25.maxl=r0; inchp2;v25.valp= &heap[r15]; } v25.reall=r0; /* 3 instructions genarated by INCHEAPP */ } FORALL2_CACHE(v25) /* code for - */ { if (v2 == 0) SLICE_DVAL(v25) = DROPPED_DVAL(v47); else SLICE_DVAL(v25) = DROPPED_DVAL(v47) - CD_DVAL(v32, -1); } /****************** LINE 6 ******************/ if (local_first) { r0=v21.reall; for (v1=0; v1<2; v1++) v33.dims[v1+0]= v21.dims[v1+0]; cad[1]=r0; 
if (new=r0>v33.maxl) { v33.maxl=r0; inchp2;v33.valp= &heap[r15]; } v33.reall=r0; /* 3 instructions genarated by INCHEAPP */ p2 = (double *) v33.valp; lo2 = (double *) v21.valp; } FORALL2_CACHE(v33) DVAL(v33) = DVAL(v21) * DVAL(v21); /* GENSCAN generates the following code */ if (cache_idx == 0) { p2 = (double *) v28.valp; ro2 = (double *) v33.valp; INIT_STARTP(v28); for (v1=0; v1<2; v1++) v28.orig_dims[v1+0] = v28.dims[v1+0]= v33.dims[v1+0]; r0 = v33.reall; cad[1]=r0; if (new=r0>v28.maxl) { v28.maxl=r0; inchp2;v28.valp= &heap[r15]; } v28.reall=r0; /* 3 instructions genarated by INCHEAPP */ p2 = (double *) v28.valp; if (r0==0) goto l6; r0=v33.dims[1]; r1=v33.dims[0]; } FORAXIS0_CACHE(v28) { v2 = 0; d = 0; FORAXIS1(v28) DVAL(v28)=d=d+DVAL(v33); } l6: if (cache_idx == 0) v9 = -v19; /* code for - */ /***** PFCATENA *****/ if (cache_idx == 0) { r0 = cad[1]= 1+1; cad[1] =r0 =2; v5[0] = 0; v5[1]= v9; /* PFDROP generates the following code */ ro2 = (double *) v28.valp; dl[1]= t = 1; for (u0=0; u0<1; u0++) t= dl[0-u0]= t*v28.dims[1-u0]; r0=t=1; cad[3] = max(0, v28.dims[1]-abs(v5[1])); t=tdl[0] = t*cad[3]; cad[2] = max(0,v28.dims[0]-abs(v5[0])); INIT_STARTP(v34); if (v5[0] > 0) v34.startp[0] = v5[0]; else v34.startp[0] = 0; if (v5[1] > 0) v34.startp[1] = v5[1]; else v34.startp[1] = 0; v34.dims[0] = cad[2]; v34.dims[1] = cad[3]; memcpy (v34.orig_dims, v28.orig_dims, sizeof (v28.orig_dims)); v34.valp = v28.valp; v34.reall= v34.dims[0] * v34.dims[1]; } /***** PFCATENA *****/ if (cache_idx == 0) v9 = v19-1; /* code for - */ /***** PFCATENA *****/ if (cache_idx == 0) { r0 = cad[1]= 1+1; cad[1] =r0 =2; v5[0] = 0; v5[1]= v9; /* PFDROP generates the following code */ ro2 = (double *) v28.valp; dl[1]= t = 1; for (u0=0; u0<1; u0++) t= dl[0-u0]= t*v28.dims[1-u0]; r0=t=1; cad[3] = max(0, v28.dims[1]-abs(v5[1])); t=tdl[0] = t*cad[3] ; cad[2] = max(0,v28.dims[0]-abs(v5[0])); INIT_STARTP(v48); if (v5[0] > 0) v48.startp[0] = v5[0]; else v48.startp[0] = 0; if (v5[1] > 0) v48.startp[1] = 
v5[1]; else v48.startp[1] = 0; v48.dims[0] = cad[2]; v48.dims[1] = cad[3]; memcpy (v48.orig_dims, v28.orig_dims, sizeof (v28.orig_dims)); v48.valp = v28.valp; v48.reall= v48.dims[0] * v48.dims[1]; } if (local_first) { r0=v48.reall; INIT_STARTP(v24); for (v1=0; v1<2; v1++) v24.dims[v1+0]=v48.dims[v1+0]; memcpy(v24.orig_dims, v24.dims, sizeof(v24.dims)); cad[1]=r0; if (new=r0>v24.maxl) { v24.maxl=r0; inchp2;v24.valp= &heap[r15]; } v24.reall=r0; /* 3 instructions genarated by INCHEAPP */ } FORALL2_CACHE(v24) /* code for - */ { if (v2 == 0) SLICE_DVAL(v24) = DROPPED_DVAL(v48); else SLICE_DVAL(v24) = DROPPED_DVAL(v48) - CD_DVAL(v34, -1); } /****************** LINE 7 ******************/ FORALL2_CACHE(v33) DVAL(v33) = DVAL(v22) * DVAL(v22); /* GENSCAN generates the following code */ if (cache_idx == 0) { p2 = (double *) v28.valp; ro2 = (double *) v33.valp; INIT_STARTP(v28); for (v1=0; v1<2; v1++) v28.orig_dims[v1+0] = v28.dims[v1+0]= v33.dims[v1+0]; r0 = v33.reall; cad[1]=r0; if (new=r0>v28.maxl) { v28.maxl=r0; inchp2;v28.valp= &heap[r15]; } v28.reall=r0; /* 3 instructions genarated by INCHEAPP */ p2 = (double *) v28.valp; if (r0==0) goto l7; r0=v33.dims[1]; r1=v33.dims[0]; } FORAXIS0_CACHE(v28) { v2 = 0; d = 0; FORAXIS1(v28) DVAL(v28)=d=d+DVAL(v33); } l7: if (cache_idx == 0) v9 = -v19; /* code for - */ /***** PFCATENA *****/ if (cache_idx == 0) { r0 = cad[1]= 1+1; cad[1] =r0 =2; v5[0] = 0; v5[1]= v9; /* PFDROP generates the following code */ ro2 = (double *) v28.valp; dl[1]= t = 1; for (u0=0; u0<1; u0++) t= dl[0-u0]= t*v28.dims[1-u0]; r0=t=1; cad[3] = max(0, v28.dims[1]-abs(v5[1])); t=tdl[0] = t*cad[3] ; cad[2] = max(0,v28.dims[0]-abs(v5[0])); INIT_STARTP(v36); if (v5[0] > 0) v36.startp[0] = v5[0]; else v36.startp[0] = 0; if (v5[1] > 0) v36.startp[1] = v5[1]; else v36.startp[1] = 0; v36.dims[0] = cad[2]; v36.dims[1] = cad[3]; memcpy (v36.orig_dims, v28.orig_dims, sizeof (v28.orig_dims)); v36.valp = v28.valp; v36.reall= v36.dims[0] * v36.dims[1]; } /***** PFCATENA *****/ 
if (cache_idx == 0) v9 = v19-1; /* code for - */ /***** PFCATENA *****/ if (cache_idx == 0) { r0 = cad[1]= 1+1; cad[1] =r0 =2; v5[0] = 0; v5[1]= v9; /* PFDROP generates the following code */ ro2 = (double *) v28.valp; dl[1]= t = 1; for (u0=0; u0<1; u0++) t= dl[0-u0]= t*v28.dims[1-u0]; r0=t=1; cad[3] = max(0, v28.dims[1]-abs(v5[1])); t=tdl[0] = t*cad[3]; cad[2] = max(0,v28.dims[0]-abs(v5[0])); INIT_STARTP(v40); if (v5[0] > 0) v40.startp[0] = v5[0]; else v40.startp[0] = 0; if (v5[1] > 0) v40.startp[1] = v5[1]; else v40.startp[1] = 0; v40.dims[0] = cad[2]; v40.dims[1] = cad[3]; memcpy (v40.orig_dims, v28.orig_dims, sizeof (v28.orig_dims)); v40.valp = v28.valp; v40.reall= v40.dims[0] * v40.dims[1]; } if (local_first) { r0=v40.reall; INIT_STARTP(v26); for (v1=0; v1<2; v1++) v26.dims[v1+0]= v40.dims[v1+0]; memcpy(v26.orig_dims, v26.dims, sizeof(v26.dims)); cad[1]=r0; if (new=r0>v26.maxl) { v26.maxl=r0; inchp2;v26.valp= &heap[r15]; } v26.reall=r0; /* 3 instructions genarated by INCHEAPP */ } FORALL2_CACHE(v26) /* code for - */ { if (v2 == 0) SLICE_DVAL(v26) = DROPPED_DVAL(v40); else SLICE_DVAL(v26) = DROPPED_DVAL(v40) - CD_DVAL(v36, -1); } /****************** LINE 8 ******************/ FORALL2_CACHE(v33) DVAL(v33) = DVAL(v21) * DVAL(v22); /* GENSCAN generates the following code */ if (cache_idx == 0) { p2 = (double *) v28.valp; ro2 = (double *) v33.valp; INIT_STARTP(v28); for (v1=0; v1<2; v1++) v28.orig_dims[v1+0] = v28.dims[v1+0]= v33.dims[v1+0]; r0 = v33.reall; cad[1]=r0; if (new=r0>v28.maxl) { v28.maxl=r0; inchp2;v28.valp= &heap[r15]; } v28.reall=r0; /* 3 instructions genarated by INCHEAPP */ p2 = (double *) v28.valp; if (r0==0) goto l8; r0=v33.dims[1]; r1=v33.dims[0]; } FORAXIS0_CACHE(v28) { v2 = 0; d = 0; FORAXIS1(v28) DVAL(v28)=d=d+DVAL(v33); } l8: if (cache_idx == 0) v9 = -v19; /* code for - */ /***** PFCATENA *****/ if (cache_idx == 0) { r0 = cad[1]= 1+1; cad[1] =r0 =2; v5[0] = 0; v5[1]= v9; /* PFDROP generates the following code */ ro2 = (double *) v28.valp; 
dl[1]= t = 1; for (u0=0; u0<1; u0++) t= dl[0-u0]= t*v28.dims[1-u0]; r0=t=1; cad[3] = max(0, v28.dims[1]-abs(v5[1])); t=tdl[0] = t*cad[3]; cad[2] = max(0,v28.dims[0]-abs(v5[0])); INIT_STARTP(v38); if (v5[0] > 0) v38.startp[0] = v5[0]; else v38.startp[0] = 0; if (v5[1] > 0) v38.startp[1] = v5[1]; else v38.startp[1] = 0; v38.dims[0] = cad[2]; v38.dims[1] = cad[3]; memcpy (v38.orig_dims, v28.orig_dims, sizeof (v28.orig_dims)); v38.valp = v28.valp; v38.reall= v38.dims[0] * v38.dims[1]; } /***** PFCATENA *****/ if (cache_idx == 0) v9 = v19-1; /* code for - */ /***** PFCATENA *****/ if (cache_idx == 0) { r0 = cad[1]= 1+1; cad[1] =r0 =2; v5[0] = 0; v5[1]= v9; /* PFDROP generates the following code */ ro2 = (double *) v28.valp; dl[1]= t = 1; for (u0=0; u0<1; u0++) t= dl[0-u0]= t*v28.dims[1-u0]; r0=t=1; cad[3] = max(0, v28.dims[1]-abs(v5[1])); t=tdl[0] = t*cad[3]; cad[2] = max(0,v28.dims[0]-abs(v5[0])); INIT_STARTP(v39); if (v5[0] > 0) v39.startp[0] = v5[0]; else v39.startp[0] = 0; if (v5[1] > 0) v39.startp[1] = v5[1]; else v39.startp[1] = 0; v39.dims[0] = cad[2]; v39.dims[1] = cad[3]; memcpy (v39.orig_dims, v28.orig_dims, sizeof (v28.orig_dims)); v39.valp = v28.valp; v39.reall= v39.dims[0] * v39.dims[1]; } if (local_first) { r0=v39.reall; INIT_STARTP(v27); for (v1=0; v1<2; v1++) v27.dims[v1+0]=v39.dims[v1+0]; memcpy(v27.orig_dims, v27.dims, sizeof(v27.dims)); cad[1]=r0; if (new=r0>v27.maxl) { v27.maxl=r0; inchp2;v27.valp= &heap[r15]; } v27.reall=r0; /* 3 instructions genarated by INCHEAPP */ } FORALL2_CACHE(v27) /* code for - */ { if (v2 == 0) SLICE_DVAL(v27) = DROPPED_DVAL(v39); else SLICE_DVAL(v27) = DROPPED_DVAL(v39) - CD_DVAL(v38, -1); } /****************** LINE 9 ******************/ if (cache_idx == 0) { r0=v25.reall; cad[31]=r0; for (v1=0; v1<2; v1++) cad[32+v1]= v25.dims[v1+0]; lo2 = (double *) v25.valp; INIT_STARTP(v18); for (v1=0; v1<2; v1++) v18.dims[v1+0]=v25.dims[v1+0]; cad[1]=r0; if (new=r0>v18.maxl) { v18.maxl=r0; inchp2;v18.valp= &heap[r15]; } v18.reall=r0; 
/* 3 instructions genarated by INCHEAPP */ memcpy(v18.orig_dims, v18.dims, sizeof(v18.dims)); } FORALL2_CACHE(v25) { double tmp; double tmp2; double tmp3; if (unlikely(SLICE_DVAL(v25)==0 && 0==(double)v19)) tmp = 1; else tmp = SLICE_DVAL(v25)/(double)v19; tmp = tmp*tmp; if (unlikely(SLICE_DVAL(v26)==0 && 0==(double)v19)) tmp2 = 1; else tmp2 = SLICE_DVAL(v26)/(double)v19; tmp = tmp2-tmp; tmp = fabs(tmp); tmp = sqrt(tmp); if (unlikely(SLICE_DVAL(v23)==0 && 0==(double)v19)) tmp2 = 1; else tmp2 = SLICE_DVAL(v23)/(double)v19; tmp2 = tmp2 * tmp2; if (unlikely(SLICE_DVAL(v24)==0 && 0==(double)v19)) tmp3 = 1; else tmp3 = SLICE_DVAL(v24)/(double)v19; tmp2 = tmp2 + tmp3; tmp2 = sqrt(tmp2); tmp2 = fabs(tmp2); tmp = tmp * tmp2; v9 = v19*v19; /* code for * */ tmp2 = SLICE_DVAL(v23) * SLICE_DVAL(v25); if (unlikely(tmp2==0 && 0==(double)v9)) tmp2 = 1; else tmp2 = tmp2/(double)v9; if (unlikely(SLICE_DVAL(v27)==0 && 0==(double)v19)) tmp3 = 1; else tmp3 = SLICE_DVAL(v27)/(double)v19; tmp2 = tmp3 - tmp2; if (unlikely(tmp2==0 && 0==tmp)) DVAL(v18) = 1; else DVAL(v18) = tmp2/tmp; num++; } init = 1; local_first = 0; } fprintf(stderr, "the big loop takes %dms, there are %d iterations\n", apl_sec() - start_time, num); for (v1=0; v1<2; v1++) cad[2+v1]= v18.dims[v1+0]; cad[1]= v18.reall; fprintf (stderr, "size: %d\n", v18.reall); fretfp= (double *) v18.valp; }