Understand the running time of a program compiled with GCC

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hello,

I tried to optimize a small program that is parallelized with OpenMP.
The program itself does operations on arrays, and the optimization I did was to
reduce cache misses.

At first, I compiled the program with GCC under Linux with -O3 enabled. I ran
the program on a quad-core machine, but the running time wasn't very stable. I
thought it was because my program was interrupted by other processes or because
its threads were scheduled onto other cores. So I ran the program with each
thread pinned to a CPU and with the highest real-time priority in Linux. But the
running time still varied from 60 to 90 milliseconds. I couldn't find any reason
to explain why the running time could be so different, but I wasn't really
surprised by the result until I used the Intel C compiler.

After I compiled the code with Intel's compiler, the running time was always
about 40ms. The performance improvement isn't surprising to me, but I don't know
why the running time no longer varies. At first, I thought it might
be caused by cache misses. But when I profiled the program with AMD CodeAnalyst,
I didn't see many cache misses in either executable.

Since I'm doing optimization, I hope to find out the reason. Can anyone tell me
what could cause this difference in running time?

I also attach the program in case someone would like to take a look.

Thank you,
Zheng Da
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "Aplc.h"
#include "Apl3lib.h"
/* Size main() passes to amalloc() for the work heap; also the initial value of r15. */
#define HEAPSIZE 1048576
/* Element counts of the static scratch buffers bvbf[] and cvbf[] declared below
 * (IVBFLE and EVBFLE are not referenced in the code shown here). */
#define BVBFLE32 512
#define IVBFLE 16384
#define EVBFLE 16384
#define CVBFLE 8192
#define minn -7.237005577332262e75
#define QDCT 1e-13
/* QDRL is the initial value of rseed below. */
#define QDRL 16807
#define QDPP 7
int c2[2];
int  lf1= 0x80000000;
int  bvbf[BVBFLE32];
/* Character scratch buffer; main() formats the input file names into it. */
char cvbf[CVBFLE];
/* File-scope scratch variables used by the generated code.  In the code shown
 * here r0 carries an element count, time1/time2 bracket the timed call in
 * main(), and `new` records the result of the last "r0 > maxl" test. */
int  r0, r1, r2, r8, t, n, i, j, new, time1, time2, th;
int  w0, w1, w2, w3, w4, w5, w6, u0, u1, u2, u3, u4, u5, u6;
int  q0, q1, q2, q3, q4, q5, q6, v0, v1, v2, v3, c6;
int  l0, l1, l2, l3, l4, l5, l6, m0, m1, m2, m3, m4, m5, m6;
int  qw, qi, sw, si, s0, s1, s2, s3, s4, s5, s6, freti;
double d, fretf, epsln;
char fretc;
/* Index into heap[]; the code takes &heap[r15] right after the inchp2 macro.
 * NOTE(review): inchp2 is defined in a header not shown here - presumably it
 * moves r15 to make room for r0 elements; confirm. */
int  r15= HEAPSIZE;
/* initf is set to 1 by main() and cleared by MORGAN2 on its first call. */
int  rseed= QDRL, initf;
unsigned int tl,ttl,wa,wb;
unsigned int * twp;
int * v4= bvbf;
char * v7= cvbf;
/* v5 and v6 are work buffers allocated by main() with amalloc();
 * MORGAN2 stores its result pointer in fretfp. */
int * fretip, * v5;
double * fretfp, * v6;
/* heap is the work heap allocated by main(); rop points at the right argument's
 * storage inside it. */
char * fretcp, * lop, * rop, * cad0, * heap;
/* lshp/rshp: [0] is the rank read from the left/right input file and [1..] the
 * dimensions (main() later overwrites rshp[0] with the total element count).
 * cad: in the code shown here cad[1] is an element count and cad[2], cad[3]
 * are the two dimensions of the value being built. */
int lshp[8], rshp[8], cad[50], dl[7], tdl[7], vdl[7];
int * p0, * p1, * lo0, * lo1, * ro0,* ro1,* t0,* t1,* p10,* p11,*p12;
int * lo13, * lo10,* ro13, * ro10, * lo11, * ro11,* lo12,* ro12;
int * id0, * id1, * id2, * id3, * id4, * id5, * id6;
double * p2,* t2,* lo2,* ro2,* p20,* p21,* p22,* lo24,* ro24,* lo25;
double * lo20,* ro20,* lo21,* ro21,* lo22,* ro22,* lo23,* ro23,* ro25;
char *ro30, * p3, *t3, * lo3, * ro3, *pTarg, *pSrc, *pBuff0, *pBuff1;
/* Array descriptors with file scope (the STOFM2/STOFM3 types come from a
 * project header).  MORGAN2 resets their maxl fields and uses v28 as the
 * buffer for its scan steps. */
STOFM2 v49;
STOFM3 v50;
STOFM2 v51;
STOFM3 v52;
STOFM2 v53;
STOFM3 v54;
STOFM3 v28;

/*
 * Program entry point.
 *
 * Sets up the run-time (init(), work heap, real-time priority, one CPU per
 * OpenMP thread), reads the headers of the left and right argument files,
 * times one call to MORGAN2() and reports the elapsed time on stderr.
 *
 * argv[1], when present, is the base name of the input files: "<base>.LEF"
 * holds the left argument (type 'I', rank 0, one integer) and "<base>.RIG"
 * the header of the right argument (type 'E', rank 3, three dimensions).
 * Without an argument the base name "MORGAN" is used.
 *
 * Exits with status 99 on any input error and 0 on success; does not return.
 *
 * Changes from the generated original:
 *   - the argument loop that advanced argv[0] and set the unused
 *     lparm/rparm has been removed;
 *   - file names are formatted with a bounded snprintf();
 *   - fscanf() results are checked instead of using possibly unset values;
 *   - the "skip to end of line" loops stop at EOF instead of spinning;
 *   - both input streams are closed.
 */
int main(int argc, char *argv[])
{
	char  ltype = '\0', rtype = '\0';
	int  g19;
	int lleng= 1;
	int ch, len;
	char *lname, *rname;
	FILE *fpl, *fpr;
	extern int apl_sec();
	init();
	initf= 1;
	heap=amalloc(HEAPSIZE);
	rt_prio();
#pragma omp parallel
	{
		printf("There are %d threads\n", omp_get_num_threads ());
		if (omp_get_num_threads() > 1)
			thread_attach_cpu(omp_get_thread_num());
	}
	v5=amalloc(65536); v6=amalloc(13107200);

	/* ---- left argument: "<type> <rank>" header, then one integer ---- */
	if (argc < 2 || argv[1] == NULL)
		lname = "MORGAN.LEF";
	else
	{
		len = snprintf(cvbf, sizeof cvbf, "%s.LEF", argv[1]);
		if (len < 0 || (size_t) len >= sizeof cvbf) {
			fprintf(stderr, "The input file name is too long.\n"); exit(99);}
		lname = cvbf;
	}
	fpl= a_fopen(lname, "r");
	if (fpl == NULL) {
		fprintf(stderr, "Cannot open %s\n", lname); exit(99);}
	if (fscanf(fpl,"%c %d",&ltype,&lshp[0]) != 2) {
		fprintf(stderr, "Cannot read the header of %s\n", lname); exit(99);}
	if( ltype!= 'I')  {
		printf("The type of the left arguement is mismatched.\n"); exit(99);}
	if( lshp[0]!= 0)  {
		printf("The rank of the left arguement is mismatched.\n"); exit(99);}
	/* Rank is 0 here, so this generated loop reads no dimensions. */
	for( i = 1; i < 1 + lshp[0]; ++i)  {
		fscanf(fpl," %d",&lshp[i]); lleng *= lshp[i];}
	/* Skip the rest of the header line; stop at EOF as well as at '\n'. */
	while ((ch = fgetc(fpl)) != '\n' && ch != EOF)  {}
	if (fscanf(fpl,"%d",&g19) != 1) {
		fprintf(stderr, "Cannot read the left argument from %s\n", lname); exit(99);}
	fclose(fpl);

	/* ---- right argument: "<type> <rank>" header, then the dimensions ---- */
	lleng = 1;
	if (argc < 2 || argv[1] == NULL)
		rname = "MORGAN.RIG";
	else
	{
		len = snprintf(cvbf, sizeof cvbf, "%s.RIG", argv[1]);
		if (len < 0 || (size_t) len >= sizeof cvbf) {
			fprintf(stderr, "The input file name is too long.\n"); exit(99);}
		rname = cvbf;
	}
	fpr= a_fopen(rname, "r");
	if (fpr == NULL) {
		fprintf(stderr, "Cannot open %s\n", rname); exit(99);}
	if (fscanf(fpr,"%c %d",&rtype,&rshp[0]) != 2) {
		fprintf(stderr, "Cannot read the header of %s\n", rname); exit(99);}
	if( rtype!= 'E')  {
		printf("The type of the right arguement is mismatched.\n"); exit(99);}
	if( rshp[0]!= 3)  {
		printf("The rank of the right arguement is mismatched.\n"); exit(99);}
	/* lleng becomes the total number of elements of the right argument. */
	for( i = 1; i < 1 + rshp[0]; ++i)  {
		if (fscanf(fpr," %d",&rshp[i]) != 1) {
			fprintf(stderr, "Cannot read the dimensions from %s\n", rname); exit(99);}
		lleng *= rshp[i];}
	while ((ch = fgetc(fpr)) != '\n' && ch != EOF) {}
	/* Reserve heap space for the right argument (inchp2 works on r0/r15). */
	rshp[0]= r0 = lleng;inchp2;
	rop= &heap[r15]; p2 = (double *) rop;
	/* The element data is deliberately not read: the loop below is disabled,
	 * so MORGAN2 runs on whatever the heap contains. */
//	for(i = 0; i < lleng; ++i)  {
//		fscanf(fpr,"%lf",p2);
//		++p2;
//	}
	fclose(fpr);

	/* ---- timed call ---- */
	time1 = apl_sec();
	MORGAN2  (g19,ro2);
	time2 = apl_sec();
	fprintf(stderr, "\n execution time in ms %d\n",time2-time1);
	/*   OUTPUT generates the following code */
//	p2 = fretfp;
//	w0 = cad[3];
//	w1= cad[1]/w0;
//	for (u0=0; u0<w1; u0++)  {
//		if(w0<11) for(v1=0;v1<w0;v1++){a_prtD0(*p2);++p2;}
//		else for (v1=0;v1<w0;v1++)
//		{a_prtD0(*p2); ++p2; if ((v1%10)==9) printf("\n");}
//		putchar('\n');
//	}
//	putchar('\n');
	/* NOTE(review): "pause" is a Windows shell command; on other systems this
	 * call only makes the shell report an unknown command. */
	free(heap); free(v5); free(v6); system("pause"); exit(0);
}
				/*              CODE SEGMENT FOR FUNCTION MORGAN2       */
/*
 * MORGAN2 - generated code for the APL function MORGAN2, hand-modified to run
 * its main computation as an OpenMP parallel loop over "cache" slices.
 *
 * Parameters:
 *   v19  integer left argument; used as a drop count and as a divisor below.
 *   p20  right argument data; only read when this is not the first call
 *        (on the first call, initf == 1, the data at the global rop is used).
 *
 * Result: the function has no return statement.  On exit it stores the
 * result descriptor v18's dimensions in cad[2..3], its element count in
 * cad[1] and its data pointer in the global fretfp.  It also prints the loop
 * time, the iteration count and the result size on stderr.
 *
 * NOTE(review): inchp2, INIT_STARTP, FORAXIS0_CACHE, FORAXIS1, FORALL2_CACHE,
 * DVAL, SLICE_DVAL, DROPPED_DVAL, CD_DVAL, unlikely, max and num_threads are
 * not defined in the part of the file shown here (presumably they come from
 * Aplc.h / Apl3lib.h); comments about them below are assumptions to confirm.
 */
MORGAN2  (v19,p20)
	int  v19;
	double* p20;
{
	int num = 0;
	int cache_start[num_threads];
	int max_cache_num;
	int start_time;
	int cache_idx;
	char v8, v11;
	int oldr15, v9, v12, v13, v17;
	double v10, v14, v15, v16;
	STOFM3 v18;
	STOFM3 v21;
	STOFM3 v22;
	STOFM3 v31;
	STOFM3 v32;
	STOFM3 v34;
	STOFM3 v35;
	STOFM3 v36;
	STOFM3 v37;
	STOFM3 v38;
	STOFM3 v39;
	STOFM3 v40;
	STOFM3 v41;
	STOFM3 v42;
	STOFM3 v43;
	STOFM3 v44;
	STOFM3 v45;
	STOFM3 v46;
	STOFM3 v47;
	STOFM3 v48;
	STOFM3 v20;
	/* Mark the descriptors as unallocated (maxl = -1) so that the first
	 * "r0 > maxl" test on each of them takes the allocation branch. */
	v18.maxl = -1;
	v49.maxl= -1; v50.maxl= -1; v51.maxl= -1;
	v52.maxl= -1; v53.maxl= -1; v54.maxl= -1;
	v28.maxl = -1;
	v21.maxl = -1;
	v22.maxl = -1;
	v31.maxl = -1;
	v32.maxl = -1;
	v34.maxl = -1;
	v35.maxl = -1;
	v36.maxl = -1;
	v37.maxl = -1;
	v38.maxl = -1;
	v39.maxl = -1;
	v40.maxl = -1;
	v41.maxl = -1;
	v42.maxl = -1;
	v43.maxl = -1;
	v44.maxl = -1;
	v45.maxl = -1;
	v46.maxl = -1;
	v47.maxl = -1;
	v48.maxl = -1;
	/* cache_start is only referenced by the disabled code inside the loop. */
	memset(cache_start, 0, sizeof (cache_start));
	/* Describe the right argument as v20: three dimensions from rshp[1..3]
	 * and rshp[0] elements. */
	r0= rshp[0];
	for (v1=0; v1<3; v1++)
		v20.dims[v1+0]=rshp[v1+1];
	if (initf == 1)
	{
		/* First call: use the storage main() reserved at rop directly. */
		v20.valp= rop;
		v20.reall= v20.maxl= r0;
		initf= 0;
	} else {
		/* Later calls: take new heap space and copy the caller's data. */
		v20.reall= v20.maxl= r0;
		inchp2; v20.valp= &heap[r15];
		ro2= (double *) v20.valp;
		for (v1=0; v1<r0; v1++)
			ro2[v1]= p20[v1];
	}
	new=0;
	/******************   LINE 1   ******************/
	/* INDEXV generates the following code */
	/* Copies the first plane of v20 (offset n = 0 along axis 0) into the
	 * dims[1] x dims[2] array v21. */
	lo2 = (double *) v20.valp;
	dl[2]= t = 1;
	for (u0=0; u0<2; u0++)
		t= dl[1-u0]= t*v20.dims[2-u0];
	cad[2]= v20.dims[1];
	cad[3]= v20.dims[2];
	n=0;
	n += dl[0]*(1-1);
	tdl[0] =v20.dims[2];
	r0= tdl[0]*v20.dims[1];
	cad[1]=r0;
	INIT_STARTP(v21);
	if (new=r0>v21.maxl) {
		v21.maxl=r0;
		inchp2;cad0= &heap[r15];
	}
	/* Only the cad0 assignment belongs to the else; p2 is always set. */
	else cad0= v21.valp; p2 = (double *) cad0;
	if (r0==0) goto l2;
	for (u0=0; u0<v20.dims[1]; u0++)
	{
		w0 = dl[1]*u0;
		q0 = tdl[0]*u0;
		for (u1=0; u1<v20.dims[2]; u1++)
			p2[q0+u1] = lo2[n+w0+dl[2]*u1];
	}
l2:
	for (v1=0; v1<2; v1++)
		v21.dims[v1+0]=  cad[v1+2];
	v21.valp = cad0;
	v21.reall= cad[1];
	v21.curr_rows = NULL;
	/******************   LINE 2   ******************/
	/* INDEXV generates the following code */
	/* Same as LINE 1 for the second plane of v20 (offset dl[0]), into v22. */
	lo2 = (double *) v20.valp;
	dl[2]= t = 1;
	for (u0=0; u0<2; u0++)
		t= dl[1-u0]= t*v20.dims[2-u0];
	cad[2]= v20.dims[1];
	cad[3]= v20.dims[2];
	n=0;
	n += dl[0]*(2-1);
	tdl[0] =v20.dims[2];
	r0= tdl[0]*v20.dims[1];
	cad[1]=r0;
	INIT_STARTP(v22);
	if (new=r0>v22.maxl) {
		v22.maxl=r0;
		inchp2;cad0= &heap[r15];
	}
	else cad0= v22.valp;
	p2 = (double *) cad0;
	if (r0==0) goto l3;
	for (u0=0; u0<v20.dims[1]; u0++)
	{
		w0 = dl[1]*u0;
		q0 = tdl[0]*u0;
		for (u1=0; u1<v20.dims[2]; u1++)
			p2[q0+u1] = lo2[n+w0+dl[2]*u1];
	}
l3:
	for (v1=0; v1<2; v1++)
		v22.dims[v1+0]=  cad[v1+2];
	v22.valp = cad0;
	v22.reall= cad[1];
	v22.curr_rows = NULL;
	max_cache_num = 200; // TODO
	int init = 0;
	int local_first = 1;
	start_time = apl_sec();
	/*
	 * Main computation: max_cache_num loop iterations shared among the
	 * OpenMP threads.  Iteration 0 additionally sets up the shared
	 * descriptors (v28, v31..v48, v18); every other iteration first waits
	 * for `init` to become non-zero.  local_first is firstprivate, so it is
	 * 1 for the first iteration each thread executes and 0 afterwards; that
	 * first iteration allocates the thread's own buffers v23..v27 and v33.
	 *
	 * NOTE(review): with default(shared), everything not listed as private
	 * is shared - including num, v9, init and the globals r0, r15, cad, new,
	 * p2, lo2 and ro2.  They are written inside this loop with no
	 * synchronization visible here other than the flush on init; check
	 * whether that is intended.
	 */
#pragma omp parallel for default(shared) private(cache_idx, v1, v2, d) firstprivate(local_first)
	for(cache_idx = 0; cache_idx < max_cache_num; cache_idx++)
	{
		int slice;
		STOFM3 v23;
		STOFM3 v24;
		STOFM3 v25;
		STOFM3 v26;
		STOFM3 v27;
		STOFM3 v33;
		v23.maxl = -1;
		v24.maxl = -1;
		v25.maxl = -1;
		v26.maxl = -1;
		v27.maxl = -1;
		v33.maxl = -1;

		if (local_first)
		{
//#pragma omp critical
//			{
//				cache_start[omp_get_thread_num()] = cache_idx;
//			}
//#ifdef DEBUG
//			fprintf (stderr, "thread %d starts at %d\n", omp_get_thread_num(), cache_idx);
//#endif
		}

		/* Busy-wait until some iteration has reached the "init = 1" at the
		 * bottom of the loop body; iteration 0 never waits. */
		if (cache_idx && init == 0)
		{
			while (init == 0)
			{
#pragma omp flush(init)
			}
		}
		/******************   LINE 3   ******************/
		/******************   LINE 4   ******************/
		if (cache_idx == 0)
		{
			/*   GENSCAN generates the following code */
			/* NOTE(review): p2 is taken from v28.valp before v28 is
			 * allocated a few lines below. */
			p2 = (double *) v28.valp;
			INIT_STARTP(v28);
			for (v1=0; v1<2; v1++)
				v28.orig_dims[v1+0] = v28.dims[v1+0]=  v21.dims[v1+0];
			r0 =   v21.reall;
			cad[1]=r0;
			if (new=r0>v28.maxl) {
				v28.maxl=r0;
				inchp2;v28.valp= &heap[r15];
			}
			v28.reall=r0; /* 3 instructions genarated by INCHEAPP */
			if (r0==0) goto l4;
		}
		/* v28 = sum scan of v21; d is reset to 0 at each step of the outer
		 * macro loop (presumably once per row - confirm). */
		FORAXIS0_CACHE(v28)
		{
			v2 = 0;
			d = 0;
			FORAXIS1(v28)
				DVAL(v28)=d=d+DVAL(v21);
//			SET_ROW(v28, v1);
		}

l4:
		if (cache_idx == 0)
			v9 =  -v19; /* code for - */
		/***** PFCATENA *****/
		if (cache_idx == 0)
		{
			/* v31: view of v28 for the drop vector (0, -v19).  It shares
			 * v28's data and only records dims and start positions. */
			r0 = cad[1]= 1+1;
			cad[1] =r0 =2;
			v5[0] = 0;
			v5[1]= v9;
			/*   PFDROP generates the following code */
			ro2 = (double *) v28.valp;
			dl[1]= t = 1;
			for (u0=0; u0<1; u0++)
				t= dl[0-u0]= t*v28.dims[1-u0];
			r0=t=1;
			cad[3] = max(0, v28.dims[1]-abs(v5[1]));
			t=tdl[0] = t*cad[3]          ;
			cad[2] = max(0,v28.dims[0]-abs(v5[0]));
			INIT_STARTP(v31);
			v31.source = &v28;
			if (v5[0] > 0)
				v31.startp[0] = v5[0];
			else
				v31.startp[0] = 0;
			if (v5[1] > 0)
				v31.startp[1] = v5[1];
			else
				v31.startp[1] = 0;
			v31.dims[0] = cad[2];
			v31.dims[1] = cad[3];
			memcpy (v31.orig_dims, v28.orig_dims, sizeof (v28.orig_dims));
			v31.valp = v28.valp;
			v31.reall= v31.dims[0] * v31.dims[1];
		}
		/***** PFCATENA *****/

		if (cache_idx == 0)
			v9 = v19-1; /* code for - */
		/***** PFCATENA *****/
		if (cache_idx == 0)
		{
			/* v46: view of v28 for the drop vector (0, v19-1). */
			r0 = cad[1]= 1+1;
			cad[1] =r0 =2;
			v5[0] = 0;
			v5[1]= v9;
			/*   PFDROP generates the following code */
			ro2 = (double *) v28.valp;
			dl[1]= t = 1;
			for (u0=0; u0<1; u0++)
				t= dl[0-u0]= t*v28.dims[1-u0];
			r0=t=1;
			cad[3] = max(0, v28.dims[1]-abs(v5[1]));
			t=tdl[0] = t*cad[3];
			cad[2] = max(0,v28.dims[0]-abs(v5[0]));
			INIT_STARTP(v46);
			v46.source = &v28;
			if (v5[0] > 0)
				v46.startp[0] = v5[0];
			else
				v46.startp[0] = 0;
			if (v5[1] > 0)
				v46.startp[1] = v5[1];
			else
				v46.startp[1] = 0;
			v46.dims[0] = cad[2];
			v46.dims[1] = cad[3];
			memcpy (v46.orig_dims, v28.orig_dims, sizeof (v28.orig_dims));
			v46.valp = v28.valp;
			v46.reall= v46.dims[0] * v46.dims[1];
		}

		if (local_first)
		{
			/* Per-thread buffer v23, shaped like v46. */
			r0=v46.reall;
			INIT_STARTP(v23);
			for (v1=0; v1<2; v1++)
				v23.dims[v1+0]=v46.dims[v1+0];
			memcpy(v23.orig_dims, v23.dims, sizeof(v23.dims));
			cad[1]=r0;
			if (new=r0>v23.maxl) {
				v23.maxl=r0;
				inchp2;v23.valp= &heap[r15];
			}
			v23.reall=r0; /* 3 instructions genarated by INCHEAPP */
		}

		/* v23 = v46 minus the v31 element one position back (CD_DVAL with
		 * -1), except in column 0 - presumably a moving sum of width v19
		 * over v21; confirm against the macro definitions. */
		FORAXIS0_CACHE(v23) /* code for - */
		{
//			while (!ROW_AVAIL(&v46, v1));
//			while (!ROW_AVAIL(&v31, v1));

			FORAXIS1(v23)
			{
				if (v2 == 0)
					SLICE_DVAL(v23) = DROPPED_DVAL(v46);
				else
					SLICE_DVAL(v23) = DROPPED_DVAL(v46) - CD_DVAL(v31, -1);
			}
//			SET_ROW(v23, v1);
		}
		/******************   LINE 5   ******************/
		/* Same steps as LINE 4, applied to v22: scan into v28, views v32 and
		 * v47, result in the per-thread buffer v25. */
		/*   GENSCAN generates the following code */
		if (cache_idx == 0)
		{
			p2 = (double *) v28.valp;
			ro2 = (double *) v22.valp;
			INIT_STARTP(v28);
			for (v1=0; v1<2; v1++)
				v28.orig_dims[v1+0] = v28.dims[v1+0]=  v22.dims[v1+0];
			r0 =   v22.reall;
			cad[1]=r0;
			if (new=r0>v28.maxl) {
				v28.maxl=r0;
				inchp2;v28.valp= &heap[r15];
			}
			v28.reall=r0; /* 3 instructions genarated by INCHEAPP */
			p2 = (double *) v28.valp;
			if (r0==0) goto l5;
			r0=v22.dims[1];
			r1=v22.dims[0];
		}
		FORAXIS0_CACHE(v28)
		{
			v2 = 0;
			d = 0;
			FORAXIS1(v28)
				DVAL(v28)=d=d+DVAL(v22);
		}
l5:
		if (cache_idx == 0)
			v9 =  -v19; /* code for - */
		/***** PFCATENA *****/
		if (cache_idx == 0)
		{
			r0 = cad[1]= 1+1;
			cad[1] =r0 =2;
			v5[0] = 0;
			v5[1]= v9;
			/*   PFDROP generates the following code */
			ro2 = (double *) v28.valp;
			dl[1]= t = 1;
			for (u0=0; u0<1; u0++)
				t= dl[0-u0]= t*v28.dims[1-u0];
			r0=t=1;
			cad[3] = max(0, v28.dims[1]-abs(v5[1]));
			t=tdl[0] = t*cad[3];
			cad[2] = max(0,v28.dims[0]-abs(v5[0]));
			INIT_STARTP(v32);
			if (v5[0] > 0)
				v32.startp[0] = v5[0];
			else
				v32.startp[0] = 0;
			if (v5[1] > 0)
				v32.startp[1] = v5[1];
			else
				v32.startp[1] = 0;
			v32.dims[0] = cad[2];
			v32.dims[1] = cad[3];
			memcpy (v32.orig_dims, v28.orig_dims, sizeof (v28.orig_dims));
			v32.valp = v28.valp;
			v32.reall= v32.dims[0] * v32.dims[1];
		}
		/***** PFCATENA *****/

		if (cache_idx == 0)
			v9 = v19-1; /* code for - */
		/***** PFCATENA *****/
		if (cache_idx == 0)
		{
			r0 = cad[1]= 1+1;
			cad[1] =r0 =2;
			v5[0] = 0;
			v5[1]= v9;
			/*   PFDROP generates the following code */
			ro2 = (double *) v28.valp;
			dl[1]= t = 1;
			for (u0=0; u0<1; u0++)
				t= dl[0-u0]= t*v28.dims[1-u0];
			r0=t=1;
			cad[3] = max(0, v28.dims[1]-abs(v5[1]));
			t=tdl[0] = t*cad[3];
			cad[2] = max(0,v28.dims[0]-abs(v5[0]));
			INIT_STARTP(v47);
			if (v5[0] > 0)
				v47.startp[0] = v5[0];
			else
				v47.startp[0] = 0;
			if (v5[1] > 0)
				v47.startp[1] = v5[1];
			else
				v47.startp[1] = 0;
			v47.dims[0] = cad[2];
			v47.dims[1] = cad[3];
			memcpy (v47.orig_dims, v28.orig_dims, sizeof (v28.orig_dims));
			v47.valp = v28.valp;
			v47.reall= v47.dims[0] * v47.dims[1];
		}

		if (local_first)
		{
			r0=v47.reall;
			INIT_STARTP(v25);
			for (v1=0; v1<2; v1++)
				v25.dims[v1+0]=v47.dims[v1+0];
			memcpy(v25.orig_dims, v25.dims, sizeof(v25.dims));
			cad[1]=r0;
			if (new=r0>v25.maxl) {
				v25.maxl=r0;
				inchp2;v25.valp= &heap[r15];
			}
			v25.reall=r0; /* 3 instructions genarated by INCHEAPP */
		}
		FORALL2_CACHE(v25) /* code for - */
		{
			if (v2 == 0)
				SLICE_DVAL(v25) = DROPPED_DVAL(v47);
			else
				SLICE_DVAL(v25) = DROPPED_DVAL(v47) - CD_DVAL(v32, -1);
		}
		/******************   LINE 6   ******************/
		/* v33 = v21 squared element-wise, then the same scan / drop /
		 * difference steps as above (views v34 and v48), result in v24. */
		if (local_first)
		{
			r0=v21.reall;
			for (v1=0; v1<2; v1++)
				v33.dims[v1+0]=  v21.dims[v1+0];
			cad[1]=r0;
			if (new=r0>v33.maxl) {
				v33.maxl=r0;
				inchp2;v33.valp= &heap[r15];
			}
			v33.reall=r0; /* 3 instructions genarated by INCHEAPP */
			p2 = (double *) v33.valp;
			lo2 = (double *) v21.valp;
		}
		FORALL2_CACHE(v33)
			DVAL(v33) = DVAL(v21) * DVAL(v21);
		/*   GENSCAN generates the following code */
		if (cache_idx == 0)
		{
			p2 = (double *) v28.valp;
			ro2 = (double *) v33.valp;
			INIT_STARTP(v28);
			for (v1=0; v1<2; v1++)
				v28.orig_dims[v1+0] = v28.dims[v1+0]=  v33.dims[v1+0];
			r0 =   v33.reall;
			cad[1]=r0;
			if (new=r0>v28.maxl) {
				v28.maxl=r0;
				inchp2;v28.valp= &heap[r15];
			}
			v28.reall=r0; /* 3 instructions genarated by INCHEAPP */
			p2 = (double *) v28.valp;
			if (r0==0) goto l6;
			r0=v33.dims[1];
			r1=v33.dims[0];
		}
		FORAXIS0_CACHE(v28)
		{
			v2 = 0;
			d = 0;
			FORAXIS1(v28)
				DVAL(v28)=d=d+DVAL(v33);
		}
l6:
		if (cache_idx == 0)
			v9 =  -v19; /* code for - */
		/***** PFCATENA *****/
		if (cache_idx == 0)
		{
			r0 = cad[1]= 1+1;
			cad[1] =r0 =2;
			v5[0] = 0;
			v5[1]= v9;
			/*   PFDROP generates the following code */
			ro2 = (double *) v28.valp;
			dl[1]= t = 1;
			for (u0=0; u0<1; u0++)
				t= dl[0-u0]= t*v28.dims[1-u0];
			r0=t=1;
			cad[3] = max(0, v28.dims[1]-abs(v5[1]));
			t=tdl[0] = t*cad[3];
			cad[2] = max(0,v28.dims[0]-abs(v5[0]));
			INIT_STARTP(v34);
			if (v5[0] > 0)
				v34.startp[0] = v5[0];
			else
				v34.startp[0] = 0;
			if (v5[1] > 0)
				v34.startp[1] = v5[1];
			else
				v34.startp[1] = 0;
			v34.dims[0] = cad[2];
			v34.dims[1] = cad[3];
			memcpy (v34.orig_dims, v28.orig_dims, sizeof (v28.orig_dims));
			v34.valp = v28.valp;
			v34.reall= v34.dims[0] * v34.dims[1];
		}
		/***** PFCATENA *****/

		if (cache_idx == 0)
			v9 = v19-1; /* code for - */
		/***** PFCATENA *****/
		if (cache_idx == 0)
		{
			r0 = cad[1]= 1+1;
			cad[1] =r0 =2;
			v5[0] = 0;
			v5[1]= v9;
			/*   PFDROP generates the following code */
			ro2 = (double *) v28.valp;
			dl[1]= t = 1;
			for (u0=0; u0<1; u0++)
				t= dl[0-u0]= t*v28.dims[1-u0];
			r0=t=1;
			cad[3] = max(0, v28.dims[1]-abs(v5[1]));
			t=tdl[0] = t*cad[3]          ;
			cad[2] = max(0,v28.dims[0]-abs(v5[0]));
			INIT_STARTP(v48);
			if (v5[0] > 0)
				v48.startp[0] = v5[0];
			else
				v48.startp[0] = 0;
			if (v5[1] > 0)
				v48.startp[1] = v5[1];
			else
				v48.startp[1] = 0;
			v48.dims[0] = cad[2];
			v48.dims[1] = cad[3];
			memcpy (v48.orig_dims, v28.orig_dims, sizeof (v28.orig_dims));
			v48.valp = v28.valp;
			v48.reall= v48.dims[0] * v48.dims[1];
		}

		if (local_first)
		{
			r0=v48.reall;
			INIT_STARTP(v24);
			for (v1=0; v1<2; v1++)
				v24.dims[v1+0]=v48.dims[v1+0];
			memcpy(v24.orig_dims, v24.dims, sizeof(v24.dims));
			cad[1]=r0;
			if (new=r0>v24.maxl) {
				v24.maxl=r0;
				inchp2;v24.valp= &heap[r15];
			}
			v24.reall=r0; /* 3 instructions genarated by INCHEAPP */
		}
		FORALL2_CACHE(v24) /* code for - */
		{
			if (v2 == 0)
				SLICE_DVAL(v24) = DROPPED_DVAL(v48);
			else
				SLICE_DVAL(v24) = DROPPED_DVAL(v48) - CD_DVAL(v34, -1);
		}
		/******************   LINE 7   ******************/
		/* v33 = v22 squared element-wise, same steps again (views v36 and
		 * v40), result in v26. */
		FORALL2_CACHE(v33)
			DVAL(v33) = DVAL(v22) * DVAL(v22);
		/*   GENSCAN generates the following code */
		if (cache_idx == 0)
		{
			p2 = (double *) v28.valp;
			ro2 = (double *) v33.valp;
			INIT_STARTP(v28);
			for (v1=0; v1<2; v1++)
				v28.orig_dims[v1+0] = v28.dims[v1+0]=  v33.dims[v1+0];
			r0 =   v33.reall;
			cad[1]=r0;
			if (new=r0>v28.maxl) {
				v28.maxl=r0;
				inchp2;v28.valp= &heap[r15];
			}
			v28.reall=r0; /* 3 instructions genarated by INCHEAPP */
			p2 = (double *) v28.valp;
			if (r0==0) goto l7;
			r0=v33.dims[1];
			r1=v33.dims[0];
		}
		FORAXIS0_CACHE(v28)
		{
			v2 = 0;
			d = 0;
			FORAXIS1(v28)
				DVAL(v28)=d=d+DVAL(v33);
		}
l7:
		if (cache_idx == 0)
			v9 =  -v19; /* code for - */
		/***** PFCATENA *****/
		if (cache_idx == 0)
		{
			r0 = cad[1]= 1+1;
			cad[1] =r0 =2;
			v5[0] = 0;
			v5[1]= v9;
			/*   PFDROP generates the following code */
			ro2 = (double *) v28.valp;
			dl[1]= t = 1;
			for (u0=0; u0<1; u0++)
				t= dl[0-u0]= t*v28.dims[1-u0];
			r0=t=1;
			cad[3] = max(0, v28.dims[1]-abs(v5[1]));
			t=tdl[0] = t*cad[3]          ;
			cad[2] = max(0,v28.dims[0]-abs(v5[0]));
			INIT_STARTP(v36);
			if (v5[0] > 0)
				v36.startp[0] = v5[0];
			else
				v36.startp[0] = 0;
			if (v5[1] > 0)
				v36.startp[1] = v5[1];
			else
				v36.startp[1] = 0;
			v36.dims[0] = cad[2];
			v36.dims[1] = cad[3];
			memcpy (v36.orig_dims, v28.orig_dims, sizeof (v28.orig_dims));
			v36.valp = v28.valp;
			v36.reall= v36.dims[0] * v36.dims[1];
		}
		/***** PFCATENA *****/

		if (cache_idx == 0)
			v9 = v19-1; /* code for - */
		/***** PFCATENA *****/
		if (cache_idx == 0)
		{
			r0 = cad[1]= 1+1;
			cad[1] =r0 =2;
			v5[0] = 0;
			v5[1]= v9;
			/*   PFDROP generates the following code */
			ro2 = (double *) v28.valp;
			dl[1]= t = 1;
			for (u0=0; u0<1; u0++)
				t= dl[0-u0]= t*v28.dims[1-u0];
			r0=t=1;
			cad[3] = max(0, v28.dims[1]-abs(v5[1]));
			t=tdl[0] = t*cad[3];
			cad[2] = max(0,v28.dims[0]-abs(v5[0]));
			INIT_STARTP(v40);
			if (v5[0] > 0)
				v40.startp[0] = v5[0];
			else
				v40.startp[0] = 0;
			if (v5[1] > 0)
				v40.startp[1] = v5[1];
			else
				v40.startp[1] = 0;
			v40.dims[0] = cad[2];
			v40.dims[1] = cad[3];
			memcpy (v40.orig_dims, v28.orig_dims, sizeof (v28.orig_dims));
			v40.valp = v28.valp;
			v40.reall= v40.dims[0] * v40.dims[1];
		}

		if (local_first)
		{
			r0=v40.reall;
			INIT_STARTP(v26);
			for (v1=0; v1<2; v1++)
				v26.dims[v1+0]=  v40.dims[v1+0];
			memcpy(v26.orig_dims, v26.dims, sizeof(v26.dims));
			cad[1]=r0;
			if (new=r0>v26.maxl) {
				v26.maxl=r0;
				inchp2;v26.valp= &heap[r15];
			}
			v26.reall=r0; /* 3 instructions genarated by INCHEAPP */
		}
		FORALL2_CACHE(v26) /* code for - */
		{
			if (v2 == 0)
				SLICE_DVAL(v26) = DROPPED_DVAL(v40);
			else
				SLICE_DVAL(v26) = DROPPED_DVAL(v40) - CD_DVAL(v36, -1);
		}
		/******************   LINE 8   ******************/
		/* v33 = v21 * v22 element-wise, same steps again (views v38 and
		 * v39), result in v27. */
		FORALL2_CACHE(v33)
			DVAL(v33) = DVAL(v21) * DVAL(v22);
		/*   GENSCAN generates the following code */
		if (cache_idx == 0)
		{
			p2 = (double *) v28.valp;
			ro2 = (double *) v33.valp;
			INIT_STARTP(v28);
			for (v1=0; v1<2; v1++)
				v28.orig_dims[v1+0] = v28.dims[v1+0]=  v33.dims[v1+0];
			r0 =   v33.reall;
			cad[1]=r0;
			if (new=r0>v28.maxl) {
				v28.maxl=r0;
				inchp2;v28.valp= &heap[r15];
			}
			v28.reall=r0; /* 3 instructions genarated by INCHEAPP */
			p2 = (double *) v28.valp;
			if (r0==0) goto l8;
			r0=v33.dims[1];
			r1=v33.dims[0];
		}
		FORAXIS0_CACHE(v28)
		{
			v2 = 0;
			d = 0;
			FORAXIS1(v28)
				DVAL(v28)=d=d+DVAL(v33);
		}
l8:
		if (cache_idx == 0)
			v9 =  -v19; /* code for - */
		/***** PFCATENA *****/
		if (cache_idx == 0)
		{
			r0 = cad[1]= 1+1;
			cad[1] =r0 =2;
			v5[0] = 0;
			v5[1]= v9;
			/*   PFDROP generates the following code */
			ro2 = (double *) v28.valp;
			dl[1]= t = 1;
			for (u0=0; u0<1; u0++)
				t= dl[0-u0]= t*v28.dims[1-u0];
			r0=t=1;
			cad[3] = max(0, v28.dims[1]-abs(v5[1]));
			t=tdl[0] = t*cad[3];
			cad[2] = max(0,v28.dims[0]-abs(v5[0]));
			INIT_STARTP(v38);
			if (v5[0] > 0)
				v38.startp[0] = v5[0];
			else
				v38.startp[0] = 0;
			if (v5[1] > 0)
				v38.startp[1] = v5[1];
			else
				v38.startp[1] = 0;
			v38.dims[0] = cad[2];
			v38.dims[1] = cad[3];
			memcpy (v38.orig_dims, v28.orig_dims, sizeof (v28.orig_dims));
			v38.valp = v28.valp;
			v38.reall= v38.dims[0] * v38.dims[1];
		}
		/***** PFCATENA *****/

		if (cache_idx == 0)
			v9 = v19-1; /* code for - */
		/***** PFCATENA *****/
		if (cache_idx == 0)
		{
			r0 = cad[1]= 1+1;
			cad[1] =r0 =2;
			v5[0] = 0;
			v5[1]= v9;
			/*   PFDROP generates the following code */
			ro2 = (double *) v28.valp;
			dl[1]= t = 1;
			for (u0=0; u0<1; u0++)
				t= dl[0-u0]= t*v28.dims[1-u0];
			r0=t=1;
			cad[3] = max(0, v28.dims[1]-abs(v5[1]));
			t=tdl[0] = t*cad[3];
			cad[2] = max(0,v28.dims[0]-abs(v5[0]));
			INIT_STARTP(v39);
			if (v5[0] > 0)
				v39.startp[0] = v5[0];
			else
				v39.startp[0] = 0;
			if (v5[1] > 0)
				v39.startp[1] = v5[1];
			else
				v39.startp[1] = 0;
			v39.dims[0] = cad[2];
			v39.dims[1] = cad[3];
			memcpy (v39.orig_dims, v28.orig_dims, sizeof (v28.orig_dims));
			v39.valp = v28.valp;
			v39.reall= v39.dims[0] * v39.dims[1];
		}

		if (local_first)
		{
			r0=v39.reall;
			INIT_STARTP(v27);
			for (v1=0; v1<2; v1++)
				v27.dims[v1+0]=v39.dims[v1+0];
			memcpy(v27.orig_dims, v27.dims, sizeof(v27.dims));
			cad[1]=r0;
			if (new=r0>v27.maxl) {
				v27.maxl=r0;
				inchp2;v27.valp= &heap[r15];
			}
			v27.reall=r0; /* 3 instructions genarated by INCHEAPP */
		}
		FORALL2_CACHE(v27) /* code for - */
		{
			if (v2 == 0)
				SLICE_DVAL(v27) = DROPPED_DVAL(v39);
			else
				SLICE_DVAL(v27) = DROPPED_DVAL(v39) - CD_DVAL(v38, -1);
		}
		/******************   LINE 9   ******************/
		if (cache_idx == 0)
		{
			/* Result descriptor v18, shaped like v25 and allocated from
			 * the heap. */
			r0=v25.reall;
			cad[31]=r0;
			for (v1=0; v1<2; v1++)
				cad[32+v1]=  v25.dims[v1+0];
			lo2 = (double *) v25.valp;
			INIT_STARTP(v18);
			for (v1=0; v1<2; v1++)
				v18.dims[v1+0]=v25.dims[v1+0];
			cad[1]=r0;
			if (new=r0>v18.maxl) {
				v18.maxl=r0;
				inchp2;v18.valp= &heap[r15];
			}
			v18.reall=r0; /* 3 instructions genarated by INCHEAPP */
			memcpy(v18.orig_dims, v18.dims, sizeof(v18.dims));
		}
		/*
		 * Element-wise combination of the five intermediate arrays, with
		 * S23, S25 built from v21, v22; S24, S26 from their squares; S27
		 * from their product; and N = v19:
		 *
		 *   a      = sqrt(|S26/N - (S25/N)^2|)
		 *   b      = |sqrt((S23/N)^2 + S24/N)|
		 *   c      = S27/N - S23*S25/(N*N)
		 *   result = c / (a*b)
		 *
		 * Each division yields 1 when both operands are 0.
		 * NOTE(review): b adds its two terms where a subtracts them, and
		 * applies fabs after sqrt instead of before - confirm against the
		 * original APL expression.
		 * NOTE(review): num and v9 are shared between threads and are
		 * updated here without synchronization.
		 */
		FORALL2_CACHE(v25)
		{
			double tmp;
			double tmp2;
			double tmp3;
			if (unlikely(SLICE_DVAL(v25)==0 && 0==(double)v19)) tmp =  1;
			else
				tmp = SLICE_DVAL(v25)/(double)v19;
			tmp = tmp*tmp;
			if (unlikely(SLICE_DVAL(v26)==0 && 0==(double)v19)) tmp2 =  1;
			else
				tmp2 = SLICE_DVAL(v26)/(double)v19;
			tmp = tmp2-tmp;
			tmp =  fabs(tmp);
			tmp = sqrt(tmp);
			if (unlikely(SLICE_DVAL(v23)==0 && 0==(double)v19)) tmp2 =  1;
			else
				tmp2 = SLICE_DVAL(v23)/(double)v19;
			tmp2 = tmp2 * tmp2;
			if (unlikely(SLICE_DVAL(v24)==0 && 0==(double)v19)) tmp3 =  1;
			else
				tmp3 = SLICE_DVAL(v24)/(double)v19;
			tmp2 = tmp2 + tmp3;
			tmp2 = sqrt(tmp2);
			tmp2 = fabs(tmp2);
			tmp = tmp * tmp2;
			v9 = v19*v19; /* code for * */
			tmp2 = SLICE_DVAL(v23) * SLICE_DVAL(v25);
			if (unlikely(tmp2==0 && 0==(double)v9))  tmp2 =  1;
			else
				tmp2 = tmp2/(double)v9;
			if (unlikely(SLICE_DVAL(v27)==0 && 0==(double)v19)) tmp3 =  1;
			else
				tmp3 = SLICE_DVAL(v27)/(double)v19;
			tmp2 = tmp3 - tmp2;
			if (unlikely(tmp2==0 && 0==tmp))  DVAL(v18) =  1;
			else
				DVAL(v18) = tmp2/tmp;
			num++;
		}
		/* Release the iterations spinning at the top of the loop body, and
		 * record that this thread has finished its first iteration. */
		init = 1;
		local_first = 0;
	}
	fprintf(stderr, "the big loop takes %dms, there are %d iterations\n",
			apl_sec() - start_time, num);
	/* Publish the result: dimensions in cad[2..3], element count in cad[1],
	 * data pointer in fretfp. */
	for (v1=0; v1<2; v1++)
		cad[2+v1]=  v18.dims[v1+0];
	cad[1]= v18.reall;
	fprintf (stderr, "size: %d\n", v18.reall);
	fretfp= (double *) v18.valp;
}

[Index of Archives]     [Linux C Programming]     [Linux Kernel]     [eCos]     [Fedora Development]     [Fedora Announce]     [Autoconf]     [The DWARVES Debugging Tools]     [Yosemite Campsites]     [Yosemite News]     [Linux GCC]

  Powered by Linux