register allocation on IA64

<john.gill@xxxxxxxxxxxx> · Thu, 17 Mar 2005 17:24:43 -0000

I have a simple piece of code:

/*
 * Return the sum of the six consecutive longs ptr[0] .. ptr[5].
 * The sum is written as one left-to-right expression; this is the
 * form whose generated code chains every add through r8.
 */
long
addem (long *ptr)
{
    return (ptr[0] + ptr[1] + ptr[2] + ptr[3] + ptr[4] + ptr[5]);   
}

This generates interleaved load and add instructions in a fairly 
efficient way, using separate registers to hold the loaded values.
However, the additions of the loaded values are all serialized through
r8 (the return value register).
These are the relevant instructions from the assembler output below:
	add r8 = r8, r16
	add r8 = r8, r17
	add r8 = r8, r18
	add r8 = r8, r15
	add r8 = r8, r14

This does not seem to be the most efficient method: if the intermediate
adds were paired, with each pair writing its result to a different (or
reused) register, the insns could be packed better into the VLIW slots.
However, this probably means using more registers to hold the
intermediate sums.

It seems to me that the insn scheduler has done the best
job it can given the register allocation scheme it has inherited.
Is there any way to get gcc to be more aggressive with its register
allocation? Is there a compile time flag that I'm missing? I guess 
this is something that would be of generic benefit to all machines 
with lots of registers (I have noticed a similar problem for SPARC code). 

I can of course rewrite the code so that gcc emits more efficient insns
but this is ugly and a nightmare to support long term ... 

/*
 * Return the same sum as addem (ptr[0] .. ptr[5]), but with the
 * additions written out by hand as a tree of independent pairs
 * instead of one left-to-right chain.
 */
long
addem2 (long *ptr)
{
    /* Load each of the six elements into its own local. */
    register long val0 = ptr[0];
    register long val1 = ptr[1];
    register long val2 = ptr[2];
    register long val3 = ptr[3];
    register long val4 = ptr[4];
    register long val5 = ptr[5];

    /* First level: three pairwise sums with no dependency on each other. */
    register long val6 = val0+val1;
    register long val7 = val2+val3;
    register long val8 = val4+val5;

    /* Second level: combine two of the partial sums. */
    register long val9 = val6+val7;

    /* Final add combines the remaining partial sum with val9. */
    return (val8 + val9);   
}

Thanks in advance

John

Compiling with gcc -O2 -S add.c -o add.s gives:

add.s
==================================
	.file	"add.c"
	.pred.safe_across_calls p1-p5,p16-p63
.text
	.align 16
	// addem: sums the six 8-byte values at r32+0 .. r32+40 (ptr[0]..ptr[5])
	// and leaves the result in r8.  Every add below reads and writes r8,
	// so each one depends on the add in the previous instruction group.
	// A ";;" line is a stop that ends the current instruction group.
	.global addem#
	.proc addem#
addem:
	.prologue
	.body
	// r15 = &ptr[1], r14 = &ptr[2], r8 = ptr[0]
	adds r15 = 8, r32
	adds r14 = 16, r32
	ld8 r8 = [r32]
	;;
	// r16 = ptr[1], r17 = ptr[2], r15 = &ptr[3]
	ld8 r16 = [r15]
	ld8 r17 = [r14]
	adds r15 = 24, r32
	;;
	// r8 += ptr[1]; r14 = &ptr[4]; r18 = ptr[3]
	add r8 = r8, r16      
	adds r14 = 32, r32
	ld8 r18 = [r15]
	;;
	// r8 += ptr[2]; r32 = &ptr[5]; r15 = ptr[4]
	add r8 = r8, r17       <<<<< serialized add
	adds r32 = 40, r32
	ld8 r15 = [r14]
	;;
	// r8 += ptr[3]; r14 = ptr[5]
	add r8 = r8, r18       <<<<< serialized add
	ld8 r14 = [r32]
	;;
	// r8 += ptr[4]
	add r8 = r8, r15       <<<<< serialized add
	;;
	// r8 += ptr[5], then return through b0
	add r8 = r8, r14       <<<<< serialized add
	br.ret.sptk.many b0
	.endp addem#
	// addem2: sums the six 8-byte values at r32+0 .. r32+40 (ptr[0]..ptr[5])
	// and leaves the result in r8.  The partial sums go to r16, r15 and r8,
	// so two adds can sit in the same instruction group.
	// A ";;" line is a stop that ends the current instruction group.
	.global addem2#
	.proc addem2#
addem2:
	.prologue
	.body
	// r18 = &ptr[3], r14 = &ptr[0], r17 = &ptr[2]
	adds r18 = 24, r32
	mov r14 = r32
	adds r17 = 16, r32
	;;
	// r19 = ptr[3]; r15 = ptr[0] and r14 is post-incremented to &ptr[1];
	// r16 = ptr[2]; r18 = &ptr[5]
	ld8 r19 = [r18]
	ld8 r15 = [r14], 8
	ld8 r16 = [r17]
	adds r18 = 40, r32
	;;
	// r17 = ptr[1]; r16 = ptr[2] + ptr[3] (val7 in the C source); r32 = &ptr[4]
	ld8 r17 = [r14]
	add r16 = r16, r19       <<<<< paired add
	adds r32 = 32, r32
	;;
	// r15 = ptr[0] + ptr[1] (val6); r14 = ptr[5]; r8 = ptr[4]
	add r15 = r15, r17       <<<<< paired add
	ld8 r14 = [r18]
	ld8 r8 = [r32]
	;;
	// r15 = val6 + val7 (val9) and r8 = ptr[4] + ptr[5] (val8), in one group
	add r15 = r15, r16       <<<<< paired add
	add r8 = r8, r14         <<<<< paired add
	;;
	// r8 = val8 + val9, then return through b0
	add r8 = r8, r15
	br.ret.sptk.many b0
	.ident	"GCC: (GNU) 2.96 20000731 (Red Hat Linux 7.2 2.96-128.7.2)"