Inefficient stack usage

Hiroshi Shimamoto <h-shimamoto@xxxxxxxxxxxxx> · Fri, 12 Dec 2008 10:43:29 -0800

Hello,

I noticed that the stack usage of the code gcc-4.x generated looks inefficient
on x86 and x86_64. I found this looking the assemble code of Linux kernel.
Is this inefficient stack usage a regression?
I made a simple test case.

test.c:
#define copy_from_asm(x, addr, err)	\
asm volatile(				\
	"1:\tmovl %2, %1\n"		\
	"2:\n"				\
	".section .fixup,\"ax\"\n"	\
	"\txor %1,%1\n"			\
	"\tmov $1,%0\n"			\
	"\tjmp 2b\n"			\
	".previous\n"			\
	: "=r" (err), "=r" (x)		\
	: "m" (*(int*)(addr)))

#define copy_from(x, addr, err)	do {		\
	(err) = 0;				\
	copy_from_asm((x), (addr), (err));	\
} while (0)

#define copy(x, addr)	({		\
	int __err;			\
	copy_from((x), (addr), __err);	\
	__err;				\
})

int src[32];
int dst[32];

#define my_copy(x)	do { err |= copy(dst[x], &src[x]); } while (0)

int test(void)
{
	int err = 0;

	my_copy( 0); my_copy( 1); my_copy( 2); my_copy( 3);
	my_copy( 4); my_copy( 5); my_copy( 6); my_copy( 7);
	my_copy( 8); my_copy( 9); my_copy(10); my_copy(11);
	my_copy(12); my_copy(13); my_copy(14); my_copy(15);
	my_copy(16); my_copy(17); my_copy(18); my_copy(19);
	my_copy(20); my_copy(21); my_copy(22); my_copy(23);
	my_copy(24); my_copy(25); my_copy(26); my_copy(27);
	my_copy(28); my_copy(29); my_copy(30); my_copy(31);

	return err;
}

I compiled this gcc-3.4.6, 4.2.4, 4.3.2 and 4.4.0-20081205 with compile
option "-g -Os -mno-red-zone".
The code size of objects below.
$ size test.o.*
   text	   data	    bss	    dec	    hex	filename
    879	      0	      0	    879	    36f	test.o.34
   1085	      0	      0	   1085	    43d	test.o.42
   1061	      0	      0	   1061	    425	test.o.43
   1137	      0	      0	   1137	    471	test.o.44

gcc-3.4.6 generates
   0:	8b 15 00 00 00 00    	mov    0x0(%rip),%edx        # 6 <test+0x6>
   6:	89 15 00 00 00 00    	mov    %edx,0x0(%rip)        # c <test+0xc>
   c:	8b 15 00 00 00 00    	mov    0x0(%rip),%edx        # 12 <test+0x12>
  12:	89 15 00 00 00 00    	mov    %edx,0x0(%rip)        # 18 <test+0x18>
  18:	09 c8                	or     %ecx,%eax
  1a:	8b 15 00 00 00 00    	mov    0x0(%rip),%edx        # 20 <test+0x20>
  20:	89 15 00 00 00 00    	mov    %edx,0x0(%rip)        # 26 <test+0x26>
  26:	09 c8                	or     %ecx,%eax
  28:	8b 15 00 00 00 00    	mov    0x0(%rip),%edx        # 2e <test+0x2e>
  2e:	89 15 00 00 00 00    	mov    %edx,0x0(%rip)        # 34 <test+0x34>
  34:	09 c8                	or     %ecx,%eax
...

and gcc-4.2 generates
   0:	41 57                	push   %r15
   2:	41 56                	push   %r14
   4:	41 55                	push   %r13
   6:	41 54                	push   %r12
   8:	55                   	push   %rbp
   9:	53                   	push   %rbx
   a:	48 83 ec 48          	sub    $0x48,%rsp
   e:	8b 05 00 00 00 00    	mov    0x0(%rip),%eax        # 14 <test+0x14>
  14:	89 14 24             	mov    %edx,(%rsp)
  17:	89 05 00 00 00 00    	mov    %eax,0x0(%rip)        # 1d <test+0x1d>
  1d:	8b 15 00 00 00 00    	mov    0x0(%rip),%edx        # 23 <test+0x23>
  23:	89 15 00 00 00 00    	mov    %edx,0x0(%rip)        # 29 <test+0x29>
  29:	8b 15 00 00 00 00    	mov    0x0(%rip),%edx        # 2f <test+0x2f>
...
 1d8:	0b 44 24 04          	or     0x4(%rsp),%eax
 1dc:	0b 44 24 08          	or     0x8(%rsp),%eax
 1e0:	0b 44 24 0c          	or     0xc(%rsp),%eax
 1e4:	0b 44 24 10          	or     0x10(%rsp),%eax
 1e8:	0b 44 24 14          	or     0x14(%rsp),%eax
...

On gcc-4.x error values stored to stack and at last "or" all stored data.
On the other hand gcc-3.4.6, error value is evaluated each time.

Thanks,
Hiroshi