Hello, I noticed that the stack usage of the code generated by gcc-4.x looks inefficient on x86 and x86_64. I found this while looking at the assembly code of the Linux kernel. Is this inefficient stack usage a regression? I made a simple test case. test.c: #define copy_from_asm(x, addr, err) \ asm volatile( \ "1:\tmovl %2, %1\n" \ "2:\n" \ ".section .fixup,\"ax\"\n" \ "\txor %1,%1\n" \ "\tmov $1,%0\n" \ "\tjmp 2b\n" \ ".previous\n" \ : "=r" (err), "=r" (x) \ : "m" (*(int*)(addr))) #define copy_from(x, addr, err) do { \ (err) = 0; \ copy_from_asm((x), (addr), (err)); \ } while (0) #define copy(x, addr) ({ \ int __err; \ copy_from((x), (addr), __err); \ __err; \ }) int src[32]; int dst[32]; #define my_copy(x) do { err |= copy(dst[x], &src[x]); } while (0) int test(void) { int err = 0; my_copy( 0); my_copy( 1); my_copy( 2); my_copy( 3); my_copy( 4); my_copy( 5); my_copy( 6); my_copy( 7); my_copy( 8); my_copy( 9); my_copy(10); my_copy(11); my_copy(12); my_copy(13); my_copy(14); my_copy(15); my_copy(16); my_copy(17); my_copy(18); my_copy(19); my_copy(20); my_copy(21); my_copy(22); my_copy(23); my_copy(24); my_copy(25); my_copy(26); my_copy(27); my_copy(28); my_copy(29); my_copy(30); my_copy(31); return err; } I compiled this with gcc-3.4.6, 4.2.4, 4.3.2 and 4.4.0-20081205, using the compile options "-g -Os -mno-red-zone". The code sizes of the resulting objects are shown below. 
$ size test.o.* text data bss dec hex filename 879 0 0 879 36f test.o.34 1085 0 0 1085 43d test.o.42 1061 0 0 1061 425 test.o.43 1137 0 0 1137 471 test.o.44 gcc-3.4.6 generates 0: 8b 15 00 00 00 00 mov 0x0(%rip),%edx # 6 <test+0x6> 6: 89 15 00 00 00 00 mov %edx,0x0(%rip) # c <test+0xc> c: 8b 15 00 00 00 00 mov 0x0(%rip),%edx # 12 <test+0x12> 12: 89 15 00 00 00 00 mov %edx,0x0(%rip) # 18 <test+0x18> 18: 09 c8 or %ecx,%eax 1a: 8b 15 00 00 00 00 mov 0x0(%rip),%edx # 20 <test+0x20> 20: 89 15 00 00 00 00 mov %edx,0x0(%rip) # 26 <test+0x26> 26: 09 c8 or %ecx,%eax 28: 8b 15 00 00 00 00 mov 0x0(%rip),%edx # 2e <test+0x2e> 2e: 89 15 00 00 00 00 mov %edx,0x0(%rip) # 34 <test+0x34> 34: 09 c8 or %ecx,%eax ... and gcc-4.2 generates 0: 41 57 push %r15 2: 41 56 push %r14 4: 41 55 push %r13 6: 41 54 push %r12 8: 55 push %rbp 9: 53 push %rbx a: 48 83 ec 48 sub $0x48,%rsp e: 8b 05 00 00 00 00 mov 0x0(%rip),%eax # 14 <test+0x14> 14: 89 14 24 mov %edx,(%rsp) 17: 89 05 00 00 00 00 mov %eax,0x0(%rip) # 1d <test+0x1d> 1d: 8b 15 00 00 00 00 mov 0x0(%rip),%edx # 23 <test+0x23> 23: 89 15 00 00 00 00 mov %edx,0x0(%rip) # 29 <test+0x29> 29: 8b 15 00 00 00 00 mov 0x0(%rip),%edx # 2f <test+0x2f> ... 1d8: 0b 44 24 04 or 0x4(%rsp),%eax 1dc: 0b 44 24 08 or 0x8(%rsp),%eax 1e0: 0b 44 24 0c or 0xc(%rsp),%eax 1e4: 0b 44 24 10 or 0x10(%rsp),%eax 1e8: 0b 44 24 14 or 0x14(%rsp),%eax ... With gcc-4.x, the error values are stored on the stack, and all of the stored values are "or"ed together at the end. With gcc-3.4.6, on the other hand, the error value is evaluated each time. Thanks, Hiroshi