The snippet of code below is part of a much larger module and was compiled on an FC6 system with gcc 4.1.1 20061011 on Linux, kernel 2.6.18-1.2869. To say that we were disappointed with the emitted assembler would be an understatement. Compile options were -O3 -fomit-frame-pointer -march=i686 -fPIC

    void (__attribute__((regparm(2))) z900_load) (BYTE inst[], REGS *regs)
    {
        int r1;
        int b2;
        U64 effective_addr2;

        U32 temp = bswap_32(*(U32*)inst);
        r1 = (temp >> 20) & 0xf;
        b2 = (temp >> 16) & 0xf;
        effective_addr2 = temp & 0xfff;
        if (b2) effective_addr2 += regs->gr[b2].D;   // U64
        b2 = (temp >> 12) & 0xf;
        if (b2) effective_addr2 += regs->gr[b2].D;   // U64
        effective_addr2 &= regs->psw.amask.D;        // U64
        regs->ip += 4;
        regs->ilc = 4;
        if ((effective_addr2 & 3) == 0)
        . . . .

The assembler is below with noted lines (the bracketed numbers count instructions only; labels and the #APP/#NO_APP markers are not counted):

        z900_load:
            pushl   %ebp
            pushl   %edi
            xorl    %edi, %edi
            pushl   %esi
            subl    $96, %esp
            movl    (%eax), %eax
    [ 7]    movl    %edx, 24(%esp)
    [ 8]    movl    %eax, 28(%esp)
        #APP
            bswap   %eax
        #NO_APP
            movl    %eax, %ecx
    [11]    movl    %eax, 28(%esp)
    [12]    movl    28(%esp), %eax
            shrl    $16, %ecx
            movl    %eax, %esi
            movl    %ecx, %eax
            andl    $4095, %esi
            andl    $15, %eax
            je      .L13528
    [19]    movl    24(%esp), %edx
            addl    80(%edx,%eax,8), %esi
            adcl    84(%edx,%eax,8), %edi
        .L13528:
            movl    28(%esp), %eax
            shrl    $12, %eax
            andl    $15, %eax
            movl    %eax, 76(%esp)
            je      .L13530
            movl    %eax, %ecx
    [28]    movl    24(%esp), %eax
            addl    80(%eax,%ecx,8), %esi
            adcl    84(%eax,%ecx,8), %edi
        .L13530:
    [31]    movl    24(%esp), %edx
            movl    32(%edx), %ecx
            addl    $4, 44(%edx)
            andl    %esi, %ecx
            movl    %ecx, 64(%esp)
            movl    36(%edx), %eax
            andl    %edi, %eax
            movl    %eax, 68(%esp)
            movb    $4, 42(%edx)
            movl    64(%esp), %eax
    [41]    xorl    %edx, %edx
      .     movl    %edx, %ecx
      .     andl    $3, %eax
      .     orl     %eax, %ecx
    [45]    jne     .L13602

On entry, %eax points to inst, %edx points to REGS. Variable regs (%edx) is stacked on line 7 and is reloaded from the stack in lines 19, 28 and 31 despite %edx not being clobbered until line 41.
The 4-byte value pointed to by inst (%eax) is loaded into %eax and then stacked before bswap (line 8), then stacked again after bswap (line 11). To add insult to injury, line 12 reloads %eax from the stack.

Lines 41..45 all deal with trying to figure out whether the low-order 2 bits of effective_addr2 are zero. All I can say is, WTF? I can get around this one by casting effective_addr2 to U32, after which testl/jne is emitted, but I shouldn't have to do this. Does anyone have any explanations?

I was drawn to this particular code because an automated benchmark started flagging this routine after its performance decreased so much.

Greg Smith