Re: gcc 4.1.1 poor optimization

On Wed, 2007-01-10 at 15:30 -0800, Ian Lance Taylor wrote:
> Greg Smith <gsmith@xxxxxxxxx> writes:
> 
> > [ 7]    movl    %edx, 24(%esp)
> > [ 8]    movl    %eax, 28(%esp)
> > #APP
> >         bswap %eax
> > #NO_APP
> 
> This makes it look like the bswap asm clobbered %edx.
> 
> Can you post a small complete standalone preprocessed test case?  It's
> quite difficult to analyze an incomplete one.  Thanks.
> 

Thanks for the quick response!!

The test case below may not be as small as you want, but it does show a
couple of the issues (though not all of them).

typedef unsigned char BYTE;
typedef unsigned long U32;
typedef unsigned long long U64;

/* byte-swap a 32-bit value via the x86 bswap instruction (GNU C statement expression) */
#define my_bswap_32(x) \
 (__extension__ \
  ({ register unsigned int __v, __x = (x); \
   __asm__ ("bswap %0" : "=r" (__v) : "0" (__x)); \
   __v; }))

struct REGS {
 U64   gr[16];
 U64   amask;
 void *ip;
 BYTE  ilc;
};
typedef struct REGS REGS;

int (__attribute__ (( regparm(2) )) z900_load) (BYTE inst[], REGS *regs)
{
int r1;
int b2;
U64 effective_addr2;

    U32 temp = my_bswap_32(*(U32*)inst);        /* fetch the 4-byte instruction and byte-swap to host order */
    r1 = (temp >> 20) & 0xf;
    b2 = (temp >> 16) & 0xf;                    /* index register number */
    effective_addr2 = temp & 0xfff;             /* displacement */
    if (b2) effective_addr2 += regs->gr[b2];
    b2 = (temp >> 12) & 0xf;                    /* base register number */
    if (b2) effective_addr2 += regs->gr[b2];
    effective_addr2 &= regs->amask;             /* apply addressing-mode mask */
    regs->ip += 4;                              /* advance past the 4-byte instruction */
    regs->ilc = 4;
    if ((effective_addr2 & 3) == 0)
        return 1;
    return 0;
}
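
For what it's worth, a hypothetical driver can be appended to the end of
testload.c to sanity-check the function at run time.  It is not needed to
reproduce the code generation, it assumes a 32-bit x86 build (because of the
regparm attribute and the bswap asm), and the instruction bytes and register
values are just illustrative:

#include <stdio.h>
#include <string.h>

int main(void)
{
    REGS regs;
    BYTE inst[4] = { 0x58, 0x12, 0x30, 0x04 };  /* r1=1, index=2, base=3, disp=0x004 */
    int rc;

    memset(&regs, 0, sizeof regs);
    regs.amask = 0x7fffffff;                    /* 31-bit address mask */
    regs.gr[2] = 0x100;                         /* index register contents */
    regs.gr[3] = 0x200;                         /* base register contents  */

    /* effective address should be 0x100 + 0x200 + 4 = 0x304, word aligned */
    rc = z900_load(inst, &regs);
    printf("rc=%d (expected 1), ilc=%d\n", rc, (int)regs.ilc);
    return 0;
}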

compiled with gcc -O3 -fomit-frame-pointer -fPIC -S testload.c

The assembler output:

z900_load:
        subl    $16, %esp
        movl    %esi, 4(%esp)
        movl    %edi, 8(%esp)
        xorl    %edi, %edi
        movl    %ebp, 12(%esp)
        movl    (%eax), %ebp
#APP
        bswap %ebp
#NO_APP
        movl    %ebp, %ecx
        movl    %ebp, %esi
        shrl    $16, %ecx
        andl    $4095, %esi
        movl    %ecx, %eax
        andl    $15, %eax
[14]    movl    %edx, (%esp)
        je      .L2
[16]    movl    (%esp), %edx
        addl    (%edx,%eax,8), %esi
        adcl    4(%edx,%eax,8), %edi
.L2:
        shrl    $12, %ebp
        movl    %ebp, %eax
        andl    $15, %eax
        je      .L4
[23]    movl    (%esp), %ecx
        addl    (%ecx,%eax,8), %esi
        adcl    4(%ecx,%eax,8), %edi
.L4:
[26]    movl    (%esp), %ecx
[27]    xorl    %edx, %edx
        andl    %edi, %edx
        movl    128(%ecx), %eax
        addl    $4, 136(%ecx)
        movb    $4, 140(%ecx)
        movl    8(%esp), %edi
        andl    $3, %eax
        movl    12(%esp), %ebp
        andl    %esi, %eax
        movl    4(%esp), %esi
        orl     %edx, %eax
        sete    %al
        addl    $16, %esp
        movzbl  %al, %eax
        ret

The original %edx (the regs argument, which regparm(2) passes in %edx) is
spilled to the stack on line 14 and reloaded on lines 16, 23 and 26, until
it is finally clobbered on line 27.  Line 27 seems to be related to the fix
you've been working on.
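
For what it's worth, the asm in my_bswap_32 declares only its single operand
as modified (the "=r" output tied to the "0" input), so the macro itself
should not be telling gcc that %edx is clobbered.  A hypothetical equivalent
spelling with one read-write operand (just for illustration, not part of the
test case) would be:

#define my_bswap_32_rw(x) \
 (__extension__ \
  ({ register unsigned int __v = (x); \
   __asm__ ("bswap %0" : "+r" (__v)); \
   __v; }))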

Greg Smith

