Intel x86 optimization question (longish)

Jens Kilian <Jens_Kilian@xxxxxxxxxxx> · Fri, 28 Jan 2005 14:54:56 +0100

I'm trying to get GCC to emit the best possible code for the following
C source, which is supposed to represent the inner (dispatching) loop
of a virtual machine for a functional or logical language:

---8<--------------------------------------------------------------------8<---
/* Shorthand for GCC's regparm(2) function attribute; it is applied to each
   function-pointer member of struct T.
   NOTE(review): presumably this requests that up to two arguments be passed
   in registers instead of on the stack -- confirm against the GCC manual. */
#define REGPARM __attribute__((regparm(2)))

/* Record of three handler function pointers.  Every handler returns a
   pointer to a (const) struct T, which the loops foo/bar/baz use as the
   record to dispatch through next; a null return ends those loops. */
struct T
{
  /* Handler taking no arguments (used by foo). */
  const struct T * REGPARM (*f1)(void);
  /* Handler taking a pointer to a struct T (used by bar). */
  const struct T * REGPARM (*f2)(const struct T *);
  /* Handler taking a pointer to a struct T and a void pointer (used by baz). */
  const struct T * REGPARM (*f3)(const struct T *, void *);
};

/* Dispatch loop, zero-argument variant.
   Starting from record t, repeatedly calls the record's f1 handler and
   continues with the record that handler returns.  The loop ends when a
   handler returns a null pointer; if t is null on entry nothing is called. */
void
foo(const struct T *t)
{
  while (t) {
    /* The handler receives no arguments; its result becomes the next record. */
    t = (t->f1)();
  }
}

/* Dispatch loop, one-argument variant.
   Starting from record t, repeatedly calls the record's f2 handler, passing
   the current record itself, and continues with the record that handler
   returns.  The loop ends when a handler returns a null pointer; if t is
   null on entry nothing is called. */
void
bar(const struct T *t)
{
  while (t) {
    /* The current record is both the source of the handler and its argument. */
    t = (t->f2)(t);
  }
}

/* Dispatch loop, two-argument variant.
   Starting from record t, repeatedly calls the record's f3 handler, passing
   the current record and p, and continues with the record that handler
   returns.  p is forwarded unchanged on every iteration and is never
   dereferenced here.  The loop ends when a handler returns a null pointer;
   if t is null on entry nothing is called. */
void
baz(const struct T *t, void *p)
{
  while (t) {
    /* The current record is the first argument; p is passed through as-is. */
    t = (t->f3)(t, p);
  }
}
---8<--------------------------------------------------------------------8<---

GCC (3.2.3) with -O3 generates the following code for the three loops:

---8<--------------------------------------------------------------------8<---
.L11:
        call    *(%eax)          t = (t->f1)();

        testl   %eax, %eax       while (t)
        jne     .L11
...
.L21:
        movl    %edx, %eax	 t = (t->f2)(t);
        call    *4(%edx)
        movl    %eax, %edx

        testl   %edx, %edx       while (t)
        jne     .L21
...
.L31:
        movl    %ecx, %eax	t = (t->f3)(t, p);
        movl    %ebx, %edx
        call    *8(%ecx)
        movl    %eax, %ecx

        testl   %ecx, %ecx	while (t)
        jne     .L31
---8<--------------------------------------------------------------------8<---

In the second and third cases, the return value is unnecessarily moved from
%eax to a temporary register.  Indeed, when using "-O3 -funroll-all-loops",
the second one looks like this:

---8<--------------------------------------------------------------------8<---
.L29:
        movl    %edx, %eax
        call    *4(%edx)
        testl   %eax, %eax
        je      .L31
        call    *4(%eax)
        testl   %eax, %eax
        je      .L31
        call    *4(%eax)
        testl   %eax, %eax
        je      .L31
        call    *4(%eax)
        testl   %eax, %eax
        je      .L31
        call    *4(%eax)
        testl   %eax, %eax
        je      .L31
        call    *4(%eax)
        testl   %eax, %eax
        je      .L31
        call    *4(%eax)
        testl   %eax, %eax
        je      .L31
        call    *4(%eax)
        testl   %eax, %eax
        movl    %eax, %edx
        jne     .L29
.L31:
---8<--------------------------------------------------------------------8<---

Is there any better way to eliminate the redundancy (apart from using inline
assembler ;-)?

Thanks,
	Jens.
-- 
mailto:jjk@xxxxxxx                 phone:+49-7031-464-7698 (TELNET 778-7698)
  http://www.bawue.de/~jjk/          fax:+49-7031-464-7351
                                   As the air to a bird, or the sea to a fish,
                                   so is contempt to the contemptible. [Blake]