Generated ASM of a typical clamp

NightStrike <nightstrike@xxxxxxxxx> · Mon, 20 Oct 2014 19:18:26 -0400

I have been studying the asm generated by a typical clamping function,
and I am confused about the results.  This is done on an Opteron 6k
series compiled with -fverbose-asm, -O3 and -march=native.

/*
 * Clamp x against a lower bound (min) and an upper bound (max).
 *
 * Four alternative bodies are selected at compile time:
 *   -DBRANCH    nested if / else-if / else; upper bound is tested first
 *   -DBRANCH2   the same two tests as BRANCH, as nested ?: expressions
 *   -DCALL      fmaxf followed by fminf through the compiler builtins
 *   (default)   two sequential ?: expressions; lower bound is applied first
 *
 * NOTE(review): the variants are only interchangeable when min <= max.
 * With min > max and x <= max, BRANCH/BRANCH2 return min, whereas the
 * default variant first produces t == min and then returns max because
 * t > max. This may be why the compiler cannot lower BRANCH to the same
 * code as the default variant -- confirm.
 */
float clamp(float const x, float const min, float const max) {
#if defined (BRANCH)
  /* Upper bound wins outright; the lower bound is only checked when
     x > max is false. */
  if ( x > max )
    return max;
  else if ( x < min )
    return min;
  else
    return x;
#elif defined (BRANCH2)
  /* Same decision tree as BRANCH, written as one expression. */
  return x > max ? max : ( x < min ? min : x );
#elif defined (CALL)
  /* Raise x to at least min, then cap the result at max. */
  return __builtin_fminf(__builtin_fmaxf(x, min), max);
#else
  /* t is x raised to at least min; the upper bound is then applied to t,
     not to the original x. */
  float const t = x < min ? min : x;
  return t> max ? max : t;
#endif
}

-DBRANCH / -DBRANCH2:
The first two approaches are obviously identical, and produce:

# clamp() as generated for -DBRANCH / -DBRANCH2 (AT&T syntax).
# Register roles, per the -fverbose-asm annotations below:
#   %xmm0 = x (also holds the result at each ret), %xmm1 = min, %xmm2 = max
# NOTE(review): presumably the System V AMD64 ABI -- confirm.
clamp:
.LFB0:
        .cfi_startproc
        # Upper-bound test is kept as a compare + branch: x > max ?
        vucomiss        %xmm2, %xmm0    # max, x
        ja      .L3     #,
        # Fall-through (x > max was false): the lower-bound test has been
        # turned into a single max instruction, i.e. x < min ? min : x.
        vmaxss  %xmm0, %xmm1, %xmm0     # x, min, D.2214
        ret
        # Alignment padding ahead of the branch target.
        .p2align 4,,7
        .p2align 3
.L3:
        # x > max: return max.
        vmovaps %xmm2, %xmm0    # max, D.2214
        ret
        .cfi_endproc

-DCALL:
This one I figured would be great, given the use of builtins:

# clamp() as generated for -DCALL (AT&T syntax): both builtins were emitted
# as real calls to fmaxf and fminf instead of being expanded inline.
# NOTE(review): register roles assume the System V AMD64 ABI
# (%xmm0 = x, %xmm1 = min, %xmm2 = max on entry) -- confirm.
clamp:
.LFB0:
        .cfi_startproc
        # Reserve 24 bytes; with the 8-byte return address the CFA offset
        # becomes 32, keeping the stack 16-byte aligned at the call.
        subq    $24, %rsp       #,
        .cfi_def_cfa_offset 32
        # Spill max: it is needed after the call, and %xmm2 is not
        # preserved across calls under the System V ABI.
        vmovss  %xmm2, 12(%rsp) # max, %sfp
        # fmaxf(x, min): no argument setup is emitted before the call.
        call    fmaxf   #
        # Reload max and release the frame before leaving.
        vmovss  12(%rsp), %xmm2 # %sfp, max
        addq    $24, %rsp       #,
        .cfi_def_cfa_offset 8
        # max becomes the second argument; %xmm0 still holds fmaxf's result.
        vmovaps %xmm2, %xmm1    # max,
        # Tail call: fminf returns directly to clamp's caller.
        jmp     fminf   #
        .cfi_endproc

But then, with none of the macros defined (the final #else variant), we
have what appears to be the best of them all -- just a couple of
instructions, no branches, no calls, nothing:

# clamp() with none of BRANCH / BRANCH2 / CALL defined -- presumably the
# default (#else) variant; confirm. AT&T syntax.
# NOTE(review): the `clamp:` label is not shown in this listing.
# Register roles, per the -fverbose-asm annotations below:
#   %xmm0 = x, then the intermediate/result D.2219; %xmm1 = min; %xmm2 = max
.LFB0:
        .cfi_startproc
        # t = x < min ? min : x
        vmaxss  %xmm0, %xmm1, %xmm0     # x, min, D.2219
        # result = t > max ? max : t
        vminss  %xmm0, %xmm2, %xmm0     # D.2219, max, D.2219
        ret
        .cfi_endproc

So I'm curious.... why is the last approach optimized better than the
naive approach of some nested if statements?