I have been studying the asm generated for a typical clamping function, and I am confused by the results. It was compiled on an Opteron 6k series machine with `-fverbose-asm`, `-O3` and `-march=native`.

```c
float clamp(float const x, float const min, float const max)
{
#if defined (BRANCH)
    if ( x > max ) return max;
    else if ( x < min ) return min;
    else return x;
#elif defined (BRANCH2)
    return x > max ? max : ( x < min ? min : x );
#elif defined (CALL)
    return __builtin_fminf(__builtin_fmaxf(x, min), max);
#else
    float const t = x < min ? min : x;
    return t> max ? max : t;
#endif
}
```

**-DBRANCH / -DBRANCH2:** The first two approaches are obviously identical, and both produce:

```
clamp:
.LFB0:
    .cfi_startproc
    vucomiss %xmm2, %xmm0 # max, x
    ja .L3 #,
    vmaxss %xmm0, %xmm1, %xmm0 # x, min, D.2214
    ret
    .p2align 4,,7
    .p2align 3
.L3:
    vmovaps %xmm2, %xmm0 # max, D.2214
    ret
    .cfi_endproc
```

**-DCALL:** I figured this one would be great, given the use of builtins, but it emits actual library calls:

```
clamp:
.LFB0:
    .cfi_startproc
    subq $24, %rsp #,
    .cfi_def_cfa_offset 32
    vmovss %xmm2, 12(%rsp) # max, %sfp
    call fmaxf #
    vmovss 12(%rsp), %xmm2 # %sfp, max
    addq $24, %rsp #,
    .cfi_def_cfa_offset 8
    vmovaps %xmm2, %xmm1 # max,
    jmp fminf #
    .cfi_endproc
```

**No macro defined (the `#else` branch):** This appears to be the best of them all: just a couple of instructions, with no branches and no calls:

```
.LFB0:
    .cfi_startproc
    vmaxss %xmm0, %xmm1, %xmm0 # x, min, D.2219
    vminss %xmm0, %xmm2, %xmm0 # D.2219, max, D.2219
    ret
    .cfi_endproc
```

So I'm curious: why is the last approach (two sequential ternaries) optimized better than the naive approach of nested if statements, which still contains a branch?