(I'll cross-post this to gcc and keep it on gcc-help after that.)

On Thu, Oct 6, 2011 at 4:46 PM, Andrew Haley <aph@xxxxxxxxxx> wrote:
>
> inline int8_t as_signed_8 (unsigned int a) {
>   a &= 0xff;
>   return a & 0x80 ? (int)a - 0x100 : a;
> }
>
> int overflow(unsigned int a, unsigned int b) {
>   int sum = as_signed_8(a) + as_signed_8(b);
>   return as_signed_8(sum) != sum;
> }
>
> Andrew.
>

That's a really neat trick, and it seems to generate identical code. Thanks!
It would be interesting to know whether this version produces equally
efficient code with MSVC.

To summarize what we have so far, here are four different methods along with
the code generated for x86 and ARM (GCC 4.5.2):

#include <inttypes.h>

inline int8_t as_signed_8(unsigned int a) {
  a &= 0xff;
  return a & 0x80 ? (int)a - 0x100 : a;
}

/* Range check: the sum of two signed bytes must fit in [-128, 127]. */
bool overflow_range(unsigned int a, unsigned int b) {
  const int sum = as_signed_8(a) + as_signed_8(b);
  return sum < -128 || sum > 127;
}

/* Sign-bit check: overflow iff a and b have the same sign bit and the
   sum's sign bit differs from it. */
bool overflow_bit(unsigned int a, unsigned int b) {
  const unsigned int sum = a + b;
  return ~(a ^ b) & (a ^ sum) & 0x80;
}

/* Relies on implementation-defined conversion to int8_t. */
bool overflow_unsafe(unsigned int a, unsigned int b) {
  const unsigned int sum = (int8_t)a + (int8_t)b;
  return (int8_t)sum != sum;
}

/* Same idea, but using only well-defined conversions. */
bool overflow_safe(unsigned int a, unsigned int b) {
  const int sum = as_signed_8(a) + as_signed_8(b);
  return as_signed_8(sum) != sum;
}

Output for x86 with -O3 -fomit-frame-pointer:

00000000 <_Z14overflow_rangejj>:
   0:	0f be 54 24 04       	movsbl 0x4(%esp),%edx
   5:	0f be 44 24 08       	movsbl 0x8(%esp),%eax
   a:	8d 84 02 80 00 00 00 	lea    0x80(%edx,%eax,1),%eax
  11:	3d ff 00 00 00       	cmp    $0xff,%eax
  16:	0f 97 c0             	seta   %al
  19:	c3                   	ret
  1a:	8d b6 00 00 00 00    	lea    0x0(%esi),%esi

00000020 <_Z12overflow_bitjj>:
  20:	8b 54 24 08          	mov    0x8(%esp),%edx
  24:	8b 4c 24 04          	mov    0x4(%esp),%ecx
  28:	89 d0                	mov    %edx,%eax
  2a:	31 c8                	xor    %ecx,%eax
  2c:	01 ca                	add    %ecx,%edx
  2e:	31 ca                	xor    %ecx,%edx
  30:	f7 d0                	not    %eax
  32:	21 d0                	and    %edx,%eax
  34:	a8 80                	test   $0x80,%al
  36:	0f 95 c0             	setne  %al
  39:	c3                   	ret
  3a:	8d b6 00 00 00 00    	lea    0x0(%esi),%esi

00000040 <_Z15overflow_unsafejj>:
  40:	0f be 54 24 08       	movsbl 0x8(%esp),%edx
  45:	0f be 44 24 04       	movsbl 0x4(%esp),%eax
  4a:	8d 04 02             	lea    (%edx,%eax,1),%eax
  4d:	0f be d0             	movsbl %al,%edx
  50:	39 c2                	cmp    %eax,%edx
  52:	0f 95 c0             	setne  %al
  55:	c3                   	ret
  56:	8d 76 00             	lea    0x0(%esi),%esi
  59:	8d bc 27 00 00 00 00 	lea    0x0(%edi,%eiz,1),%edi

00000060 <_Z13overflow_safejj>:
  60:	0f be 54 24 08       	movsbl 0x8(%esp),%edx
  65:	0f be 44 24 04       	movsbl 0x4(%esp),%eax
  6a:	8d 04 02             	lea    (%edx,%eax,1),%eax
  6d:	0f be d0             	movsbl %al,%edx
  70:	39 c2                	cmp    %eax,%edx
  72:	0f 95 c0             	setne  %al
  75:	c3                   	ret

Output for ARM with -O3 -fomit-frame-pointer -mthumb -march=armv7:

00000000 <_Z14overflow_rangejj>:
   0:	b249      	sxtb	r1, r1
   2:	b240      	sxtb	r0, r0
   4:	1808      	adds	r0, r1, r0
   6:	3080      	adds	r0, #128	; 0x80
   8:	28ff      	cmp	r0, #255	; 0xff
   a:	bf94      	ite	ls
   c:	2000      	movls	r0, #0
   e:	2001      	movhi	r0, #1
  10:	4770      	bx	lr
  12:	bf00      	nop
  14:	f3af 8000 	nop.w
  18:	f3af 8000 	nop.w
  1c:	f3af 8000 	nop.w

00000020 <_Z12overflow_bitjj>:
  20:	180b      	adds	r3, r1, r0
  22:	4041      	eors	r1, r0
  24:	ea83 0200 	eor.w	r2, r3, r0
  28:	ea22 0001 	bic.w	r0, r2, r1
  2c:	f3c0 10c0 	ubfx	r0, r0, #7, #1
  30:	4770      	bx	lr
  32:	bf00      	nop
  34:	f3af 8000 	nop.w
  38:	f3af 8000 	nop.w
  3c:	f3af 8000 	nop.w

00000040 <_Z15overflow_unsafejj>:
  40:	b242      	sxtb	r2, r0
  42:	b249      	sxtb	r1, r1
  44:	1888      	adds	r0, r1, r2
  46:	b243      	sxtb	r3, r0
  48:	1a18      	subs	r0, r3, r0
  4a:	bf18      	it	ne
  4c:	2001      	movne	r0, #1
  4e:	4770      	bx	lr

00000050 <_Z13overflow_safejj>:
  50:	b242      	sxtb	r2, r0
  52:	b249      	sxtb	r1, r1
  54:	1888      	adds	r0, r1, r2
  56:	b243      	sxtb	r3, r0
  58:	1a18      	subs	r0, r3, r0
  5a:	bf18      	it	ne
  5c:	2001      	movne	r0, #1
  5e:	4770      	bx	lr
Not sure which version would be fastest on ARM (no device handy to benchmark
on at the moment).

By the way, what is a nice way to benchmark snippets like this with
optimization on? If you call each function in a loop from a different
compilation unit, the call overhead tends to dominate. If you instead put it
in the same compilation unit and let it inline, the compiler might do things
you don't expect, which can render the benchmark useless. (A rough sketch of
one possible workaround is in the P.S. below.)

/Ulf
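P.S. Here is the kind of harness I had in mind -- just a rough, untested
sketch, and GCC-specific. The empty asm statements, the iteration count, the
use of clock() and the sample operands are my own choices, not something
discussed in this thread. The idea is to keep the function in the same
compilation unit so it can inline, but hide the operands and the result
behind empty asm barriers so the optimizer can neither constant-fold the
call nor hoist it out of the loop. It assumes the overflow_* functions above
are pasted into the same file.

#include <stdio.h>
#include <time.h>

int main(void) {
  const int iterations = 100000000;
  unsigned int a = 0x7f, b = 0x01;
  unsigned int hits = 0;

  const clock_t start = clock();
  for (int i = 0; i < iterations; ++i) {
    /* Pretend a and b may change each iteration so the call can't be
       constant-folded or hoisted out of the loop. */
    asm volatile ("" : "+r" (a), "+r" (b));
    hits += overflow_safe(a, b);
    /* Pretend hits is inspected so the accumulation isn't dead code. */
    asm volatile ("" : "+r" (hits));
  }
  const clock_t end = clock();

  printf("%u overflows, %.3f s\n", hits,
         (double)(end - start) / CLOCKS_PER_SEC);
  return 0;
}

Swapping in overflow_range, overflow_bit or overflow_unsafe should give
comparable numbers for the other variants, though with functions this tiny
the loop overhead itself may still dominate.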