(I'll cross-post this to gcc and keep it on gcc-help after that.)

On Thu, Oct 6, 2011 at 4:46 PM, Andrew Haley <aph@xxxxxxxxxx> wrote:
>
> inline int8_t as_signed_8 (unsigned int a) {
>   a &= 0xff;
>   return a & 0x80 ? (int)a - 0x100 : a;
> }
>
> int overflow(unsigned int a, unsigned int b) {
>   int sum = as_signed_8(a) + as_signed_8(b);
>   return as_signed_8(sum) != sum;
> }
>
> Andrew.
>

That's a really neat trick, and it seems to generate identical code. Thanks!
It would be interesting to know whether this version produces equally
efficient code with MSVC.

To summarize what we have so far, here are four different methods along with
the code generated for x86 and ARM (GCC 4.5.2):

#include <inttypes.h>

inline int8_t as_signed_8(unsigned int a) {
  a &= 0xff;
  return a & 0x80 ? (int)a - 0x100 : a;
}

/* Range check: the sum of two signed bytes must fit in [-128, 127]. */
bool overflow_range(unsigned int a, unsigned int b) {
  const int sum = as_signed_8(a) + as_signed_8(b);
  return sum < -128 || sum > 127;
}

/* Sign-bit check: overflow iff a and b have the same sign bit and the
   sum's sign bit differs from it. */
bool overflow_bit(unsigned int a, unsigned int b) {
  const unsigned int sum = a + b;
  return ~(a ^ b) & (a ^ sum) & 0x80;
}

/* Relies on implementation-defined conversion to int8_t. */
bool overflow_unsafe(unsigned int a, unsigned int b) {
  const unsigned int sum = (int8_t)a + (int8_t)b;
  return (int8_t)sum != sum;
}

/* Same idea, but using only well-defined conversions. */
bool overflow_safe(unsigned int a, unsigned int b) {
  const int sum = as_signed_8(a) + as_signed_8(b);
  return as_signed_8(sum) != sum;
}

Output for x86 with -O3 -fomit-frame-pointer:

00000000 <_Z14overflow_rangejj>:
   0:	0f be 54 24 04       	movsbl 0x4(%esp),%edx
   5:	0f be 44 24 08       	movsbl 0x8(%esp),%eax
   a:	8d 84 02 80 00 00 00 	lea    0x80(%edx,%eax,1),%eax
  11:	3d ff 00 00 00       	cmp    $0xff,%eax
  16:	0f 97 c0             	seta   %al
  19:	c3                   	ret
  1a:	8d b6 00 00 00 00    	lea    0x0(%esi),%esi

00000020 <_Z12overflow_bitjj>:
  20:	8b 54 24 08          	mov    0x8(%esp),%edx
  24:	8b 4c 24 04          	mov    0x4(%esp),%ecx
  28:	89 d0                	mov    %edx,%eax
  2a:	31 c8                	xor    %ecx,%eax
  2c:	01 ca                	add    %ecx,%edx
  2e:	31 ca                	xor    %ecx,%edx
  30:	f7 d0                	not    %eax
  32:	21 d0                	and    %edx,%eax
  34:	a8 80                	test   $0x80,%al
  36:	0f 95 c0             	setne  %al
  39:	c3                   	ret
  3a:	8d b6 00 00 00 00    	lea    0x0(%esi),%esi

00000040 <_Z15overflow_unsafejj>:
  40:	0f be 54 24 08       	movsbl 0x8(%esp),%edx
  45:	0f be 44 24 04       	movsbl 0x4(%esp),%eax
  4a:	8d 04 02             	lea    (%edx,%eax,1),%eax
  4d:	0f be d0             	movsbl %al,%edx
  50:	39 c2                	cmp    %eax,%edx
  52:	0f 95 c0             	setne  %al
  55:	c3                   	ret
  56:	8d 76 00             	lea    0x0(%esi),%esi
  59:	8d bc 27 00 00 00 00 	lea    0x0(%edi,%eiz,1),%edi

00000060 <_Z13overflow_safejj>:
  60:	0f be 54 24 08       	movsbl 0x8(%esp),%edx
  65:	0f be 44 24 04       	movsbl 0x4(%esp),%eax
  6a:	8d 04 02             	lea    (%edx,%eax,1),%eax
  6d:	0f be d0             	movsbl %al,%edx
  70:	39 c2                	cmp    %eax,%edx
  72:	0f 95 c0             	setne  %al
  75:	c3                   	ret

Output for ARM with -O3 -fomit-frame-pointer -mthumb -march=armv7:

00000000 <_Z14overflow_rangejj>:
   0:	b249      	sxtb	r1, r1
   2:	b240      	sxtb	r0, r0
   4:	1808      	adds	r0, r1, r0
   6:	3080      	adds	r0, #128	; 0x80
   8:	28ff      	cmp	r0, #255	; 0xff
   a:	bf94      	ite	ls
   c:	2000      	movls	r0, #0
   e:	2001      	movhi	r0, #1
  10:	4770      	bx	lr
  12:	bf00      	nop
  14:	f3af 8000 	nop.w
  18:	f3af 8000 	nop.w
  1c:	f3af 8000 	nop.w

00000020 <_Z12overflow_bitjj>:
  20:	180b      	adds	r3, r1, r0
  22:	4041      	eors	r1, r0
  24:	ea83 0200 	eor.w	r2, r3, r0
  28:	ea22 0001 	bic.w	r0, r2, r1
  2c:	f3c0 10c0 	ubfx	r0, r0, #7, #1
  30:	4770      	bx	lr
  32:	bf00      	nop
  34:	f3af 8000 	nop.w
  38:	f3af 8000 	nop.w
  3c:	f3af 8000 	nop.w

00000040 <_Z15overflow_unsafejj>:
  40:	b242      	sxtb	r2, r0
  42:	b249      	sxtb	r1, r1
  44:	1888      	adds	r0, r1, r2
  46:	b243      	sxtb	r3, r0
  48:	1a18      	subs	r0, r3, r0
  4a:	bf18      	it	ne
  4c:	2001      	movne	r0, #1
  4e:	4770      	bx	lr

00000050 <_Z13overflow_safejj>:
  50:	b242      	sxtb	r2, r0
  52:	b249      	sxtb	r1, r1
  54:	1888      	adds	r0, r1, r2
  56:	b243      	sxtb	r3, r0
  58:	1a18      	subs	r0, r3, r0
  5a:	bf18      	it	ne
  5c:	2001      	movne	r0, #1
  5e:	4770      	bx	lr
Not sure which version would be fastest on ARM (no device handy to benchmark
on at the moment).

By the way, what is a nice way to benchmark snippets like this with
optimization on? If you call each function in a loop from a different
compilation unit, the call overhead tends to dominate. If you instead put it
in the same compilation unit and let it inline, the compiler might do things
you don't expect, which can render the benchmark useless. (A rough sketch of
one possible workaround is in the P.S. below.)

/Ulf
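P.S. Here is the kind of harness I had in mind -- just a rough, untested
sketch, and GCC-specific. The empty asm statements, the iteration count, the
use of clock() and the sample operands are my own choices, not something
discussed in this thread. The idea is to keep the function in the same
compilation unit so it can inline, but hide the operands and the result
behind empty asm barriers so the optimizer can neither constant-fold the
call nor hoist it out of the loop. It assumes the overflow_* functions above
are pasted into the same file.

#include <stdio.h>
#include <time.h>

int main(void) {
  const int iterations = 100000000;
  unsigned int a = 0x7f, b = 0x01;
  unsigned int hits = 0;

  const clock_t start = clock();
  for (int i = 0; i < iterations; ++i) {
    /* Pretend a and b may change each iteration so the call can't be
       constant-folded or hoisted out of the loop. */
    asm volatile ("" : "+r" (a), "+r" (b));
    hits += overflow_safe(a, b);
    /* Pretend hits is inspected so the accumulation isn't dead code. */
    asm volatile ("" : "+r" (hits));
  }
  const clock_t end = clock();

  printf("%u overflows, %.3f s\n", hits,
         (double)(end - start) / CLOCKS_PER_SEC);
  return 0;
}

Swapping in overflow_range, overflow_bit or overflow_unsafe should give
comparable numbers for the other variants, though with functions this tiny
the loop overhead itself may still dominate.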