In C, how to make GCC recognize subtraction with borrow?

Vincent Lefevre <vincent+gcc@xxxxxxxxxx> · Fri, 20 Jan 2017 16:47:09 +0100

In order to do a multiple-precision subtraction on 3 words, one
needs to make the compiler recognize subtraction with borrow to
get optimal code. For instance, on x86_64, one should get a sub
and 2 sbb's. But how can one do this in C (with no inline asm)?

I've tried the following:

typedef unsigned long T;

void sub1 (T *p, T u0, T u1, T u2, T v0, T v1, T v2)
{
  T t1;
  int b0, b1;

  p[0] = u0 - v0;
  b0 = u0 < v0;
  t1 = u1 - v1;
  b1 = u1 < v1;
  p[1] = t1 - b0;
  b1 |= p[1] > t1;
  p[2] = u2 - v2 - b1;
}

void sub2 (T *p, T u0, T u1, T u2, T v0, T v1, T v2)
{
  int b0, b1;

  p[0] = u0 - v0;
  b0 = u0 < v0;
  p[1] = u1 - v1 - b0;
  b1 = u1 < v1 || (u1 == v1 && b0 != 0);
  p[2] = u2 - v2 - b1;
}

but on x86_64, I get with GCC 6.3.0, using -O3:

0000000000000000 <sub1>:
   0:   48 89 f0                mov    %rsi,%rax
   3:   4c 29 c0                sub    %r8,%rax
   6:   4c 39 ca                cmp    %r9,%rdx
   9:   48 89 07                mov    %rax,(%rdi)
   c:   0f 92 c0                setb   %al
   f:   4c 29 ca                sub    %r9,%rdx
  12:   4c 39 c6                cmp    %r8,%rsi
  15:   49 89 d0                mov    %rdx,%r8
  18:   40 0f 92 c6             setb   %sil
  1c:   31 d2                   xor    %edx,%edx
  1e:   40 0f b6 f6             movzbl %sil,%esi
  22:   49 29 f0                sub    %rsi,%r8
  25:   0f 92 c2                setb   %dl
  28:   48 2b 4c 24 08          sub    0x8(%rsp),%rcx
  2d:   4c 89 47 08             mov    %r8,0x8(%rdi)
  31:   09 d0                   or     %edx,%eax
  33:   0f b6 c0                movzbl %al,%eax
  36:   48 29 c1                sub    %rax,%rcx
  39:   48 89 4f 10             mov    %rcx,0x10(%rdi)
  3d:   c3                      retq   
  3e:   66 90                   xchg   %ax,%ax

0000000000000040 <sub2>:
  40:   48 89 f0                mov    %rsi,%rax
  43:   4c 29 c0                sub    %r8,%rax
  46:   4c 39 c6                cmp    %r8,%rsi
  49:   48 89 d6                mov    %rdx,%rsi
  4c:   41 0f 92 c0             setb   %r8b
  50:   48 89 07                mov    %rax,(%rdi)
  53:   4c 29 ce                sub    %r9,%rsi
  56:   41 0f b6 c0             movzbl %r8b,%eax
  5a:   48 29 c6                sub    %rax,%rsi
  5d:   4c 39 ca                cmp    %r9,%rdx
  60:   0f 94 c0                sete   %al
  63:   48 89 77 08             mov    %rsi,0x8(%rdi)
  67:   44 21 c0                and    %r8d,%eax
  6a:   4c 39 ca                cmp    %r9,%rdx
  6d:   ba 01 00 00 00          mov    $0x1,%edx
  72:   0f 42 c2                cmovb  %edx,%eax
  75:   48 2b 4c 24 08          sub    0x8(%rsp),%rcx
  7a:   0f b6 c0                movzbl %al,%eax
  7d:   48 29 c1                sub    %rax,%rcx
  80:   48 89 4f 10             mov    %rcx,0x10(%rdi)
  84:   c3                      retq   

Note that for sub1, Clang 3.9 seems better:

0000000000000000 <sub1>:
   0:   4c 29 ca                sub    %r9,%rdx
   3:   18 c0                   sbb    %al,%al
   5:   4c 29 c6                sub    %r8,%rsi
   8:   48 89 37                mov    %rsi,(%rdi)
   b:   48 89 d6                mov    %rdx,%rsi
   e:   48 83 de 00             sbb    $0x0,%rsi
  12:   48 89 77 08             mov    %rsi,0x8(%rdi)
  16:   48 39 f2                cmp    %rsi,%rdx
  19:   18 d2                   sbb    %dl,%dl
  1b:   08 c2                   or     %al,%dl
  1d:   80 e2 01                and    $0x1,%dl
  20:   0f b6 c2                movzbl %dl,%eax
  23:   48 2b 4c 24 08          sub    0x8(%rsp),%rcx
  28:   48 29 c1                sub    %rax,%rcx
  2b:   48 89 4f 10             mov    %rcx,0x10(%rdi)
  2f:   c3                      retq   

but still largely suboptimal concerning the subtractions.

Actually, in sub2, the following is a bit better for the b1 line:

  b1 = u1 < v1 + b0 || v1 + b0 < v1;

I don't know much about x86_64, so that I can't give what I would
expect exactly. I've mentioned x86_64 here because this is what my
machine and many other machines have, but of course, this should
remain valid for other processors with a subtraction-with-borrow
instruction, I mean with borrow in and borrow out.

There is probably the same issue with addition.

-- 
Vincent Lefèvre <vincent@xxxxxxxxxx> - Web: <https://www.vinc17.net/>
100% accessible validated (X)HTML - Blog: <https://www.vinc17.net/blog/>
Work: CR INRIA - computer arithmetic / AriC project (LIP, ENS-Lyon)