I've tried the following:
typedef unsigned long T;
void sub1 (T *p, T u0, T u1, T u2, T v0, T v1, T v2)
{
T t1;
int b0, b1;
p[0] = u0 - v0;
b0 = u0 < v0;
t1 = u1 - v1;
b1 = u1 < v1;
p[1] = t1 - b0;
b1 |= p[1] > t1;
p[2] = u2 - v2 - b1;
}
void sub2 (T *p, T u0, T u1, T u2, T v0, T v1, T v2)
{
int b0, b1;
p[0] = u0 - v0;
b0 = u0 < v0;
p[1] = u1 - v1 - b0;
b1 = u1 < v1 || (u1 == v1 && b0 != 0);
p[2] = u2 - v2 - b1;
}
but on x86_64, I get with GCC 6.3.0, using -O3:
0000000000000000 <sub1>:
0: 48 89 f0 mov %rsi,%rax
3: 4c 29 c0 sub %r8,%rax
6: 4c 39 ca cmp %r9,%rdx
9: 48 89 07 mov %rax,(%rdi)
c: 0f 92 c0 setb %al
f: 4c 29 ca sub %r9,%rdx
12: 4c 39 c6 cmp %r8,%rsi
15: 49 89 d0 mov %rdx,%r8
18: 40 0f 92 c6 setb %sil
1c: 31 d2 xor %edx,%edx
1e: 40 0f b6 f6 movzbl %sil,%esi
22: 49 29 f0 sub %rsi,%r8
25: 0f 92 c2 setb %dl
28: 48 2b 4c 24 08 sub 0x8(%rsp),%rcx
2d: 4c 89 47 08 mov %r8,0x8(%rdi)
31: 09 d0 or %edx,%eax
33: 0f b6 c0 movzbl %al,%eax
36: 48 29 c1 sub %rax,%rcx
39: 48 89 4f 10 mov %rcx,0x10(%rdi)
3d: c3 retq
3e: 66 90 xchg %ax,%ax
0000000000000040 <sub2>:
40: 48 89 f0 mov %rsi,%rax
43: 4c 29 c0 sub %r8,%rax
46: 4c 39 c6 cmp %r8,%rsi
49: 48 89 d6 mov %rdx,%rsi
4c: 41 0f 92 c0 setb %r8b
50: 48 89 07 mov %rax,(%rdi)
53: 4c 29 ce sub %r9,%rsi
56: 41 0f b6 c0 movzbl %r8b,%eax
5a: 48 29 c6 sub %rax,%rsi
5d: 4c 39 ca cmp %r9,%rdx
60: 0f 94 c0 sete %al
63: 48 89 77 08 mov %rsi,0x8(%rdi)
67: 44 21 c0 and %r8d,%eax
6a: 4c 39 ca cmp %r9,%rdx
6d: ba 01 00 00 00 mov $0x1,%edx
72: 0f 42 c2 cmovb %edx,%eax
75: 48 2b 4c 24 08 sub 0x8(%rsp),%rcx
7a: 0f b6 c0 movzbl %al,%eax
7d: 48 29 c1 sub %rax,%rcx
80: 48 89 4f 10 mov %rcx,0x10(%rdi)
84: c3 retq
Note that for sub1, Clang 3.9 seems better:
0000000000000000 <sub1>:
0: 4c 29 ca sub %r9,%rdx
3: 18 c0 sbb %al,%al
5: 4c 29 c6 sub %r8,%rsi
8: 48 89 37 mov %rsi,(%rdi)
b: 48 89 d6 mov %rdx,%rsi
e: 48 83 de 00 sbb $0x0,%rsi
12: 48 89 77 08 mov %rsi,0x8(%rdi)
16: 48 39 f2 cmp %rsi,%rdx
19: 18 d2 sbb %dl,%dl
1b: 08 c2 or %al,%dl
1d: 80 e2 01 and $0x1,%dl
20: 0f b6 c2 movzbl %dl,%eax
23: 48 2b 4c 24 08 sub 0x8(%rsp),%rcx
28: 48 29 c1 sub %rax,%rcx
2b: 48 89 4f 10 mov %rcx,0x10(%rdi)
2f: c3 retq
but still largely suboptimal concerning the subtractions.
Actually, in sub2, the following is a bit better for the b1 line:
b1 = u1 < v1 + b0 || v1 + b0 < v1;
I don't know much about x86_64, so that I can't give what I would
expect exactly. I've mentioned x86_64 here because this is what my
machine and many other machines have, but of course, this should
remain valid for other processors with a subtraction-with-borrow
instruction, I mean with borrow in and borrow out.
There is probably the same issue with addition.