On 01/06/2023 09:42, Mason wrote:
> As far as I can tell, intrinsics _addcarry_u64() and _addcarryx_u64() are
> plain wrappers around the same __builtin_ia32_addcarryx_u64() function.
>
> https://github.com/gcc-mirror/gcc/blob/master/gcc/config/i386/adxintrin.h

Hello Uros, Jakub,

I want to report a missed-optimization bug with _addcarry_u64().
(I can file an issue on Bugzilla, if you deem it appropriate.)

#include <x86intrin.h>
typedef unsigned long long u64;
typedef unsigned __int128 u128;

void testcase1(u64 *acc, u64 a, u64 b)
{
	u128 res = (u128)a*b;
	u64 lo = res, hi = res >> 64;
	unsigned char cf = 0;
	cf = _addcarry_u64(cf, lo, acc[0], acc+0);
	cf = _addcarry_u64(cf, hi, acc[1], acc+1);
	cf = _addcarry_u64(cf, 0, acc[2], acc+2);
}

void testcase2(u64 *acc, u64 a, u64 b)
{
	u128 res = (u128)a * b;
	u64 lo = res, hi = res >> 64;
	asm("add %[LO], %[D0]\n\t"
	    "adc %[HI], %[D1]\n\t"
	    "adc $0, %[D2]"
	    : [D0] "+m" (acc[0]), [D1] "+m" (acc[1]), [D2] "+m" (acc[2])
	    : [LO] "r" (lo), [HI] "r" (hi)
	    : "cc");
}

gcc-trunk -Wall -Wextra -O3 -S testcase.c
(Same code generated with -Os)

/*** rdi = acc, rsi = a, rdx = b ***/
testcase1:
	movq	%rsi, %rax
	mulq	%rdx
	addq	%rax, (%rdi)
	movq	%rdx, %rax
	adcq	8(%rdi), %rax
	adcq	$0, 16(%rdi)
	movq	%rax, 8(%rdi)
	ret

testcase2:
	movq	%rsi, %rax	; rax = rsi = a
	mulq	%rdx		; rdx:rax = rax*rdx = a*b
	add	%rax, (%rdi)	; acc[0] += lo
	adc	%rdx, 8(%rdi)	; acc[1] += hi + cf
	adc	$0, 16(%rdi)	; acc[2] += cf
	ret

As you can see, gcc generates the expected code for testcase2, but it
generates sub-optimal code for testcase1:

	movq	%rdx, %rax
	adcq	8(%rdi), %rax
	movq	%rax, 8(%rdi)

instead of

	adc	%rdx, 8(%rdi)	; acc[1] += hi + cf

Do you know why it's missing the optimization?

Regards