Please find enclosed a small test sample of code. The comments contained within it explain related points in the generated code.
[1] This issue is in the way %edx is zero'ed, I would think zeroing out registers/memory/whatever would be a special optimization case in this code its clear that there is no useful value in the CPU condition flags, so "xorl %edx,%edx" would make most sense, instead of having to find another register to load with zero before then copying. Interestingly enough -O generates a "mov $0,%r8d", while -O2 generates a "xor %r8d,%r8d".
[2] No issue here, this was just a useful note to explain why %r9 was brought into play. This is due to the constraints of RDX:RAX within the DIV instruction and RDX is used to pass the 3rd function argument.
[3] Since %r8 was brought into play by the compiler generated code, I take it that %r8 is a caller saves in the ABI. So as we have a register free for use, (even after we may have just used it to zero %edx in issue [1] above). So using %r8 here would be a much better option for the purpose of what %ebx is allocated for. I take it that %ebx is a callee saves so we get a push followed by pop, which is unnecessary memory access when we have a register available.
On a side track, when I looked at the code generated for main() "objdump -d u64_divide" I could see that the u64buf structures appear to be aligned to 16 bytes, instead of seeing that value.u32.lw0 and value.u64.ll0 compute to offset 0, seeing the largest type width was 8 bytes and then aligning according to that. Maybe there is another reason for this?
If it is not possible to get GCC to emit assembly code nearer the ideal, does this test case provide anything useful to gain an understanding from.
Darryl
#include <stdio.h> #include <sys/types.h> /* GNU/Linux with GCC always has U_INT64_T */ #define HAVE_TYPE_U_INT64_T struct u64buf { union { struct { u_int32_t lw0; u_int32_t lw1; } u32; #if defined(HAVE_TYPE_U_INT64_T) struct { u_int64_t ll0; } u64; #endif } value; }; typedef struct u64buf u64buf_t; #if defined(__i386__) #endif /* __i386__ */ #if defined(__x86_64__) /* This is UNTESTED divq does 128bit by 64bit divide with RDX:RAX * so we ensure RDX=0 and RAX=dividend. * * objdump -d u64_divide.o (after "GCC -O2") * * 0000000000000090 <u64_divide>: * 0: 45 31 c0 xor %r8d,%r8d <<- [1] why can't this be removed ? * 3: 49 89 d1 mov %rdx,%r9 <<- [2] save %rdx in %r9 for arg-as-return * 6: 53 push %rbx <<- [3] why can't this be removed ? * 7: 48 8b 07 mov (%rdi),%rax * a: 44 89 c2 mov %r8d,%edx <<- [1] why can't this be "xorl %edx,%edx" * d: 31 db xor %ebx,%ebx <<- [3] why can't we select %r8 here and remove the %rbx saving * f: 48 f7 36 divq (%rsi) * 12: 73 02 jae 16 <u64_divide+0x16> * 14: ff c3 inc %ebx <<- [3] why can't we select %r8 here and remove the %rbx saving * 16: 49 89 01 mov %rax,(%r9) <<- [2] use saved %rdx to return argument * 19: 48 89 11 mov %rdx,(%rcx) * 1c: 89 d8 mov %ebx,%eax * 1e: 5b pop %rbx <<- [3] why can't this be removed ? * 1f: c3 retq * * * My ideal hand optimized: * ABI rules arg1 %rdi (dividend) * arg2 %esi (divisor) * arg3 %edx (quotient) * arg4 %ecx (remainder) * * 0000000000000090 <u64_divide>: * 00: 49 89 d1 mov %rdx,%r9 <<- [1] save %rdx in %r9 for arg-as-return * 03: 48 8b 07 mov (%rdi),%rax * 06: ?? ?? ?? xor %edx,%edx <<- implicit zero of high 32bits, would accept xorq %rdx,%rdx * 09: ?? ?? xor %r8,%r8 * 0b: 48 f7 36 divq (%rsi) * 0e: 73 02 jae 12 <u64_divide+0x12> * 10: ?? ?? inc %r8 * 12: 49 89 01 mov %rax,(%r9) <<- [1] use saved %rdx to return argument * 15: 48 89 11 mov %rdx,(%rcx) * 18: 89 d8 mov %ebx,%eax * 1a: c3 retq * * * input 0: "=&r" (overflow): ampersand needed because we clobber * it before we have finished using/saving all the input operands * (RDX/RAX in particular). */ #define U64_DIVIDE_ASM_64 1 #define U64_DIVIDE_ASM(quotient, remainder, dividend, divisor, overflow) do { \ __asm__ __volatile__( \ "\n\t" \ "xorl %0,%0\n\t" \ "divq %5\n\t" \ \ "jnc 1f\n\t" \ "incl %0\n" \ "1:\n\t" \ "movq %%rax,%2\n\t" \ "movq %%rdx,%1\n\t" \ : "=&g" (overflow), /* return */ \ "=g" (*remainder), \ "=g" (*quotient) \ : "d" (0), /* argument */ \ "a" ((*dividend)), \ "g" ((*divisor)) \ /*: "rax", "rdx", you'd think you need this to */ \ /* describe these registers as no longer containing */ \ /* the assigned input values after asm block */ \ /* execution, but will not compile witht them set. */ \ \ /* side effects: implicit side effects */ \ /* flags clobbered, all outputs at their respective */ \ /* data widths (overflow=32bit; remainder=64bit; */ \ /* quotient=64bit). */ \ ); \ } while(0) #endif /* __x86_64__ */ int u64_divide(const u64buf_t *dividend_orig, const u64buf_t *divisor_orig, u64buf_t *quotient_orig, u64buf_t *remainder_orig) { int overflow; #ifdef U64_DIVIDE_ASM { u_int64_t *quotient = "ient_orig->value.u64.ll0; u_int64_t *remainder = &remainder_orig->value.u64.ll0; const u_int64_t *dividend = ÷nd_orig->value.u64.ll0; const u_int64_t *divisor = &divisor_orig->value.u64.ll0; U64_DIVIDE_ASM(quotient, remainder, dividend, divisor, overflow); } #else #error "INLINE ASM MISSING" #endif return overflow; } int main(int argc, char *argv[]) { u64buf_t dividend, divisor, quotient, remainder; int overflow; dividend.value.u64.ll0 = 10LL; divisor.value.u64.ll0 = 8LL; quotient.value.u64.ll0 = -1; remainder.value.u64.ll0 = -1; printf("sizeof(u64buf_t)=%d\n", sizeof(u64buf_t)); overflow = u64_divide(÷nd, &divisor, "ient, &remainder); printf("dividend=%llu; divisor=%llu; quotient=%llu; remainder=%llu; overflow=%d\n", dividend.value.u64.ll0, divisor.value.u64.ll0, quotient.value.u64.ll0, remainder.value.u64.ll0, overflow); return 0; }