GCC asm block optimizations on x86_64

"Darryl L. Miles" <darryl-mailinglists@xxxxxxxxxxxx> · Mon, 27 Aug 2007 06:11:04 +0100

Please find enclosed a small test sample of code.  The comments 
contained within it explain related points in the generated code.

[1] This issue is in the way %edx is zero'ed, I would think zeroing out 
registers/memory/whatever would be a special optimization case in this 
code its clear that there is no useful value in the CPU condition flags, 
so "xorl %edx,%edx" would make most sense, instead of having to find 
another register to load with zero before then copying.  Interestingly 
enough -O generates a "mov $0,%r8d", while -O2 generates a "xor %r8d,%r8d".

[2] No issue here, this was just a useful note to explain why %r9 was 
brought into play.  This is due to the constraints of RDX:RAX within the 
DIV instruction and RDX is used to pass the 3rd function argument.

[3] Since %r8 was brought into play by the compiler generated code, I 
take it that %r8 is a caller saves in the ABI.  So as we have a register 
free for use, (even after we may have just used it to zero %edx in issue 
[1] above).  So using %r8 here would be a much better option for the 
purpose of what %ebx is allocated for.  I take it that %ebx is a callee 
saves so we get a push followed by pop, which is unnecessary memory 
access when we have a register available.

On a side track, when I looked at the code generated for main() "objdump 
-d u64_divide" I could see that the u64buf structures appear to be 
aligned to 16 bytes, instead of seeing that value.u32.lw0 and 
value.u64.ll0 compute to offset 0, seeing the largest type width was 8 
bytes and then aligning according to that.  Maybe there is another 
reason for this?

If it is not possible to get GCC to emit assembly code nearer the ideal, 
does this test case provide anything useful to gain an understanding from.

Darryl

#include <stdio.h>
#include <sys/types.h>

/* GNU/Linux with GCC always has U_INT64_T */
#define HAVE_TYPE_U_INT64_T

struct u64buf {
	union {
		struct {
			u_int32_t lw0;
			u_int32_t lw1;
		} u32;
#if defined(HAVE_TYPE_U_INT64_T)
		struct {
			u_int64_t ll0;
		} u64;
#endif
	} value;
};
typedef struct u64buf u64buf_t;

#if defined(__i386__)
#endif /* __i386__ */

#if defined(__x86_64__)
	/* This is UNTESTED divq does 128bit by 64bit divide with RDX:RAX
	 *  so we ensure RDX=0 and RAX=dividend.
	 *
	 * objdump -d u64_divide.o (after "GCC -O2")
	 *
	 * 0000000000000090 <u64_divide>:
	 *    0:   45 31 c0                xor    %r8d,%r8d	<<- [1] why can't this be removed ?
  	 *    3:   49 89 d1                mov    %rdx,%r9	<<- [2] save %rdx in %r9 for arg-as-return
	 *    6:   53                      push   %rbx		<<- [3] why can't this be removed ?
  	 *    7:   48 8b 07                mov    (%rdi),%rax
	 *    a:   44 89 c2                mov    %r8d,%edx	<<- [1] why can't this be "xorl %edx,%edx"
  	 *    d:   31 db                   xor    %ebx,%ebx	<<- [3] why can't we select %r8 here and remove the %rbx saving
	 *    f:   48 f7 36                divq   (%rsi)
  	 *   12:   73 02                   jae    16 <u64_divide+0x16>
	 *   14:   ff c3                   inc    %ebx		<<- [3] why can't we select %r8 here and remove the %rbx saving
  	 *   16:   49 89 01                mov    %rax,(%r9)	<<- [2] use saved %rdx to return argument
	 *   19:   48 89 11                mov    %rdx,(%rcx)
  	 *   1c:   89 d8                   mov    %ebx,%eax
	 *   1e:   5b                      pop    %rbx		<<- [3] why can't this be removed ?
  	 *   1f:   c3                      retq
  	 *
  	 *
  	 * My ideal hand optimized:
	 *  ABI rules	arg1	%rdi	(dividend)
	 *		arg2	%esi	(divisor)
	 *		arg3	%edx	(quotient)
	 *		arg4	%ecx	(remainder)
  	 *
	 * 0000000000000090 <u64_divide>:
  	 *   00:   49 89 d1                mov    %rdx,%r9	<<- [1] save %rdx in %r9 for arg-as-return
  	 *   03:   48 8b 07                mov    (%rdi),%rax
	 *   06:   ?? ?? ??                xor    %edx,%edx	<<- implicit zero of high 32bits, would accept xorq %rdx,%rdx
  	 *   09:   ?? ??                   xor    %r8,%r8
	 *   0b:   48 f7 36                divq   (%rsi)
  	 *   0e:   73 02                   jae    12 <u64_divide+0x12>
	 *   10:   ?? ??                   inc    %r8
  	 *   12:   49 89 01                mov    %rax,(%r9)	<<- [1] use saved %rdx to return argument
	 *   15:   48 89 11                mov    %rdx,(%rcx)
  	 *   18:   89 d8                   mov    %ebx,%eax
  	 *   1a:   c3                      retq
  	 *
	 *
	 * input 0: "=&r" (overflow): ampersand needed because we clobber
	 *  it before we have finished using/saving all the input operands
	 *  (RDX/RAX in particular).
	 */
	#define U64_DIVIDE_ASM_64 1
	#define U64_DIVIDE_ASM(quotient, remainder, dividend, divisor, overflow)	do {	\
		__asm__ __volatile__(						\
			"\n\t"							\
			"xorl %0,%0\n\t"					\
			"divq %5\n\t"						\
										\
			"jnc 1f\n\t"						\
			"incl %0\n"						\
			"1:\n\t"						\
			"movq %%rax,%2\n\t"					\
			"movq %%rdx,%1\n\t"					\
			: "=&g" (overflow),		/* return */		\
			  "=g" (*remainder),					\
			  "=g" (*quotient)					\
			: "d" (0),			/* argument */		\
			  "a" ((*dividend)),					\
			  "g" ((*divisor))					\
			/*: "rax", "rdx", you'd think you need this to */	\
			/* describe these registers as no longer containing */	\
			/* the assigned input values after asm block */		\
			/* execution, but will not compile witht them set. */	\
										\
			/* side effects: implicit side effects */		\
			/*  flags clobbered, all outputs at their respective */	\
			/*  data widths (overflow=32bit; remainder=64bit; */	\
			/*  quotient=64bit). */					\
		);								\
	} while(0)
#endif /* __x86_64__ */

int
u64_divide(const u64buf_t *dividend_orig, const u64buf_t *divisor_orig, u64buf_t *quotient_orig, u64buf_t *remainder_orig)
{
	int overflow;

#ifdef U64_DIVIDE_ASM
	{
		u_int64_t *quotient = &quotient_orig->value.u64.ll0;
		u_int64_t *remainder = &remainder_orig->value.u64.ll0;
		const u_int64_t *dividend = &dividend_orig->value.u64.ll0;
		const u_int64_t *divisor = &divisor_orig->value.u64.ll0;
		U64_DIVIDE_ASM(quotient, remainder, dividend, divisor, overflow);
	}
#else
 #error "INLINE ASM MISSING"
#endif

	return overflow;
}

int
main(int argc, char *argv[])
{
	u64buf_t dividend, divisor, quotient, remainder;
	int overflow;

	dividend.value.u64.ll0 = 10LL;
	divisor.value.u64.ll0 = 8LL;
	quotient.value.u64.ll0 = -1;
	remainder.value.u64.ll0 = -1;

	printf("sizeof(u64buf_t)=%d\n", sizeof(u64buf_t));
	overflow = u64_divide(&dividend, &divisor, &quotient, &remainder);
	printf("dividend=%llu; divisor=%llu; quotient=%llu; remainder=%llu; overflow=%d\n",
	 dividend.value.u64.ll0, divisor.value.u64.ll0, quotient.value.u64.ll0, remainder.value.u64.ll0, overflow);

	return 0;
}