Hi,
gcc produces wrong code for the attached test case (snipped from the
Linux kernel source) above -O1.
$ arm-linux-gcc -O2 -c div64.S
$ arm-linux-gcc -O2 -save-temps dodiv.c div64.o -o dodiv
actual result:
$ ./dodiv
tbuf: <4>[ 52.305419]
expected result:
tbuf: <4>[ 0.305419]
When compiled with -fno-tree-ter, the result was OK; i.e. the
following build produces the expected output:
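$ arm-linux-gcc -O2 -fno-tree-ter -save-temps dodiv.c div64.o -o dodiv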
I'm using gcc-4.1.2.
The problem comes from bad argument passing to sprintf(), as seen in
dodiv.s:
...
        mov     r2, r5          <<<<<< (1)
        mov     ip, ip, lsr #6
        mov     r3, r5          <<<<<< (2)
        ldr     r1, .L10+8
        mov     r0, r4
        str     ip, [sp, #0]
        bl      sprintf
Here (1) loads the loglev_char argument (r2) and (2) loads the
(unsigned long)t argument (r3) of the sprintf() call in dodiv.c. Both
are loaded from r5, so the loglev_char value ('4' == 52) is also
passed for the first %5lu conversion; that is where the bogus 52 in
the output comes from.
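For reference, the expected values can be checked with plain C
arithmetic, no inline asm involved (a minimal host-side sketch I put
together, not part of the attached test case):

#include <stdio.h>

int main(void)
{
        unsigned long long t = 0x12345678;      /* 305419896 ns */
        unsigned long nanosec_rem = (unsigned long)(t % 1000000000);

        t /= 1000000000;                        /* 0 seconds */
        /* prints "tbuf: <4>[    0.305419] " */
        printf("tbuf: <%c>[%5lu.%06lu] \n",
               '4', (unsigned long)t, nanosec_rem / 1000);
        return 0;
}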
I'm not sure whether this is a gcc bug, since div64.S, taken from the
Linux kernel, notes in its comments that the calling convention is
totally non-standard for the sake of optimal code.
Any hint/advice, please?
Thanks in advance.
(Hiroki Kaminaga)
--
#include <stdio.h>
#include <alloca.h>
#define __xl "r0"
#define __xh "r1"
#define __asmeq(x, y) ".ifnc " x "," y " ; .err ; .endif\n\t"
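/*
 * Note: __asmeq("%0", "r1") expands to an assembler .ifnc/.err pair,
 * so assembly fails if gcc did not actually allocate operand %0 to
 * r1, i.e. it asserts the intended register assignment at build time.
 */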
#define do_div(n,base)                                          \
({                                                              \
        register unsigned int __base asm("r4") = base;          \
        register unsigned long long __n asm("r0") = n;          \
        register unsigned long long __res asm("r2");            \
        register unsigned int __rem asm(__xh);                  \
        asm(    __asmeq("%0", __xh)                             \
                __asmeq("%1", "r2")                             \
                __asmeq("%2", "r0")                             \
                __asmeq("%3", "r4")                             \
                "bl     __do_div64"                             \
                : "=r" (__rem), "=r" (__res)                    \
                : "r" (__n), "r" (__base)                       \
                : "ip", "lr", "cc");                            \
        n = __res;                                              \
        __rem;                                                  \
})
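/*
 * For comparison, a plain-C equivalent of do_div() with the same
 * interface (my own sketch, not taken from the kernel; the name
 * do_div_c is made up). Substituting it for do_div() below should
 * give the expected output independent of the __do_div64 calling
 * convention:
 */
#define do_div_c(n,base)                        \
({                                              \
        unsigned int __r = (n) % (base);        \
        (n) /= (base);                          \
        __r;                                    \
})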
#define default_message_loglevel 4
unsigned long long printk_clock(void)
{
        return (unsigned long long) 0x12345678;
}
int main(void)
{
        int loglev_char;
        char *p;
        char tbuf[50], *tp;
        unsigned tlen;
        unsigned long long t;
        unsigned long nanosec_rem;

        /* p is uninitialized alloca() memory; in the kernel it points
           into the log buffer.  The "<4>" in the output shows the else
           branch is taken here. */
        p = alloca(1024);
        if (p[0] == '<') {
                loglev_char = p[1];
        } else {
                loglev_char = default_message_loglevel + '0';   /* '4' == 52 */
        }

        t = printk_clock();                     /* 0x12345678 = 305419896 ns */
        nanosec_rem = do_div(t, 1000000000);    /* t = 0, rem = 305419896 */
        tlen = sprintf(tbuf,
                       "<%c>[%5lu.%06lu] ",
                       loglev_char,
                       (unsigned long)t,
                       nanosec_rem/1000);
        printf("tbuf: %s\n", tbuf);
        return 0;
}
/*
 * linux/arch/arm/lib/div64.S
 *
 * Optimized computation of 64-bit dividend / 32-bit divisor
 *
 * Author:      Nicolas Pitre
 * Created:     Oct 5, 2003
 * Copyright:   Monta Vista Software, Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
//#include <linux/linkage.h>
#define ALIGN .align 4,0x90
#define ENTRY(name)     \
        .globl name ;   \
        ALIGN ;         \
        name:
#ifdef __ARMEB__
#define xh r0
#define xl r1
#define yh r2
#define yl r3
#else
#define xl r0
#define xh r1
#define yl r2
#define yh r3
#endif
/*
 * __do_div64: perform a division with 64-bit dividend and 32-bit divisor.
 *
 * Note: Calling convention is totally non-standard for optimal code.
 *       This is meant to be used by do_div() from include/asm/div64.h only.
 *
 * Input parameters:
 *      xh-xl   = dividend (clobbered)
 *      r4      = divisor (preserved)
 *
 * Output values:
 *      yh-yl   = result
 *      xh      = remainder
 *
 * Clobbered regs: xl, ip
 */
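@ Worked example of the convention above (my illustration, not from
@ the kernel source), little-endian case (xl=r0, xh=r1, yl=r2, yh=r3),
@ dividing 0x12345678 by 1000000000:
@       entry: r1:r0 = 0x00000000:0x12345678, r4 = 1000000000
@       exit:  r3:r2 = 0 (quotient), r1 = 0x12345678 (remainder 305419896)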
ENTRY(__do_div64)
        @ Test for easy paths first.
        subs    ip, r4, #1
        bls     9f                      @ divisor is 0 or 1
        tst     ip, r4
        beq     8f                      @ divisor is power of 2

        @ See if we need to handle upper 32-bit result.
        cmp     xh, r4
        mov     yh, #0
        blo     3f

        @ Align divisor with upper part of dividend.
        @ The aligned divisor is stored in yl preserving the original.
        @ The bit position is stored in ip.
#if __LINUX_ARM_ARCH__ >= 5
        clz     yl, r4
        clz     ip, xh
        sub     yl, yl, ip
        mov     ip, #1
        mov     ip, ip, lsl yl
        mov     yl, r4, lsl yl
#else
        mov     yl, r4
        mov     ip, #1
1:      cmp     yl, #0x80000000
        cmpcc   yl, xh
        movcc   yl, yl, lsl #1
        movcc   ip, ip, lsl #1
        bcc     1b
#endif
        @ The division loop for needed upper bit positions.
        @ Break out early if dividend reaches 0.
2:      cmp     xh, yl
        orrcs   yh, yh, ip
        subcss  xh, xh, yl
        movnes  ip, ip, lsr #1
        mov     yl, yl, lsr #1
        bne     2b

        @ See if we need to handle lower 32-bit result.
3:      cmp     xh, #0
        mov     yl, #0
        cmpeq   xl, r4
        movlo   xh, xl
        movlo   pc, lr

        @ The division loop for lower bit positions.
        @ Here we shift remainder bits leftwards rather than moving the
        @ divisor for comparisons, considering the carry-out bit as well.
        mov     ip, #0x80000000
4:      movs    xl, xl, lsl #1
        adcs    xh, xh, xh
        beq     6f
        cmpcc   xh, r4
5:      orrcs   yl, yl, ip
        subcs   xh, xh, r4
        movs    ip, ip, lsr #1
        bne     4b
        mov     pc, lr

        @ The top part of the remainder became zero. If carry is set
        @ (the 33rd bit) this is a false positive so resume the loop.
        @ Otherwise, if the lower part is also null then we are done.
6:      bcs     5b
        cmp     xl, #0
        moveq   pc, lr

        @ We still have remainder bits in the low part. Bring them up.
#if __LINUX_ARM_ARCH__ >= 5
        clz     xh, xl                  @ we know xh is zero here so...
        add     xh, xh, #1
        mov     xl, xl, lsl xh
        mov     ip, ip, lsr xh
#else
7:      movs    xl, xl, lsl #1
        mov     ip, ip, lsr #1
        bcc     7b
#endif

        @ Current remainder is now 1. It is worthless to compare with the
        @ divisor at this point since the divisor cannot be smaller than 3 here.
        @ If possible, branch for another shift in the division loop.
        @ If no bit position is left then we are done.
        movs    ip, ip, lsr #1
        mov     xh, #1
        bne     4b
        mov     pc, lr
8:      @ Division by a power of 2: determine what that divisor order is
        @ then simply shift values around
#if __LINUX_ARM_ARCH__ >= 5
        clz     ip, r4
        rsb     ip, ip, #31
#else
        mov     yl, r4
        cmp     r4, #(1 << 16)
        mov     ip, #0
        movhs   yl, yl, lsr #16
        movhs   ip, #16
        cmp     yl, #(1 << 8)
        movhs   yl, yl, lsr #8
        addhs   ip, ip, #8
        cmp     yl, #(1 << 4)
        movhs   yl, yl, lsr #4
        addhs   ip, ip, #4
        cmp     yl, #(1 << 2)
        addhi   ip, ip, #3
        addls   ip, ip, yl, lsr #1
#endif
        mov     yh, xh, lsr ip
        mov     yl, xl, lsr ip
        rsb     ip, ip, #32
        orr     yl, yl, xh, lsl ip
        mov     xh, xl, lsl ip
        mov     xh, xh, lsr ip
        mov     pc, lr

        @ eq -> division by 1: obvious enough...
9:      moveq   yl, xl
        moveq   yh, xh
        moveq   xh, #0
        moveq   pc, lr

        @ Division by 0:
        str     lr, [sp, #-4]!
        bl      __div0

        @ as wrong as it could be...
        mov     yl, #0
        mov     yh, #0
        mov     xh, #0
        ldr     pc, [sp], #4