Re: [RFC PATCH] MIPS: optimise 32-bit do_div() with constant divisor

On 11/06/2014 08:23 AM, Mans Rullgard wrote:
This is an adaptation of the optimised do_div() for ARM by
Nicolas Pitre implementing division by a constant using a
multiplication by the inverse.  Ideally, the compiler would
do this internally as it does for 32-bit operands, but it
doesn't.

This version of the code requires an assembler with support
for the DSP ASE syntax since accessing the hi/lo registers
sanely from inline asm is impossible without this.

It is easy to access hi/lo from inline asm. It is true that they are difficult to use directly as input/output operands.

You should use MFHI/MFLO to move the results to general-purpose registers, and then list "hi" and "lo" in the clobber list of the inline asm.

FWIW, it seems like you are missing the clobbers in your inline asm below.
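Something like the following (a minimal, hypothetical 32x32->64 multiply helper,
just to illustrate the pattern -- not code taken from your patch):

static inline unsigned long long mul32_64(unsigned int a, unsigned int b)
{
	unsigned int lo, hi;

	/*
	 * MULTU leaves the 64-bit product in hi/lo; MFLO/MFHI move it into
	 * GPRs, and the "hi"/"lo" clobbers tell gcc that the accumulator
	 * has been overwritten.
	 */
	asm("multu	%2, %3\n\t"
	    "mflo	%0\n\t"
	    "mfhi	%1"
	    : "=&r" (lo), "=&r" (hi)
	    : "r" (a), "r" (b)
	    : "hi", "lo");

	return ((unsigned long long)hi << 32) | lo;
}

MULTU/MFLO/MFHI and the hi/lo clobbers are plain base-ISA MIPS, so nothing
DSP-specific is needed for a pattern like this.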


 Building
for a CPU without this extension still works, however.

It also does not protect against the WAR hazards for the
hi/lo registers present on CPUs prior to MIPS IV.

I have only tested it as far as booting and light use with
the BUG_ON enabled without encountering any issues.

The inverse computation code is a straight copy from ARM,
so this should probably be moved to a shared location.

Signed-off-by: Mans Rullgard <mans@xxxxxxxxx>
---
  arch/mips/include/asm/div64.h | 178 +++++++++++++++++++++++++++++++++++++++++-
  1 file changed, 174 insertions(+), 4 deletions(-)

diff --git a/arch/mips/include/asm/div64.h b/arch/mips/include/asm/div64.h
index dc5ea57..bcebfb1 100644
--- a/arch/mips/include/asm/div64.h
+++ b/arch/mips/include/asm/div64.h
@@ -9,12 +9,12 @@
  #ifndef __ASM_DIV64_H
  #define __ASM_DIV64_H

-#include <asm-generic/div64.h>
-
-#if BITS_PER_LONG == 64
-
  #include <linux/types.h>

+#if BITS_PER_LONG == 64
+
+#include <asm-generic/div64.h>
+
  /*
   * No traps on overflows for any of these...
   */
@@ -63,6 +63,176 @@
  	__mod32;							\
  })

+#elif __GNUC__ >= 4
+
+#include <asm/bug.h>
+
+extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor);
+
+/*
+ * If the divisor happens to be constant, we determine the appropriate
+ * inverse at compile time to turn the division into a few inline
+ * multiplications instead which is much faster.
+ * (It is unfortunate that gcc doesn't perform all this internally.)
+ */
+#define do_div(n, base)							\
+({									\
+	unsigned int __r, __b = (base);					\
+	if (likely(((n) >> 32) == 0)) {					\
+		__r = (u32)(n) % __b;					\
+		(n) = (u32)(n) / __b;					\
+	} else if (!__builtin_constant_p(__b) || __b == 0) {		\
+		/* non-constant divisor (or zero): slow path */		\
+		__r = __div64_32(&(n), __b);				\
+	} else if ((__b & (__b - 1)) == 0) {				\
+		/* Trivial: __b is constant and a power of 2 */		\
+		/* gcc does the right thing with this code.  */		\
+		__r = n;						\
+		__r &= (__b - 1);					\
+		n /= __b;						\
+	} else {							\
+		/* Multiply by inverse of __b: n/b = n*(p/b)/p       */	\
+		/* We rely on the fact that most of this code gets   */	\
+		/* optimized away at compile time due to constant    */	\
+		/* propagation and only a couple inline assembly     */	\
+		/* instructions should remain. Better avoid any      */	\
+		/* code construct that might prevent that.           */	\
+		unsigned long long __res, __x, __t, __m, __n = n;	\
+		unsigned int __c, __p, __z = 0;				\
+		/* preserve low part of n for remainder computation */	\
+		__r = __n;						\
+		/* determine number of bits to represent __b */		\
+		__p = 1 << __div64_fls(__b);				\
+		/* compute __m = ((__p << 64) + __b - 1) / __b */	\
+		__m = (~0ULL / __b) * __p;				\
+		__m += (((~0ULL % __b + 1) * __p) + __b - 1) / __b;	\
+		/* compute __res = __m*(~0ULL/__b*__b-1)/(__p << 64) */	\
+		__x = ~0ULL / __b * __b - 1;				\
+		__res = (__m & 0xffffffff) * (__x & 0xffffffff);	\
+		__res >>= 32;						\
+		__res += (__m & 0xffffffff) * (__x >> 32);		\
+		__t = __res;						\
+		__res += (__x & 0xffffffff) * (__m >> 32);		\
+		__t = (__res < __t) ? (1ULL << 32) : 0;			\
+		__res = (__res >> 32) + __t;				\
+		__res += (__m >> 32) * (__x >> 32);			\
+		__res /= __p;						\
+		/* Now sanitize and optimize what we've got. */		\
+		if (~0ULL % (__b / (__b & -__b)) == 0) {		\
+			/* those cases can be simplified with: */	\
+			__n /= (__b & -__b);				\
+			__m = ~0ULL / (__b / (__b & -__b));		\
+			__p = 1;					\
+			__c = 1;					\
+		} else if (__res != __x / __b) {			\
+			/* We can't get away without a correction    */	\
+			/* to compensate for bit truncation errors.  */	\
+			/* To avoid it we'd need an additional bit   */	\
+			/* to represent __m which would overflow it. */	\
+			/* Instead we do m=p/b and n/b=(n*m+m)/p.    */	\
+			__c = 1;					\
+			/* Compute __m = (__p << 64) / __b */		\
+			__m = (~0ULL / __b) * __p;			\
+			__m += ((~0ULL % __b + 1) * __p) / __b;		\
+		} else {						\
+			/* Reduce __m/__p, and try to clear bit 31   */	\
+			/* of __m when possible otherwise that'll    */	\
+			/* need extra overflow handling later.       */	\
+			unsigned int __bits = -(__m & -__m);		\
+			__bits |= __m >> 32;				\
+			__bits = (~__bits) << 1;			\
+			/* If __bits == 0 then setting bit 31 is     */	\
+			/* unavoidable.  Simply apply the maximum    */	\
+			/* possible reduction in that case.          */	\
+			/* Otherwise the MSB of __bits indicates the */	\
+			/* best reduction we should apply.           */	\
+			if (!__bits) {					\
+				__p /= (__m & -__m);			\
+				__m /= (__m & -__m);			\
+			} else {					\
+				__p >>= __div64_fls(__bits);		\
+				__m >>= __div64_fls(__bits);		\
+			}						\
+			/* No correction needed. */			\
+			__c = 0;					\
+		}							\
+		/* Now we have a combination of 2 conditions:        */	\
+		/* 1) whether or not we need a correction (__c), and */	\
+		/* 2) whether or not there might be an overflow in   */	\
+		/*    the cross product (__m & ((1<<63) | (1<<31)))  */	\
+		/* Select the best insn combination to perform the   */	\
+		/* actual __m * __n / (__p << 64) operation.         */	\
+		if (!__c) {						\
+			__res = (u64)(u32)__m * (u32)__n >> 32;		\
+		} else if (!(__m & ((1ULL << 63) | (1ULL << 31)))) {	\
+			__res = (__m + (u64)(u32)__m * (u32)__n) >> 32;	\
+		} else {						\
+			__t = __m;					\
+			asm (	".set	push		\n"		\
+				".set	dsp		\n"		\
+				"maddu	%q2, %L3, %L4	\n"		\
+				"mflo	%L0, %q2	\n"		\
+				"mfhi	%M0, %q2	\n"		\
+				"sltu	%1,  %L0, %L3	\n"		\
+				"addu	%L0, %M0, %1	\n"		\
+				"sltu	%M0, %L0, %M3	\n"		\
+				".set	pop		\n"		\
+				: "=&r"(__res), "=&r"(__c), "+&ka"(__t) \
+				: "r"(__m), "r"(__n));			\
+		}							\
+		if (!(__m & ((1ULL << 63) | (1ULL << 31)))) {		\
+			asm (	".set	push		\n"		\
+				".set	dsp		\n"		\
+				"maddu	%q0, %M2, %L3	\n"		\
+				"maddu	%q0, %L2, %M3	\n"		\
+				"mfhi	%1,  %q0	\n"		\
+				"mthi	$0,  %q0	\n"		\
+				"mtlo	%1,  %q0	\n"		\
+				"maddu	%q0, %M2, %M3	\n"		\
+				".set	pop		\n"		\
+				: "+&ka"(__res), "=&r"(__c)		\
+				: "r"(__m), "r"(__n));			\
+		} else {						\
+			asm (	".set	push		\n"		\
+				".set	dsp		\n"		\
+				"maddu	%q0, %M3, %L4	\n"		\
+				"mfhi	%2,  %q0	\n"		\
+				"maddu	%q0, %L3, %M4	\n"		\
+				"mfhi	%1,  %q0	\n"		\
+				"sltu	%2,  %1,  %2	\n"		\
+				"mtlo	%1,  %q0	\n"		\
+				"mthi	%2,  %q0	\n"		\
+				"maddu	%q0, %M3, %M4	\n"		\
+				".set	pop		\n"		\
+				: "+&ka"(__res), "=&r"(__z), "=&r"(__c) \
+				: "r"(__m), "r"(__n));			\
+		}							\
+		__res /= __p;						\
+		/* The remainder can be computed with 32-bit regs    */	\
+		/* only, and gcc is good at that.                    */	\
+		{							\
+			unsigned int __res0 = __res;			\
+			unsigned int __b0 = __b;			\
+			__r -= __res0 * __b0;				\
+		}							\
+		/* BUG_ON(__r >= __b || __res * __b + __r != n); */	\
+		n = __res;						\
+	}								\
+	__r;								\
+})
+
+/* our own fls implementation to make sure constant propagation is fine */
+#define __div64_fls(bits)						\
+({									\
+	unsigned int __left = (bits), __nr = 0;				\
+	if (__left & 0xffff0000) __nr += 16, __left >>= 16;		\
+	if (__left & 0x0000ff00) __nr +=  8, __left >>=  8;		\
+	if (__left & 0x000000f0) __nr +=  4, __left >>=  4;		\
+	if (__left & 0x0000000c) __nr +=  2, __left >>=  2;		\
+	if (__left & 0x00000002) __nr +=  1;				\
+	__nr;								\
+})
+
  #endif /* BITS_PER_LONG == 64 */

  #endif /* __ASM_DIV64_H */
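
As an aside, the multiply-by-inverse trick itself is easy to sanity-check in
plain C. A throwaway user-space sketch (my own example, not from the patch)
for the constant 10, using the classic reciprocal 0xCCCCCCCD = ceil(2^35 / 10):

#include <stdint.h>
#include <assert.h>

/* n / 10 computed as (n * ceil(2^35 / 10)) >> 35, exact for all 32-bit n */
static uint32_t div10(uint32_t n)
{
	return (uint32_t)(((uint64_t)n * 0xCCCCCCCDULL) >> 35);
}

int main(void)
{
	uint32_t n;

	for (n = 0; n < 1000000; n++)
		assert(div10(n) == n / 10);
	assert(div10(UINT32_MAX) == UINT32_MAX / 10);

	return 0;
}

This is the transformation gcc already performs for 32-bit operands; the patch
does the equivalent by hand for 64-bit dividends, where gcc does not.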