[PATCH 14/15] print_integer, printf: rewrite the rest of lib/vsprintf.c via print_integer()

Alexey Dobriyan <adobriyan@xxxxxxxxx> · Mon, 20 Apr 2020 23:57:42 +0300

Lookup tables are cheating.

Signed-off-by: Alexey Dobriyan <adobriyan@xxxxxxxxx>
---
 lib/vsprintf.c | 236 +++++--------------------------------------------
 1 file changed, 20 insertions(+), 216 deletions(-)

diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index df2b5ce08fe9..b29fbd0da53f 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -138,204 +138,6 @@ int skip_atoi(const char **s)
 	return i;
 }
 
-/*
- * Decimal conversion is by far the most typical, and is used for
- * /proc and /sys data. This directly impacts e.g. top performance
- * with many processes running. We optimize it for speed by emitting
- * two characters at a time, using a 200 byte lookup table. This
- * roughly halves the number of multiplications compared to computing
- * the digits one at a time. Implementation strongly inspired by the
- * previous version, which in turn used ideas described at
- * <http://www.cs.uiowa.edu/~jones/bcd/divide.html> (with permission
- * from the author, Douglas W. Jones).
- *
- * It turns out there is precisely one 26 bit fixed-point
- * approximation a of 64/100 for which x/100 == (x * (u64)a) >> 32
- * holds for all x in [0, 10^8-1], namely a = 0x28f5c29. The actual
- * range happens to be somewhat larger (x <= 1073741898), but that's
- * irrelevant for our purpose.
- *
- * For dividing a number in the range [10^4, 10^6-1] by 100, we still
- * need a 32x32->64 bit multiply, so we simply use the same constant.
- *
- * For dividing a number in the range [100, 10^4-1] by 100, there are
- * several options. The simplest is (x * 0x147b) >> 19, which is valid
- * for all x <= 43698.
- */
-
-static const u16 decpair[100] = {
-#define _(x) (__force u16) cpu_to_le16(((x % 10) | ((x / 10) << 8)) + 0x3030)
-	_( 0), _( 1), _( 2), _( 3), _( 4), _( 5), _( 6), _( 7), _( 8), _( 9),
-	_(10), _(11), _(12), _(13), _(14), _(15), _(16), _(17), _(18), _(19),
-	_(20), _(21), _(22), _(23), _(24), _(25), _(26), _(27), _(28), _(29),
-	_(30), _(31), _(32), _(33), _(34), _(35), _(36), _(37), _(38), _(39),
-	_(40), _(41), _(42), _(43), _(44), _(45), _(46), _(47), _(48), _(49),
-	_(50), _(51), _(52), _(53), _(54), _(55), _(56), _(57), _(58), _(59),
-	_(60), _(61), _(62), _(63), _(64), _(65), _(66), _(67), _(68), _(69),
-	_(70), _(71), _(72), _(73), _(74), _(75), _(76), _(77), _(78), _(79),
-	_(80), _(81), _(82), _(83), _(84), _(85), _(86), _(87), _(88), _(89),
-	_(90), _(91), _(92), _(93), _(94), _(95), _(96), _(97), _(98), _(99),
-#undef _
-};
-
-/*
- * This will print a single '0' even if r == 0, since we would
- * immediately jump to out_r where two 0s would be written but only
- * one of them accounted for in buf. This is needed by ip4_string
- * below. All other callers pass a non-zero value of r.
-*/
-static noinline_for_stack
-char *put_dec_trunc8(char *buf, unsigned r)
-{
-	unsigned q;
-
-	/* 1 <= r < 10^8 */
-	if (r < 100)
-		goto out_r;
-
-	/* 100 <= r < 10^8 */
-	q = (r * (u64)0x28f5c29) >> 32;
-	*((u16 *)buf) = decpair[r - 100*q];
-	buf += 2;
-
-	/* 1 <= q < 10^6 */
-	if (q < 100)
-		goto out_q;
-
-	/*  100 <= q < 10^6 */
-	r = (q * (u64)0x28f5c29) >> 32;
-	*((u16 *)buf) = decpair[q - 100*r];
-	buf += 2;
-
-	/* 1 <= r < 10^4 */
-	if (r < 100)
-		goto out_r;
-
-	/* 100 <= r < 10^4 */
-	q = (r * 0x147b) >> 19;
-	*((u16 *)buf) = decpair[r - 100*q];
-	buf += 2;
-out_q:
-	/* 1 <= q < 100 */
-	r = q;
-out_r:
-	/* 1 <= r < 100 */
-	*((u16 *)buf) = decpair[r];
-	buf += r < 10 ? 1 : 2;
-	return buf;
-}
-
-#if BITS_PER_LONG == 64 && BITS_PER_LONG_LONG == 64
-static noinline_for_stack
-char *put_dec_full8(char *buf, unsigned r)
-{
-	unsigned q;
-
-	/* 0 <= r < 10^8 */
-	q = (r * (u64)0x28f5c29) >> 32;
-	*((u16 *)buf) = decpair[r - 100*q];
-	buf += 2;
-
-	/* 0 <= q < 10^6 */
-	r = (q * (u64)0x28f5c29) >> 32;
-	*((u16 *)buf) = decpair[q - 100*r];
-	buf += 2;
-
-	/* 0 <= r < 10^4 */
-	q = (r * 0x147b) >> 19;
-	*((u16 *)buf) = decpair[r - 100*q];
-	buf += 2;
-
-	/* 0 <= q < 100 */
-	*((u16 *)buf) = decpair[q];
-	buf += 2;
-	return buf;
-}
-
-static noinline_for_stack
-char *put_dec(char *buf, unsigned long long n)
-{
-	if (n >= 100*1000*1000)
-		buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
-	/* 1 <= n <= 1.6e11 */
-	if (n >= 100*1000*1000)
-		buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
-	/* 1 <= n < 1e8 */
-	return put_dec_trunc8(buf, n);
-}
-
-#elif BITS_PER_LONG == 32 && BITS_PER_LONG_LONG == 64
-
-static void
-put_dec_full4(char *buf, unsigned r)
-{
-	unsigned q;
-
-	/* 0 <= r < 10^4 */
-	q = (r * 0x147b) >> 19;
-	*((u16 *)buf) = decpair[r - 100*q];
-	buf += 2;
-	/* 0 <= q < 100 */
-	*((u16 *)buf) = decpair[q];
-}
-
-/*
- * Call put_dec_full4 on x % 10000, return x / 10000.
- * The approximation x/10000 == (x * 0x346DC5D7) >> 43
- * holds for all x < 1,128,869,999.  The largest value this
- * helper will ever be asked to convert is 1,125,520,955.
- * (second call in the put_dec code, assuming n is all-ones).
- */
-static noinline_for_stack
-unsigned put_dec_helper4(char *buf, unsigned x)
-{
-        uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43;
-
-        put_dec_full4(buf, x - q * 10000);
-        return q;
-}
-
-/* Based on code by Douglas W. Jones found at
- * <http://www.cs.uiowa.edu/~jones/bcd/decimal.html#sixtyfour>
- * (with permission from the author).
- * Performs no 64-bit division and hence should be fast on 32-bit machines.
- */
-static
-char *put_dec(char *buf, unsigned long long n)
-{
-	uint32_t d3, d2, d1, q, h;
-
-	if (n < 100*1000*1000)
-		return put_dec_trunc8(buf, n);
-
-	d1  = ((uint32_t)n >> 16); /* implicit "& 0xffff" */
-	h   = (n >> 32);
-	d2  = (h      ) & 0xffff;
-	d3  = (h >> 16); /* implicit "& 0xffff" */
-
-	/* n = 2^48 d3 + 2^32 d2 + 2^16 d1 + d0
-	     = 281_4749_7671_0656 d3 + 42_9496_7296 d2 + 6_5536 d1 + d0 */
-	q   = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff);
-	q = put_dec_helper4(buf, q);
-
-	q += 7671 * d3 + 9496 * d2 + 6 * d1;
-	q = put_dec_helper4(buf+4, q);
-
-	q += 4749 * d3 + 42 * d2;
-	q = put_dec_helper4(buf+8, q);
-
-	q += 281 * d3;
-	buf += 12;
-	if (q)
-		buf = put_dec_trunc8(buf, q);
-	else while (buf[-1] == '0')
-		--buf;
-
-	return buf;
-}
-
-#endif
-
 /*
  * Convert passed number to decimal string.
  * Returns the length of string.  On buffer overflow, returns 0.
@@ -410,8 +212,6 @@ static noinline_for_stack
 char *number(char *buf, char *end, unsigned long long num,
 	     struct printf_spec spec)
 {
-	/* put_dec requires 2-byte alignment of the buffer. */
-	char tmp[3 * sizeof(num)] __aligned(2);
 	char sign;
 	char locase;
 	int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10);
@@ -419,6 +219,8 @@ char *number(char *buf, char *end, unsigned long long num,
 	bool is_zero = num == 0LL;
 	int field_width = spec.field_width;
 	int precision = spec.precision;
+	char tmp[22];
+	char *p = tmp + sizeof(tmp);
 
 	/* locase = 0 or 0x20. ORing digits or letters with 'locase'
 	 * produces same digits or (maybe lowercased) letters */
@@ -446,10 +248,9 @@ char *number(char *buf, char *end, unsigned long long num,
 			field_width--;
 	}
 
-	/* generate full string in tmp[], in reverse order */
-	i = 0;
+	/* generate full string in tmp[] */
 	if (num < spec.base)
-		tmp[i++] = hex_asc_upper[num] | locase;
+		*--p = hex_asc_upper[num] | locase;
 	else if (spec.base != 10) { /* 8 or 16 */
 		int mask = spec.base - 1;
 		int shift = 3;
@@ -457,12 +258,13 @@ char *number(char *buf, char *end, unsigned long long num,
 		if (spec.base == 16)
 			shift = 4;
 		do {
-			tmp[i++] = (hex_asc_upper[((unsigned char)num) & mask] | locase);
+			*--p = hex_asc_upper[((unsigned char)num) & mask] | locase;
 			num >>= shift;
 		} while (num);
 	} else { /* base 10 */
-		i = put_dec(tmp, num) - tmp;
+		p = _print_integer_u64(p, num);
 	}
+	i = tmp + sizeof(tmp) - p;
 
 	/* printing 100 using %2d gives "100", not "00" */
 	if (i > precision)
@@ -512,10 +314,11 @@ char *number(char *buf, char *end, unsigned long long num,
 		++buf;
 	}
 	/* actual digits of result */
-	while (--i >= 0) {
+	while (p != tmp + sizeof(tmp)) {
 		if (buf < end)
-			*buf = tmp[i];
+			*buf = *p;
 		++buf;
+		p++;
 	}
 	/* trailing space padding */
 	while (--field_width >= 0) {
@@ -1297,17 +1100,18 @@ char *ip4_string(char *p, const u8 *addr, const char *fmt)
 		break;
 	}
 	for (i = 0; i < 4; i++) {
-		char temp[4] __aligned(2);	/* hold each IP quad in reverse order */
-		int digits = put_dec_trunc8(temp, addr[index]) - temp;
+		char tmp[3];
+		char *q = tmp + sizeof(tmp);
+
+		tmp[0] = tmp[1] = '0';
+		q = _print_integer_u32(q, addr[index]);
 		if (leading_zeros) {
-			if (digits < 3)
-				*p++ = '0';
-			if (digits < 2)
-				*p++ = '0';
+			q = tmp;
 		}
-		/* reverse the digits in the quad */
-		while (digits--)
-			*p++ = temp[digits];
+		do {
+			*p++ = *q++;
+		} while (q != tmp + sizeof(tmp));
+
 		if (i < 3)
 			*p++ = '.';
 		index += step;
-- 
2.24.1