Lookup tables are cheating. Signed-off-by: Alexey Dobriyan <adobriyan@xxxxxxxxx> --- lib/vsprintf.c | 236 +++++-------------------------------------------- 1 file changed, 20 insertions(+), 216 deletions(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index df2b5ce08fe9..b29fbd0da53f 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -138,204 +138,6 @@ int skip_atoi(const char **s) return i; } -/* - * Decimal conversion is by far the most typical, and is used for - * /proc and /sys data. This directly impacts e.g. top performance - * with many processes running. We optimize it for speed by emitting - * two characters at a time, using a 200 byte lookup table. This - * roughly halves the number of multiplications compared to computing - * the digits one at a time. Implementation strongly inspired by the - * previous version, which in turn used ideas described at - * <http://www.cs.uiowa.edu/~jones/bcd/divide.html> (with permission - * from the author, Douglas W. Jones). - * - * It turns out there is precisely one 26 bit fixed-point - * approximation a of 64/100 for which x/100 == (x * (u64)a) >> 32 - * holds for all x in [0, 10^8-1], namely a = 0x28f5c29. The actual - * range happens to be somewhat larger (x <= 1073741898), but that's - * irrelevant for our purpose. - * - * For dividing a number in the range [10^4, 10^6-1] by 100, we still - * need a 32x32->64 bit multiply, so we simply use the same constant. - * - * For dividing a number in the range [100, 10^4-1] by 100, there are - * several options. The simplest is (x * 0x147b) >> 19, which is valid - * for all x <= 43698. - */ - -static const u16 decpair[100] = { -#define _(x) (__force u16) cpu_to_le16(((x % 10) | ((x / 10) << 8)) + 0x3030) - _( 0), _( 1), _( 2), _( 3), _( 4), _( 5), _( 6), _( 7), _( 8), _( 9), - _(10), _(11), _(12), _(13), _(14), _(15), _(16), _(17), _(18), _(19), - _(20), _(21), _(22), _(23), _(24), _(25), _(26), _(27), _(28), _(29), - _(30), _(31), _(32), _(33), _(34), _(35), _(36), _(37), _(38), _(39), - _(40), _(41), _(42), _(43), _(44), _(45), _(46), _(47), _(48), _(49), - _(50), _(51), _(52), _(53), _(54), _(55), _(56), _(57), _(58), _(59), - _(60), _(61), _(62), _(63), _(64), _(65), _(66), _(67), _(68), _(69), - _(70), _(71), _(72), _(73), _(74), _(75), _(76), _(77), _(78), _(79), - _(80), _(81), _(82), _(83), _(84), _(85), _(86), _(87), _(88), _(89), - _(90), _(91), _(92), _(93), _(94), _(95), _(96), _(97), _(98), _(99), -#undef _ -}; - -/* - * This will print a single '0' even if r == 0, since we would - * immediately jump to out_r where two 0s would be written but only - * one of them accounted for in buf. This is needed by ip4_string - * below. All other callers pass a non-zero value of r. -*/ -static noinline_for_stack -char *put_dec_trunc8(char *buf, unsigned r) -{ - unsigned q; - - /* 1 <= r < 10^8 */ - if (r < 100) - goto out_r; - - /* 100 <= r < 10^8 */ - q = (r * (u64)0x28f5c29) >> 32; - *((u16 *)buf) = decpair[r - 100*q]; - buf += 2; - - /* 1 <= q < 10^6 */ - if (q < 100) - goto out_q; - - /* 100 <= q < 10^6 */ - r = (q * (u64)0x28f5c29) >> 32; - *((u16 *)buf) = decpair[q - 100*r]; - buf += 2; - - /* 1 <= r < 10^4 */ - if (r < 100) - goto out_r; - - /* 100 <= r < 10^4 */ - q = (r * 0x147b) >> 19; - *((u16 *)buf) = decpair[r - 100*q]; - buf += 2; -out_q: - /* 1 <= q < 100 */ - r = q; -out_r: - /* 1 <= r < 100 */ - *((u16 *)buf) = decpair[r]; - buf += r < 10 ? 1 : 2; - return buf; -} - -#if BITS_PER_LONG == 64 && BITS_PER_LONG_LONG == 64 -static noinline_for_stack -char *put_dec_full8(char *buf, unsigned r) -{ - unsigned q; - - /* 0 <= r < 10^8 */ - q = (r * (u64)0x28f5c29) >> 32; - *((u16 *)buf) = decpair[r - 100*q]; - buf += 2; - - /* 0 <= q < 10^6 */ - r = (q * (u64)0x28f5c29) >> 32; - *((u16 *)buf) = decpair[q - 100*r]; - buf += 2; - - /* 0 <= r < 10^4 */ - q = (r * 0x147b) >> 19; - *((u16 *)buf) = decpair[r - 100*q]; - buf += 2; - - /* 0 <= q < 100 */ - *((u16 *)buf) = decpair[q]; - buf += 2; - return buf; -} - -static noinline_for_stack -char *put_dec(char *buf, unsigned long long n) -{ - if (n >= 100*1000*1000) - buf = put_dec_full8(buf, do_div(n, 100*1000*1000)); - /* 1 <= n <= 1.6e11 */ - if (n >= 100*1000*1000) - buf = put_dec_full8(buf, do_div(n, 100*1000*1000)); - /* 1 <= n < 1e8 */ - return put_dec_trunc8(buf, n); -} - -#elif BITS_PER_LONG == 32 && BITS_PER_LONG_LONG == 64 - -static void -put_dec_full4(char *buf, unsigned r) -{ - unsigned q; - - /* 0 <= r < 10^4 */ - q = (r * 0x147b) >> 19; - *((u16 *)buf) = decpair[r - 100*q]; - buf += 2; - /* 0 <= q < 100 */ - *((u16 *)buf) = decpair[q]; -} - -/* - * Call put_dec_full4 on x % 10000, return x / 10000. - * The approximation x/10000 == (x * 0x346DC5D7) >> 43 - * holds for all x < 1,128,869,999. The largest value this - * helper will ever be asked to convert is 1,125,520,955. - * (second call in the put_dec code, assuming n is all-ones). - */ -static noinline_for_stack -unsigned put_dec_helper4(char *buf, unsigned x) -{ - uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43; - - put_dec_full4(buf, x - q * 10000); - return q; -} - -/* Based on code by Douglas W. Jones found at - * <http://www.cs.uiowa.edu/~jones/bcd/decimal.html#sixtyfour> - * (with permission from the author). - * Performs no 64-bit division and hence should be fast on 32-bit machines. - */ -static -char *put_dec(char *buf, unsigned long long n) -{ - uint32_t d3, d2, d1, q, h; - - if (n < 100*1000*1000) - return put_dec_trunc8(buf, n); - - d1 = ((uint32_t)n >> 16); /* implicit "& 0xffff" */ - h = (n >> 32); - d2 = (h ) & 0xffff; - d3 = (h >> 16); /* implicit "& 0xffff" */ - - /* n = 2^48 d3 + 2^32 d2 + 2^16 d1 + d0 - = 281_4749_7671_0656 d3 + 42_9496_7296 d2 + 6_5536 d1 + d0 */ - q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff); - q = put_dec_helper4(buf, q); - - q += 7671 * d3 + 9496 * d2 + 6 * d1; - q = put_dec_helper4(buf+4, q); - - q += 4749 * d3 + 42 * d2; - q = put_dec_helper4(buf+8, q); - - q += 281 * d3; - buf += 12; - if (q) - buf = put_dec_trunc8(buf, q); - else while (buf[-1] == '0') - --buf; - - return buf; -} - -#endif - /* * Convert passed number to decimal string. * Returns the length of string. On buffer overflow, returns 0. @@ -410,8 +212,6 @@ static noinline_for_stack char *number(char *buf, char *end, unsigned long long num, struct printf_spec spec) { - /* put_dec requires 2-byte alignment of the buffer. */ - char tmp[3 * sizeof(num)] __aligned(2); char sign; char locase; int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10); @@ -419,6 +219,8 @@ char *number(char *buf, char *end, unsigned long long num, bool is_zero = num == 0LL; int field_width = spec.field_width; int precision = spec.precision; + char tmp[22]; + char *p = tmp + sizeof(tmp); /* locase = 0 or 0x20. ORing digits or letters with 'locase' * produces same digits or (maybe lowercased) letters */ @@ -446,10 +248,9 @@ char *number(char *buf, char *end, unsigned long long num, field_width--; } - /* generate full string in tmp[], in reverse order */ - i = 0; + /* generate full string in tmp[] */ if (num < spec.base) - tmp[i++] = hex_asc_upper[num] | locase; + *--p = hex_asc_upper[num] | locase; else if (spec.base != 10) { /* 8 or 16 */ int mask = spec.base - 1; int shift = 3; @@ -457,12 +258,13 @@ char *number(char *buf, char *end, unsigned long long num, if (spec.base == 16) shift = 4; do { - tmp[i++] = (hex_asc_upper[((unsigned char)num) & mask] | locase); + *--p = hex_asc_upper[((unsigned char)num) & mask] | locase; num >>= shift; } while (num); } else { /* base 10 */ - i = put_dec(tmp, num) - tmp; + p = _print_integer_u64(p, num); } + i = tmp + sizeof(tmp) - p; /* printing 100 using %2d gives "100", not "00" */ if (i > precision) @@ -512,10 +314,11 @@ char *number(char *buf, char *end, unsigned long long num, ++buf; } /* actual digits of result */ - while (--i >= 0) { + while (p != tmp + sizeof(tmp)) { if (buf < end) - *buf = tmp[i]; + *buf = *p; ++buf; + p++; } /* trailing space padding */ while (--field_width >= 0) { @@ -1297,17 +1100,18 @@ char *ip4_string(char *p, const u8 *addr, const char *fmt) break; } for (i = 0; i < 4; i++) { - char temp[4] __aligned(2); /* hold each IP quad in reverse order */ - int digits = put_dec_trunc8(temp, addr[index]) - temp; + char tmp[3]; + char *q = tmp + sizeof(tmp); + + tmp[0] = tmp[1] = '0'; + q = _print_integer_u32(q, addr[index]); if (leading_zeros) { - if (digits < 3) - *p++ = '0'; - if (digits < 2) - *p++ = '0'; + q = tmp; } - /* reverse the digits in the quad */ - while (digits--) - *p++ = temp[digits]; + do { + *p++ = *q++; + } while (q != tmp + sizeof(tmp)); + if (i < 3) *p++ = '.'; index += step; -- 2.24.1