The time-honored way of printing integers via vsnprintf() or equivalent
has the unavoidable slowdown of parsing the format string. This can't be
fixed in C without introducing an external preprocessor.

seq_put_decimal_ull() partially saves the day, but there are still a lot
of branches inside, and overcopying.

The _print_integer_*() family of functions is meant to make printing
integers as fast as possible by deleting format string parsing and doing
as little work as possible.

It is based on the following observations:

1) memcpy is done in the forward direction
   (it can be done backwards, but nobody does that),

2) digits can be extracted in a very simple loop which costs only
   1 multiplication and shift (division by a constant is not division).

All of the above asks for the following signature, semantics and pattern
of printing out beloved /proc files:

	/* seq_printf(seq, "%u %llu\n", A, B); */

	char buf[10 + 1 + 20 + 1];
	char *p = buf + sizeof(buf);

	*--p = '\n';
	p = _print_integer_u64(p, B);
	*--p = ' ';
	p = _print_integer_u32(p, A);

	seq_write(seq, p, buf + sizeof(buf) - p);

1) A stack buffer capable of holding the biggest string is allocated.

2) "p" is a pointer to the start of the string. Initially it points past
   the end of the buffer, WHICH IS NOT NUL-TERMINATED!

3) _print_integer_*() actually prints an integer from right to left
   and returns the new start of the string.

	<--------|
	      123
	      ^
	      |
	      +-- p

4) 1 character is printed with

	*--p = 'x';

   It generates very efficient code as multiple writes can be merged.

5) A fixed string is printed with

	p = memcpy(p - 3, "foo", 3);

   Compilers know what memcpy() does and write-combine it.
   4/8-byte writes become 1 instruction and are very efficient.

6) Once everything is printed, the result is written to the seq_file
   buffer. It does only one overflow check and 1 copy.

This generates very efficient code (and small!).

In regular seq_printf() calls, the first argument and the format string
are constantly reloaded. The format string will most likely be referenced
via [rip+...], which is quite verbose.
seq_put_decimal_ull() will do branches (and even more branches with "width" argument) TODO benchmark with mainline because nouveau is broken for me -( vsnprintf() changes make the code slower Signed-off-by: Alexey Dobriyan <adobriyan@xxxxxxxxx> --- MAINTAINERS | 6 ++++++ lib/Makefile | 2 +- lib/print-integer.c | 40 ++++++++++++++++++++++++++++++++++++++++ lib/print-integer.h | 20 ++++++++++++++++++++ 4 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 lib/print-integer.c create mode 100644 lib/print-integer.h diff --git a/MAINTAINERS b/MAINTAINERS index b816a453b10e..8322125bb929 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8470,6 +8470,12 @@ L: linux-crypto@xxxxxxxxxxxxxxx S: Maintained F: drivers/crypto/inside-secure/ +INTEGER PRINTING PRESS +M: Alexey Dobriyan <adobriyan@xxxxxxxxx> +L: linux-kernel@xxxxxxxxxxxxxxx +F: lib/print-integer.[ch] +S: Maintained + INTEGRITY MEASUREMENT ARCHITECTURE (IMA) M: Mimi Zohar <zohar@xxxxxxxxxxxxx> M: Dmitry Kasatkin <dmitry.kasatkin@xxxxxxxxx> diff --git a/lib/Makefile b/lib/Makefile index 685aee60de1d..a2f011fa6739 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -25,7 +25,7 @@ KASAN_SANITIZE_string.o := n CFLAGS_string.o := $(call cc-option, -fno-stack-protector) endif -lib-y := ctype.o string.o vsprintf.o cmdline.o \ +lib-y := ctype.o string.o print-integer.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o timerqueue.o xarray.o \ idr.o extable.o sha1.o irq_regs.o argv_split.o \ flex_proportions.o ratelimit.o show_mem.o \ diff --git a/lib/print-integer.c b/lib/print-integer.c new file mode 100644 index 000000000000..563aaca19b8c --- /dev/null +++ b/lib/print-integer.c @@ -0,0 +1,40 @@ +#include <linux/compiler.h> +#include <linux/math64.h> +#include <linux/string.h> +#include <linux/types.h> + +#include "print-integer.h" + +noinline +char *_print_integer_u32(char *p, u32 x) +{ + do { + *--p = '0' + (x % 10); + } while (x /= 10); + return p; +} + +noinline +char *_print_integer_s32(char *p, s32 x) +{ + if (x < 
0) { + p = _print_integer_u32(p, -x); + *--p = '-'; + return p; + } else { + return _print_integer_u32(p, x); + } +} + +noinline +char *_print_integer_u64(char *p, u64 x) +{ + while (x >= 100 * 1000 * 1000) { + u32 r; + + x = div_u64_rem(x, 100 * 1000 * 1000, &r); + p = memset(p - 8, '0', 8); + (void)_print_integer_u32(p + 8, r); + } + return _print_integer_u32(p, x); +} diff --git a/lib/print-integer.h b/lib/print-integer.h new file mode 100644 index 000000000000..a6f8e1757a6f --- /dev/null +++ b/lib/print-integer.h @@ -0,0 +1,20 @@ +#pragma once +char *_print_integer_u32(char *, u32); +char *_print_integer_u64(char *, u64); +char *_print_integer_s32(char *, s32); + +static inline char *_print_integer_ul(char *p, unsigned long x) +{ +#ifdef CONFIG_64BIT + return _print_integer_u64(p, x); +#else + return _print_integer_u32(p, x); +#endif +} + +enum { + LEN_U32 = 10, + LEN_S32 = 1 + LEN_U32, + LEN_UL = sizeof(long) * 5 / 2, + LEN_U64 = 20, +}; -- 2.24.1