From: Nick Terrell <terrelln@xxxxxx> Subject: lz4: fix kernel decompression speed This patch replaces all memcpy() calls with LZ4_memcpy() which calls __builtin_memcpy() so the compiler can inline it. LZ4 relies heavily on memcpy() with a constant size being inlined. In x86 and i386 pre-boot environments memcpy() cannot be inlined because memcpy() doesn't get defined as __builtin_memcpy(). An equivalent patch has been applied upstream so that the next import won't lose this change [1]. I've measured the kernel decompression speed using QEMU before and after this patch for the x86_64 and i386 architectures. The speed-up is about 10x as shown below. Code Arch Kernel Size Time Speed v5.8 x86_64 11504832 B 148 ms 79 MB/s patch x86_64 11503872 B 13 ms 885 MB/s v5.8 i386 9621216 B 91 ms 106 MB/s patch i386 9620224 B 10 ms 962 MB/s I also measured the time to decompress the initramfs on x86_64, i386, and arm. All three show the same decompression speed before and after, as expected. [1] https://github.com/lz4/lz4/pull/890 Link: http://lkml.kernel.org/r/20200803194022.2966806-1-nickrterrell@xxxxxxxxx Signed-off-by: Nick Terrell <terrelln@xxxxxx> Cc: Yann Collet <yann.collet.73@xxxxxxxxx> Cc: Gao Xiang <gaoxiang25@xxxxxxxxxx> Cc: Sven Schmidt <4sschmid@xxxxxxxxxxxxxxxxxxxxxxxxx> Cc: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx> Cc: Ingo Molnar <mingo@xxxxxxxxxx> Cc: Arvind Sankar <nivedita@xxxxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- lib/lz4/lz4_compress.c | 4 ++-- lib/lz4/lz4_decompress.c | 18 +++++++++--------- lib/lz4/lz4defs.h | 10 ++++++++++ lib/lz4/lz4hc_compress.c | 2 +- 4 files changed, 22 insertions(+), 12 deletions(-) --- a/lib/lz4/lz4_compress.c~lz4-fix-kernel-decompression-speed +++ a/lib/lz4/lz4_compress.c @@ -446,7 +446,7 @@ _last_literals: *op++ = (BYTE)(lastRun << ML_BITS); } - memcpy(op, anchor, lastRun); + LZ4_memcpy(op, anchor, lastRun); op += lastRun; } @@ -708,7 +708,7 @@ _last_literals: } else { *op++ = (BYTE)(lastRunSize<<ML_BITS); } - memcpy(op, anchor, lastRunSize); + LZ4_memcpy(op, anchor, lastRunSize); op += lastRunSize; } --- a/lib/lz4/lz4_decompress.c~lz4-fix-kernel-decompression-speed +++ a/lib/lz4/lz4_decompress.c @@ -153,7 +153,7 @@ static FORCE_INLINE int LZ4_decompress_g && likely((endOnInput ? ip < shortiend : 1) & (op <= shortoend))) { /* Copy the literals */ - memcpy(op, ip, endOnInput ? 16 : 8); + LZ4_memcpy(op, ip, endOnInput ? 16 : 8); op += length; ip += length; /* @@ -172,9 +172,9 @@ static FORCE_INLINE int LZ4_decompress_g (offset >= 8) && (dict == withPrefix64k || match >= lowPrefix)) { /* Copy the match. */ - memcpy(op + 0, match + 0, 8); - memcpy(op + 8, match + 8, 8); - memcpy(op + 16, match + 16, 2); + LZ4_memcpy(op + 0, match + 0, 8); + LZ4_memcpy(op + 8, match + 8, 8); + LZ4_memcpy(op + 16, match + 16, 2); op += length + MINMATCH; /* Both stages worked, load the next token. */ continue; @@ -263,7 +263,7 @@ static FORCE_INLINE int LZ4_decompress_g } } - memcpy(op, ip, length); + LZ4_memcpy(op, ip, length); ip += length; op += length; @@ -350,7 +350,7 @@ _copy_match: size_t const copySize = (size_t)(lowPrefix - match); size_t const restSize = length - copySize; - memcpy(op, dictEnd - copySize, copySize); + LZ4_memcpy(op, dictEnd - copySize, copySize); op += copySize; if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ @@ -360,7 +360,7 @@ _copy_match: while (op < endOfMatch) *op++ = *copyFrom++; } else { - memcpy(op, lowPrefix, restSize); + LZ4_memcpy(op, lowPrefix, restSize); op += restSize; } } @@ -386,7 +386,7 @@ _copy_match: while (op < copyEnd) *op++ = *match++; } else { - memcpy(op, match, mlen); + LZ4_memcpy(op, match, mlen); } op = copyEnd; if (op == oend) @@ -400,7 +400,7 @@ _copy_match: op[2] = match[2]; op[3] = match[3]; match += inc32table[offset]; - memcpy(op + 4, match, 4); + LZ4_memcpy(op + 4, match, 4); match -= dec64table[offset]; } else { LZ4_copy8(op, match); --- a/lib/lz4/lz4defs.h~lz4-fix-kernel-decompression-speed +++ a/lib/lz4/lz4defs.h @@ -137,6 +137,16 @@ static FORCE_INLINE void LZ4_writeLE16(v return put_unaligned_le16(value, memPtr); } +/* + * LZ4 relies on memcpy with a constant size being inlined. In freestanding + * environments, the compiler can't assume the implementation of memcpy() is + * standard compliant, so apply its specialized memcpy() inlining logic. When + * possible, use __builtin_memcpy() to tell the compiler to analyze memcpy() + * as-if it were standard compliant, so it can inline it in freestanding + * environments. This is needed when decompressing the Linux Kernel, for example. + */ +#define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size) + static FORCE_INLINE void LZ4_copy8(void *dst, const void *src) { #if LZ4_ARCH64 --- a/lib/lz4/lz4hc_compress.c~lz4-fix-kernel-decompression-speed +++ a/lib/lz4/lz4hc_compress.c @@ -570,7 +570,7 @@ _Search3: *op++ = (BYTE) lastRun; } else *op++ = (BYTE)(lastRun<<ML_BITS); - memcpy(op, anchor, iend - anchor); + LZ4_memcpy(op, anchor, iend - anchor); op += iend - anchor; } _