Hi Sven, On Sun, Feb 12, 2017 at 12:16:18PM +0100, Sven Schmidt wrote: > /*-************************************ > * Reading and writing into memory > **************************************/ > +typedef union { > + U16 u16; > + U32 u32; > + size_t uArch; > +} __packed unalign; > > -static inline U16 LZ4_read16(const void *memPtr) > +static FORCE_INLINE __maybe_unused U16 LZ4_read16(const void *ptr) > { > - U16 val; > - > - memcpy(&val, memPtr, sizeof(val)); > - > - return val; > + return ((const unalign *)ptr)->u16; > } > > -static inline U32 LZ4_read32(const void *memPtr) > +static FORCE_INLINE __maybe_unused U32 LZ4_read32(const void *ptr) > { > - U32 val; > - > - memcpy(&val, memPtr, sizeof(val)); > - > - return val; > + return ((const unalign *)ptr)->u32; > } > > -static inline size_t LZ4_read_ARCH(const void *memPtr) > +static FORCE_INLINE __maybe_unused size_t LZ4_read_ARCH(const void *ptr) > { > - size_t val; > - > - memcpy(&val, memPtr, sizeof(val)); > - > - return val; > + return ((const unalign *)ptr)->uArch; > } > > -static inline void LZ4_write16(void *memPtr, U16 value) > +static FORCE_INLINE __maybe_unused void LZ4_write16(void *memPtr, U16 value) > { > - memcpy(memPtr, &value, sizeof(value)); > + ((unalign *)memPtr)->u16 = value; > } > > -static inline void LZ4_write32(void *memPtr, U32 value) > -{ > - memcpy(memPtr, &value, sizeof(value)); > +static FORCE_INLINE __maybe_unused void LZ4_write32(void *memPtr, U32 value) { > + ((unalign *)memPtr)->u32 = value; > } > > -static inline U16 LZ4_readLE16(const void *memPtr) > +static FORCE_INLINE __maybe_unused U16 LZ4_readLE16(const void *memPtr) > { > -#ifdef __LITTLE_ENDIAN__ > +#if LZ4_LITTLE_ENDIAN > return LZ4_read16(memPtr); > #else > const BYTE *p = (const BYTE *)memPtr; > @@ -137,19 +143,19 @@ static inline U16 LZ4_readLE16(const void *memPtr) > #endif > } Since upstream LZ4 is intended to be compiled at -O3, this may allow it to get away with using memcpy() for unaligned memory accesses. The reason it uses memcpy() is that, other than a byte-by-byte copy, it is the only portable way to express unaligned memory accesses. But the Linux kernel is sometimes compiled optimized for size (-Os), and I wouldn't be *too* surprised if some of the memcpy()'s don't always get inlined then, which could be causing the performance regression being observed. (Of course, this could be verified by checking whether CONFIG_CC_OPTIMIZE_FOR_SIZE=y is set, then reading the assembly.) But I don't think accessing a __packed structure directly is the right alternative. Instead, Linux already includes macros for unaligned memory accesses which have been optimized for every supported architecture. Those should just be used instead, e.g. like this: static FORCE_INLINE U16 LZ4_read16(const void *ptr) { return get_unaligned((const u16 *)ptr); } static FORCE_INLINE U32 LZ4_read32(const void *ptr) { return get_unaligned((const u32 *)ptr); } static FORCE_INLINE size_t LZ4_read_ARCH(const void *ptr) { return get_unaligned((const size_t *)ptr); } static FORCE_INLINE void LZ4_write16(void *memPtr, U16 value) { put_unaligned(value, (u16 *)memPtr); } static FORCE_INLINE void LZ4_write32(void *memPtr, U32 value) { put_unaligned(value, (u32 *)memPtr); } static FORCE_INLINE U16 LZ4_readLE16(const void *memPtr) { return get_unaligned_le16(memPtr); } static FORCE_INLINE void LZ4_writeLE16(void *memPtr, U16 value) { return put_unaligned_le16(value, memPtr); } static FORCE_INLINE void LZ4_copy8(void *dst, const void *src) { if (LZ4_64bits()) { u64 a = get_unaligned((const u64 *)src); put_unaligned(a, (u64 *)dst); } else { u32 a = get_unaligned((const u32 *)src); u32 b = get_unaligned((const u32 *)src + 1); put_unaligned(a, (u32 *)dst); put_unaligned(b, (u32 *)dst + 1); } } Note that I dropped __maybe_unused as it's not needed on inline functions. That should be done everywhere else the patch proposes to add it too. > -#if LZ4_ARCH64 > -#ifdef __BIG_ENDIAN__ > -#define LZ4_NBCOMMONBYTES(val) (__builtin_clzll(val) >> 3) > +static FORCE_INLINE unsigned int LZ4_NbCommonBytes(register size_t val) > +{ > +#if LZ4_LITTLE_ENDIAN > +#if LZ4_ARCH64 /* 64 Bits Little Endian */ > +#if defined(LZ4_FORCE_SW_BITCOUNT) > + static const int DeBruijnBytePos[64] = { > + 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, > + 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, > + 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, > + 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 > + }; > + > + return DeBruijnBytePos[((U64)((val & -(long long)val) > + * 0x0218A392CDABBD3FULL)) >> 58]; > #else > -#define LZ4_NBCOMMONBYTES(val) (__builtin_ctzll(val) >> 3) > -#endif > + return (__builtin_ctzll((U64)val) >> 3); > +#endif /* defined(LZ4_FORCE_SW_BITCOUNT) */ > +#else /* 32 Bits Little Endian */ > +#if defined(LZ4_FORCE_SW_BITCOUNT) > + static const int DeBruijnBytePos[32] = { > + 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, > + 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 > + }; > + > + return DeBruijnBytePos[((U32)((val & -(S32)val) > + * 0x077CB531U)) >> 27]; > #else > -#ifdef __BIG_ENDIAN__ > -#define LZ4_NBCOMMONBYTES(val) (__builtin_clz(val) >> 3) > + return (__builtin_ctz((U32)val) >> 3); > +#endif /* defined(LZ4_FORCE_SW_BITCOUNT) */ > +#endif /* LZ4_ARCH64 */ > +#else /* Big Endian */ > +#if LZ4_ARCH64 /* 64 Bits Big Endian */ > +#if defined(LZ4_FORCE_SW_BITCOUNT) > + unsigned int r; > + > + if (!(val >> 32)) { > + r = 4; > + } else { > + r = 0; > + val >>= 32; > + } > + > + if (!(val >> 16)) { > + r += 2; > + val >>= 8; > + } else { > + val >>= 24; > + } > + > + r += (!val); > + > + return r; > #else > -#define LZ4_NBCOMMONBYTES(val) (__builtin_ctz(val) >> 3) > -#endif > -#endif > + return (__builtin_clzll((U64)val) >> 3); > +#endif /* defined(LZ4_FORCE_SW_BITCOUNT) */ > +#else /* 32 Bits Big Endian */ > +#if defined(LZ4_FORCE_SW_BITCOUNT) > + unsigned int r; > + > + if (!(val >> 16)) { > + r = 2; > + val >>= 8; > + } else { > + r = 0; > + val >>= 24; > + } > + > + r += (!val); > + > + return r; > +#else > + return (__builtin_clz((U32)val) >> 3); > +#endif /* defined(LZ4_FORCE_SW_BITCOUNT) */ > +#endif /* LZ4_ARCH64 */ > +#endif /* LZ4_LITTLE_ENDIAN */ > +} The reason LZ4_NbCommonBytes() in upstream LZ4 is so complicated is that it needs to provide portable fallbacks that work on *any* platform and compiler. This isn't needed in the Linux kernel, and it should just call the functions already defined that do the right thing: static FORCE_INLINE unsigned int LZ4_NbCommonBytes(register size_t val) { if (LZ4_isLittleEndian()) return __ffs(val) >> 3; else return (BITS_PER_LONG - 1 - __fls(val)) >> 3; } To be clear, when I said that upstream LZ4 shouldn't generally be changed, I'm primarily talking about the core code, not the platform-specific parts. What we need to do is define platform-specific stuff, like LZ4_read*(), LZ4_write*(), LZ4_NbCommonBytes(), LZ4_64bits(), and FORCE_INLINE, in a way that makes sense for the Linux kernel and the environment it's compiled in. Also I think it's fine, and maybe even necessary for performance, to *add* inline or FORCE_INLINE in some places too, given that LZ4 in Linux may get compiled with a lower optimization level than that intended to be used for upstream LZ4 --- though it may be worth considering updating the Makefile to just always compile the LZ4 files with -O3 instead. What should be avoided is making unnecessary changes to the *users* of the platform-specific code or to the core (de)compression parameters or templates. Eric