Re: [PATCH 1/1] lz4: Implement lz4 with dynamic offset length.

Nick Terrell <terrelln@xxxxxx> · Wed, 21 Mar 2018 12:59:06 -0700

On (03/21/18 10:10), Maninder Singh wrote:
> diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c
> index cc7b6d4..185c358 100644
> --- a/lib/lz4/lz4_compress.c
> +++ b/lib/lz4/lz4_compress.c
> @@ -183,7 +183,8 @@ static FORCE_INLINE int LZ4_compress_generic(
>  	const tableType_t tableType,
>  	const dict_directive dict,
>  	const dictIssue_directive dictIssue,
> -	const U32 acceleration)
> +	const U32 acceleration,
> +	const Dynamic_Offset dynOffset)
>  {
>  	const BYTE *ip = (const BYTE *) source;
>  	const BYTE *base;
> @@ -199,6 +200,7 @@ static FORCE_INLINE int LZ4_compress_generic(
>
>  	BYTE *op = (BYTE *) dest;
>  	BYTE * const olimit = op + maxOutputSize;
> +	int max_distance = dynOffset ? MAX_DISTANCE_DYN : MAX_DISTANCE;

Lets mark this variable `const`. If the compiler doesn't realize that this
variable and `dynOffset` are compile time constants, I expect the speed to
be impacted.

>
>  	U32 forwardH;
>  	size_t refDelta = 0;
> @@ -245,6 +247,7 @@ static FORCE_INLINE int LZ4_compress_generic(
>  	for ( ; ; ) {
>  		const BYTE *match;
>  		BYTE *token;
> +		int curr_offset;
>
>  		/* Find a match */
>  		{
> @@ -285,7 +288,7 @@ static FORCE_INLINE int LZ4_compress_generic(
>  					: 0)
>  				|| ((tableType == byU16)
>  					? 0
> -					: (match + MAX_DISTANCE < ip))
> +					: (match + max_distance < ip))
>  				|| (LZ4_read32(match + refDelta)
>  					!= LZ4_read32(ip)));
>  		}
> @@ -328,8 +331,26 @@ static FORCE_INLINE int LZ4_compress_generic(
>
>  _next_match:
>  		/* Encode Offset */
> -		LZ4_writeLE16(op, (U16)(ip - match));
> -		op += 2;
> +		if (dynOffset) {
> +			curr_offset = (U16)(ip - match);
> +
> +			/*
> +			 * If Ofsset is greater than 127, we need 2 bytes
> +			 * to store it. Otherwise 1 byte is enough.
> +			 */
> +			if (curr_offset > 127) {
> +				curr_offset = (curr_offset << 1) | DYN_BIT;
> +				LZ4_writeLE16(op, (U16)curr_offset);
> +				op += 2;
> +			} else {
> +				curr_offset = curr_offset << 1;
> +				*op = (BYTE)curr_offset;
> +				op++;
> +			}

The standard way to do variable sized integers is to use the high-bit as
the control bit, not the low-bit. Do you have benchmarks to show that using
the low-bit is faster than using the high-bit? If so, lets comment in the
code, if not lets use the high-bit.

> +		} else {
> +			LZ4_writeLE16(op, (U16)(ip - match));
> +			op += 2;
> +		}
>
>  		/* Encode MatchLength */
>  		{
> @@ -480,39 +501,70 @@ static int LZ4_compress_fast_extState(
>  			return LZ4_compress_generic(ctx, source,
>  				dest, inputSize, 0,
>  				noLimit, byU16, noDict,
> -				noDictIssue, acceleration);
> +				noDictIssue, acceleration, NoDynOffset);
>  		else
>  			return LZ4_compress_generic(ctx, source,
>  				dest, inputSize, 0,
>  				noLimit, tableType, noDict,
> -				noDictIssue, acceleration);
> +				noDictIssue, acceleration, NoDynOffset);
>  	} else {
>  		if (inputSize < LZ4_64Klimit)
>  			return LZ4_compress_generic(ctx, source,
>  				dest, inputSize,
>  				maxOutputSize, limitedOutput, byU16, noDict,
> -				noDictIssue, acceleration);
> +				noDictIssue, acceleration, NoDynOffset);
>  		else
>  			return LZ4_compress_generic(ctx, source,
>  				dest, inputSize,
>  				maxOutputSize, limitedOutput, tableType, noDict,
> -				noDictIssue, acceleration);
> +				noDictIssue, acceleration, NoDynOffset);
>  	}
>  }
>
> +static int LZ4_compress_fast_extState_dynamic(
> +	void *state,
> +	const char *source,
> +	char *dest,
> +	int inputSize,
> +	int maxOutputSize,
> +	int acceleration)
> +{
> +	LZ4_stream_t_internal *ctx = &((LZ4_stream_t *)state)->internal_donotuse;
> +
> +	LZ4_resetStream((LZ4_stream_t *)state);
> +
> +	if (acceleration < 1)
> +		acceleration = LZ4_ACCELERATION_DEFAULT;
> +
> +	if (maxOutputSize >= LZ4_COMPRESSBOUND(inputSize))
> +		return LZ4_compress_generic(ctx, source,
> +			dest, inputSize, 0,
> +			noLimit, byU16, noDict,
> +			noDictIssue, acceleration, DynOffset);
> +	else
> +		return LZ4_compress_generic(ctx, source,
> +			dest, inputSize,
> +			maxOutputSize, limitedOutput, byU16, noDict,
> +			noDictIssue, acceleration, DynOffset);
> +}
> +
>  int LZ4_compress_fast(const char *source, char *dest, int inputSize,
> -	int maxOutputSize, int acceleration, void *wrkmem)
> +	int maxOutputSize, int acceleration, void *wrkmem, bool dynOffset)
>  {
> -	return LZ4_compress_fast_extState(wrkmem, source, dest, inputSize,
> +	if (!dynOffset)
> +		return LZ4_compress_fast_extState(wrkmem, source, dest, inputSize,
> +			maxOutputSize, acceleration);
> +
> +	return LZ4_compress_fast_extState_dynamic(wrkmem, source, dest, inputSize,
>  		maxOutputSize, acceleration);
>  }
>  EXPORT_SYMBOL(LZ4_compress_fast);
>
>  int LZ4_compress_default(const char *source, char *dest, int inputSize,
> -	int maxOutputSize, void *wrkmem)
> +	int maxOutputSize, void *wrkmem, bool dynOffset)
>  {
>  	return LZ4_compress_fast(source, dest, inputSize,
> -		maxOutputSize, LZ4_ACCELERATION_DEFAULT, wrkmem);
> +		maxOutputSize, LZ4_ACCELERATION_DEFAULT, wrkmem, dynOffset);
>  }
>  EXPORT_SYMBOL(LZ4_compress_default);
>
> @@ -900,12 +952,12 @@ int LZ4_compress_fast_continue(LZ4_stream_t *LZ4_stream, const char *source,
>  			result = LZ4_compress_generic(
>  				streamPtr, source, dest, inputSize,
>  				maxOutputSize, limitedOutput, byU32,
> -				withPrefix64k, dictSmall, acceleration);
> +				withPrefix64k, dictSmall, acceleration, NoDynOffset);
>  		} else {
>  			result = LZ4_compress_generic(
>  				streamPtr, source, dest, inputSize,
>  				maxOutputSize, limitedOutput, byU32,
> -				withPrefix64k, noDictIssue, acceleration);
> +				withPrefix64k, noDictIssue, acceleration, NoDynOffset);
>  		}
>  		streamPtr->dictSize += (U32)inputSize;
>  		streamPtr->currentOffset += (U32)inputSize;
> @@ -921,12 +973,12 @@ int LZ4_compress_fast_continue(LZ4_stream_t *LZ4_stream, const char *source,
>  			result = LZ4_compress_generic(
>  				streamPtr, source, dest, inputSize,
>  				maxOutputSize, limitedOutput, byU32,
> -				usingExtDict, dictSmall, acceleration);
> +				usingExtDict, dictSmall, acceleration, NoDynOffset);
>  		} else {
>  			result = LZ4_compress_generic(
>  				streamPtr, source, dest, inputSize,
>  				maxOutputSize, limitedOutput, byU32,
> -				usingExtDict, noDictIssue, acceleration);
> +				usingExtDict, noDictIssue, acceleration, NoDynOffset);
>  		}
>  		streamPtr->dictionary = (const BYTE *)source;
>  		streamPtr->dictSize = (U32)inputSize;
> diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
> index 141734d..337a828 100644
> --- a/lib/lz4/lz4_decompress.c
> +++ b/lib/lz4/lz4_decompress.c
> @@ -71,7 +71,9 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  	 /* only if dict == usingExtDict */
>  	 const BYTE * const dictStart,
>  	 /* note : = 0 if noDict */
> -	 const size_t dictSize
> +	 const size_t dictSize,
> +	 /* offset == 1; dynamic offset */
> +	 const Dynamic_Offset dynOffset
>  	 )
>  {
>  	/* Local Variables */
> @@ -141,8 +143,8 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  		/* copy literals */
>  		cpy = op + length;
>  		if (((endOnInput) && ((cpy > (partialDecoding ? oexit : oend - MFLIMIT))
> -			|| (ip + length > iend - (2 + 1 + LASTLITERALS))))
> -			|| ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
> +			|| (ip + length > iend - (2 + LASTLITERALS))))
> +			|| ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH - 1))) {
>  			if (partialDecoding) {
>  				if (cpy > oend) {
>  					/*
> @@ -188,13 +190,31 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  			break;
>  		}
>
> -		LZ4_wildCopy(op, ip, cpy);
> +		if (dynOffset && length < 4)
> +			LZ4_copy4(op, ip);
> +		else
> +			LZ4_wildCopy(op, ip, cpy);
> +

The LZ4 format enforces that the last 5 bytes are literals so that
LZ4_wildCopy() can be used here. I suspect that having this extra branch
here for `dynOffset` mode hurts decompression speed.

>  		ip += length;
>  		op = cpy;
>
>  		/* get offset */
> -		offset = LZ4_readLE16(ip);
> -		ip += 2;
> +		if (dynOffset) {
> +			/*
> +			 * Check if DYN_BIT is set, means 2 Byte Offset,
> +			 * else 1 Byte Offset.
> +			 */
> +			if (*ip & DYN_BIT) {
> +				offset = LZ4_readLE16(ip) >> 1;
> +				ip += 2;
> +			} else {
> +				offset = *ip >> 1;
> +				ip += 1;

If we use the high-bit as the control bit, this branch simply becomes
`offset = *ip`, though the long offset branch becomes a bit longer.

> +			}
> +		} else {
> +			offset = LZ4_readLE16(ip);
> +			ip += 2;
> +		}
>  		match = op - offset;
>
>  		if ((checkOffset) && (unlikely(match < lowLimit))) {
> @@ -335,11 +355,11 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  }
>
>  int LZ4_decompress_safe(const char *source, char *dest,
> -	int compressedSize, int maxDecompressedSize)
> +	int compressedSize, int maxDecompressedSize, bool dynOffset)
>  {
>  	return LZ4_decompress_generic(source, dest, compressedSize,
>  		maxDecompressedSize, endOnInputSize, full, 0,
> -		noDict, (BYTE *)dest, NULL, 0);
> +		noDict, (BYTE *)dest, NULL, 0, dynOffset);

You'll need to use the same trick that LZ4_compress_fast() uses, by hard
coding `dynOffset`. We want the compiler to generate too version of
LZ4_decompress_generic(), one with `dynOffset` and one with `!dynOffset`.
That way the tight loop won't the branches that check `dynOffset`.

	if (dynOffset)
		return LZ4_decompress_generic(..., true);
	else
		return LZ4_decompress_generic(..., false);

Without this trick, I expect that this patch causes a regression to both
LZ4 and LZ4_DYN decompression speed.

>  }
>
>  int LZ4_decompress_safe_partial(const char *source, char *dest,
> @@ -347,14 +367,14 @@ int LZ4_decompress_safe_partial(const char *source, char *dest,
>  {
>  	return LZ4_decompress_generic(source, dest, compressedSize,
>  		maxDecompressedSize, endOnInputSize, partial,
> -		targetOutputSize, noDict, (BYTE *)dest, NULL, 0);
> +		targetOutputSize, noDict, (BYTE *)dest, NULL, 0, NoDynOffset);
>  }
>
>  int LZ4_decompress_fast(const char *source, char *dest, int originalSize)
>  {
>  	return LZ4_decompress_generic(source, dest, 0, originalSize,
>  		endOnOutputSize, full, 0, withPrefix64k,
> -		(BYTE *)(dest - 64 * KB), NULL, 64 * KB);
> +		(BYTE *)(dest - 64 * KB), NULL, 64 * KB, NoDynOffset);
>  }
>
>  int LZ4_setStreamDecode(LZ4_streamDecode_t *LZ4_streamDecode,
> @@ -392,7 +412,7 @@ int LZ4_decompress_safe_continue(LZ4_streamDecode_t *LZ4_streamDecode,
>  			endOnInputSize, full, 0,
>  			usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize,
>  			lz4sd->externalDict,
> -			lz4sd->extDictSize);
> +			lz4sd->extDictSize, NoDynOffset);
>
>  		if (result <= 0)
>  			return result;
> @@ -406,7 +426,7 @@ int LZ4_decompress_safe_continue(LZ4_streamDecode_t *LZ4_streamDecode,
>  			compressedSize, maxOutputSize,
>  			endOnInputSize, full, 0,
>  			usingExtDict, (BYTE *)dest,
> -			lz4sd->externalDict, lz4sd->extDictSize);
> +			lz4sd->externalDict, lz4sd->extDictSize, NoDynOffset);
>  		if (result <= 0)
>  			return result;
>  		lz4sd->prefixSize = result;
> @@ -427,7 +447,7 @@ int LZ4_decompress_fast_continue(LZ4_streamDecode_t *LZ4_streamDecode,
>  			endOnOutputSize, full, 0,
>  			usingExtDict,
>  			lz4sd->prefixEnd - lz4sd->prefixSize,
> -			lz4sd->externalDict, lz4sd->extDictSize);
> +			lz4sd->externalDict, lz4sd->extDictSize, NoDynOffset);
>
>  		if (result <= 0)
>  			return result;
> @@ -440,7 +460,7 @@ int LZ4_decompress_fast_continue(LZ4_streamDecode_t *LZ4_streamDecode,
>  		result = LZ4_decompress_generic(source, dest, 0, originalSize,
>  			endOnOutputSize, full, 0,
>  			usingExtDict, (BYTE *)dest,
> -			lz4sd->externalDict, lz4sd->extDictSize);
> +			lz4sd->externalDict, lz4sd->extDictSize, NoDynOffset);
>  		if (result <= 0)
>  			return result;
>  		lz4sd->prefixSize = originalSize;
> @@ -463,19 +483,19 @@ static FORCE_INLINE int LZ4_decompress_usingDict_generic(const char *source,
>  	if (dictSize == 0)
>  		return LZ4_decompress_generic(source, dest,
>  			compressedSize, maxOutputSize, safe, full, 0,
> -			noDict, (BYTE *)dest, NULL, 0);
> +			noDict, (BYTE *)dest, NULL, 0, NoDynOffset);
>  	if (dictStart + dictSize == dest) {
>  		if (dictSize >= (int)(64 * KB - 1))
>  			return LZ4_decompress_generic(source, dest,
>  				compressedSize, maxOutputSize, safe, full, 0,
> -				withPrefix64k, (BYTE *)dest - 64 * KB, NULL, 0);
> +				withPrefix64k, (BYTE *)dest - 64 * KB, NULL, 0, NoDynOffset);
>  		return LZ4_decompress_generic(source, dest, compressedSize,
>  			maxOutputSize, safe, full, 0, noDict,
> -			(BYTE *)dest - dictSize, NULL, 0);
> +			(BYTE *)dest - dictSize, NULL, 0, NoDynOffset);
>  	}
>  	return LZ4_decompress_generic(source, dest, compressedSize,
>  		maxOutputSize, safe, full, 0, usingExtDict,
> -		(BYTE *)dest, (const BYTE *)dictStart, dictSize);
> +		(BYTE *)dest, (const BYTE *)dictStart, dictSize, NoDynOffset);
>  }
>
>  int LZ4_decompress_safe_usingDict(const char *source, char *dest,
> diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h
> index 00a0b58..9451a73 100644
> --- a/lib/lz4/lz4defs.h
> +++ b/lib/lz4/lz4defs.h
> @@ -75,6 +75,7 @@
>  #define WILDCOPYLENGTH 8
>  #define LASTLITERALS 5
>  #define MFLIMIT (WILDCOPYLENGTH + MINMATCH)
> +#define DYN_BIT 0x1
>
>  /* Increase this value ==> compression run slower on incompressible data */
>  #define LZ4_SKIPTRIGGER 6
> @@ -87,6 +88,7 @@
>
>  #define MAXD_LOG 16
>  #define MAX_DISTANCE ((1 << MAXD_LOG) - 1)
> +#define MAX_DISTANCE_DYN ((1 << (MAXD_LOG - 1)) - 1)
>  #define STEPSIZE sizeof(size_t)
>
>  #define ML_BITS	4
> @@ -147,6 +149,13 @@ static FORCE_INLINE void LZ4_copy8(void *dst, const void *src)
>  #endif
>  }
>
> +static FORCE_INLINE void LZ4_copy4(void *dst, const void *src)
> +{
> +	U32 a = get_unaligned((const U32 *)src);
> +
> +	put_unaligned(a, (U32 *)dst);
> +}
> +
>  /*
>   * customized variant of memcpy,
>   * which can overwrite up to 7 bytes beyond dstEnd
> @@ -224,4 +233,6 @@ static FORCE_INLINE unsigned int LZ4_count(
>  typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive;
>  typedef enum { full = 0, partial = 1 } earlyEnd_directive;
>
> +typedef enum { NoDynOffset = 0, DynOffset = 1 } Dynamic_Offset;
> +
>  #endif
> --
> 1.7.1