Re: [PATCH 1/1] lz4: Implement lz4 with dynamic offset length.

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



CC: Vaneet Narang.

 
On (03/21/18 10:10), Maninder Singh wrote:
> diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c
> index cc7b6d4..185c358 100644
> --- a/lib/lz4/lz4_compress.c
> +++ b/lib/lz4/lz4_compress.c
> @@ -183,7 +183,8 @@ static FORCE_INLINE int LZ4_compress_generic(
>          const tableType_t tableType,
>          const dict_directive dict,
>          const dictIssue_directive dictIssue,
> -        const U32 acceleration)
> +        const U32 acceleration,
> +        const Dynamic_Offset dynOffset)
>  {
>          const BYTE *ip = (const BYTE *) source;
>          const BYTE *base;
> @@ -199,6 +200,7 @@ static FORCE_INLINE int LZ4_compress_generic(
>
>          BYTE *op = (BYTE *) dest;
>          BYTE * const olimit = op + maxOutputSize;
> +        int max_distance = dynOffset ? MAX_DISTANCE_DYN : MAX_DISTANCE;
 
Let's mark this variable `const`. If the compiler doesn't realize that this
variable and `dynOffset` are compile-time constants, I expect the speed to
be impacted.
 
>
>          U32 forwardH;
>          size_t refDelta = 0;
> @@ -245,6 +247,7 @@ static FORCE_INLINE int LZ4_compress_generic(
>          for ( ; ; ) {
>                  const BYTE *match;
>                  BYTE *token;
> +                int curr_offset;
>
>                  /* Find a match */
>                  {
> @@ -285,7 +288,7 @@ static FORCE_INLINE int LZ4_compress_generic(
>                                          : 0)
>                                  || ((tableType == byU16)
>                                          ? 0
> -                                        : (match + MAX_DISTANCE < ip))
> +                                        : (match + max_distance < ip))
>                                  || (LZ4_read32(match + refDelta)
>                                          != LZ4_read32(ip)));
>                  }
> @@ -328,8 +331,26 @@ static FORCE_INLINE int LZ4_compress_generic(
>
>  _next_match:
>                  /* Encode Offset */
> -                LZ4_writeLE16(op, (U16)(ip - match));
> -                op += 2;
> +                if (dynOffset) {
> +                        curr_offset = (U16)(ip - match);
> +
> +                        /*
> +                         * If Ofsset is greater than 127, we need 2 bytes
> +                         * to store it. Otherwise 1 byte is enough.
> +                         */
> +                        if (curr_offset > 127) {
> +                                curr_offset = (curr_offset << 1) | DYN_BIT;
> +                                LZ4_writeLE16(op, (U16)curr_offset);
> +                                op += 2;
> +                        } else {
> +                                curr_offset = curr_offset << 1;
> +                                *op = (BYTE)curr_offset;
> +                                op++;
> +                        }
 
The standard way to do variable-sized integers is to use the high bit as
the control bit, not the low bit. Do you have benchmarks showing that using
the low bit is faster than using the high bit? If so, let's document that in
the code; if not, let's use the high bit.
 
> +                } else {
> +                        LZ4_writeLE16(op, (U16)(ip - match));
> +                        op += 2;
> +                }
>
>                  /* Encode MatchLength */
>                  {
> @@ -480,39 +501,70 @@ static int LZ4_compress_fast_extState(
>                          return LZ4_compress_generic(ctx, source,
>                                  dest, inputSize, 0,
>                                  noLimit, byU16, noDict,
> -                                noDictIssue, acceleration);
> +                                noDictIssue, acceleration, NoDynOffset);
>                  else
>                          return LZ4_compress_generic(ctx, source,
>                                  dest, inputSize, 0,
>                                  noLimit, tableType, noDict,
> -                                noDictIssue, acceleration);
> +                                noDictIssue, acceleration, NoDynOffset);
>          } else {
>                  if (inputSize < LZ4_64Klimit)
>                          return LZ4_compress_generic(ctx, source,
>                                  dest, inputSize,
>                                  maxOutputSize, limitedOutput, byU16, noDict,
> -                                noDictIssue, acceleration);
> +                                noDictIssue, acceleration, NoDynOffset);
>                  else
>                          return LZ4_compress_generic(ctx, source,
>                                  dest, inputSize,
>                                  maxOutputSize, limitedOutput, tableType, noDict,
> -                                noDictIssue, acceleration);
> +                                noDictIssue, acceleration, NoDynOffset);
>          }
>  }
>
> +static int LZ4_compress_fast_extState_dynamic(
> +        void *state,
> +        const char *source,
> +        char *dest,
> +        int inputSize,
> +        int maxOutputSize,
> +        int acceleration)
> +{
> +        LZ4_stream_t_internal *ctx = &((LZ4_stream_t *)state)->internal_donotuse;
> +
> +        LZ4_resetStream((LZ4_stream_t *)state);
> +
> +        if (acceleration < 1)
> +                acceleration = LZ4_ACCELERATION_DEFAULT;
> +
> +        if (maxOutputSize >= LZ4_COMPRESSBOUND(inputSize))
> +                return LZ4_compress_generic(ctx, source,
> +                        dest, inputSize, 0,
> +                        noLimit, byU16, noDict,
> +                        noDictIssue, acceleration, DynOffset);
> +        else
> +                return LZ4_compress_generic(ctx, source,
> +                        dest, inputSize,
> +                        maxOutputSize, limitedOutput, byU16, noDict,
> +                        noDictIssue, acceleration, DynOffset);
> +}
> +
>  int LZ4_compress_fast(const char *source, char *dest, int inputSize,
> -        int maxOutputSize, int acceleration, void *wrkmem)
> +        int maxOutputSize, int acceleration, void *wrkmem, bool dynOffset)
>  {
> -        return LZ4_compress_fast_extState(wrkmem, source, dest, inputSize,
> +        if (!dynOffset)
> +                return LZ4_compress_fast_extState(wrkmem, source, dest, inputSize,
> +                        maxOutputSize, acceleration);
> +
> +        return LZ4_compress_fast_extState_dynamic(wrkmem, source, dest, inputSize,
>                  maxOutputSize, acceleration);
>  }
>  EXPORT_SYMBOL(LZ4_compress_fast);
>
>  int LZ4_compress_default(const char *source, char *dest, int inputSize,
> -        int maxOutputSize, void *wrkmem)
> +        int maxOutputSize, void *wrkmem, bool dynOffset)
>  {
>          return LZ4_compress_fast(source, dest, inputSize,
> -                maxOutputSize, LZ4_ACCELERATION_DEFAULT, wrkmem);
> +                maxOutputSize, LZ4_ACCELERATION_DEFAULT, wrkmem, dynOffset);
>  }
>  EXPORT_SYMBOL(LZ4_compress_default);
>
> @@ -900,12 +952,12 @@ int LZ4_compress_fast_continue(LZ4_stream_t *LZ4_stream, const char *source,
>                          result = LZ4_compress_generic(
>                                  streamPtr, source, dest, inputSize,
>                                  maxOutputSize, limitedOutput, byU32,
> -                                withPrefix64k, dictSmall, acceleration);
> +                                withPrefix64k, dictSmall, acceleration, NoDynOffset);
>                  } else {
>                          result = LZ4_compress_generic(
>                                  streamPtr, source, dest, inputSize,
>                                  maxOutputSize, limitedOutput, byU32,
> -                                withPrefix64k, noDictIssue, acceleration);
> +                                withPrefix64k, noDictIssue, acceleration, NoDynOffset);
>                  }
>                  streamPtr->dictSize += (U32)inputSize;
>                  streamPtr->currentOffset += (U32)inputSize;
> @@ -921,12 +973,12 @@ int LZ4_compress_fast_continue(LZ4_stream_t *LZ4_stream, const char *source,
>                          result = LZ4_compress_generic(
>                                  streamPtr, source, dest, inputSize,
>                                  maxOutputSize, limitedOutput, byU32,
> -                                usingExtDict, dictSmall, acceleration);
> +                                usingExtDict, dictSmall, acceleration, NoDynOffset);
>                  } else {
>                          result = LZ4_compress_generic(
>                                  streamPtr, source, dest, inputSize,
>                                  maxOutputSize, limitedOutput, byU32,
> -                                usingExtDict, noDictIssue, acceleration);
> +                                usingExtDict, noDictIssue, acceleration, NoDynOffset);
>                  }
>                  streamPtr->dictionary = (const BYTE *)source;
>                  streamPtr->dictSize = (U32)inputSize;
> diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
> index 141734d..337a828 100644
> --- a/lib/lz4/lz4_decompress.c
> +++ b/lib/lz4/lz4_decompress.c
> @@ -71,7 +71,9 @@ static FORCE_INLINE int LZ4_decompress_generic(
>           /* only if dict == usingExtDict */
>           const BYTE * const dictStart,
>           /* note : = 0 if noDict */
> -         const size_t dictSize
> +         const size_t dictSize,
> +         /* offset == 1; dynamic offset */
> +         const Dynamic_Offset dynOffset
>           )
>  {
>          /* Local Variables */
> @@ -141,8 +143,8 @@ static FORCE_INLINE int LZ4_decompress_generic(
>                  /* copy literals */
>                  cpy = op + length;
>                  if (((endOnInput) && ((cpy > (partialDecoding ? oexit : oend - MFLIMIT))
> -                        || (ip + length > iend - (2 + 1 + LASTLITERALS))))
> -                        || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
> +                        || (ip + length > iend - (2 + LASTLITERALS))))
> +                        || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH - 1))) {
>                          if (partialDecoding) {
>                                  if (cpy > oend) {
>                                          /*
> @@ -188,13 +190,31 @@ static FORCE_INLINE int LZ4_decompress_generic(
>                          break;
>                  }
>
> -                LZ4_wildCopy(op, ip, cpy);
> +                if (dynOffset && length < 4)
> +                        LZ4_copy4(op, ip);
> +                else
> +                        LZ4_wildCopy(op, ip, cpy);
> +
 
The LZ4 format enforces that the last 5 bytes are literals precisely so that
LZ4_wildCopy() can be used here unconditionally. I suspect that adding this
extra branch for `dynOffset` mode hurts decompression speed.
 
>                  ip += length;
>                  op = cpy;
>
>                  /* get offset */
> -                offset = LZ4_readLE16(ip);
> -                ip += 2;
> +                if (dynOffset) {
> +                        /*
> +                         * Check if DYN_BIT is set, means 2 Byte Offset,
> +                         * else 1 Byte Offset.
> +                         */
> +                        if (*ip & DYN_BIT) {
> +                                offset = LZ4_readLE16(ip) >> 1;
> +                                ip += 2;
> +                        } else {
> +                                offset = *ip >> 1;
> +                                ip += 1;
 
If we use the high bit as the control bit, this branch simply becomes
`offset = *ip`, though the long-offset branch becomes a bit longer.
 
> +                        }
> +                } else {
> +                        offset = LZ4_readLE16(ip);
> +                        ip += 2;
> +                }
>                  match = op - offset;
>
>                  if ((checkOffset) && (unlikely(match < lowLimit))) {
> @@ -335,11 +355,11 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  }
>
>  int LZ4_decompress_safe(const char *source, char *dest,
> -        int compressedSize, int maxDecompressedSize)
> +        int compressedSize, int maxDecompressedSize, bool dynOffset)
>  {
>          return LZ4_decompress_generic(source, dest, compressedSize,
>                  maxDecompressedSize, endOnInputSize, full, 0,
> -                noDict, (BYTE *)dest, NULL, 0);
> +                noDict, (BYTE *)dest, NULL, 0, dynOffset);
 
You'll need to use the same trick that LZ4_compress_fast() uses, by hard-coding
`dynOffset`. We want the compiler to generate two versions of
LZ4_decompress_generic(), one with `dynOffset` and one with `!dynOffset`.
That way the tight loop won't have the branches that check `dynOffset`.
 
        if (dynOffset)
                return LZ4_decompress_generic(..., true);
        else
                return LZ4_decompress_generic(..., false);
 
Without this trick, I expect that this patch causes a regression to both
LZ4 and LZ4_DYN decompression speed.
 
>  }
>
>  int LZ4_decompress_safe_partial(const char *source, char *dest,
> @@ -347,14 +367,14 @@ int LZ4_decompress_safe_partial(const char *source, char *dest,
>  {
>          return LZ4_decompress_generic(source, dest, compressedSize,
>                  maxDecompressedSize, endOnInputSize, partial,
> -                targetOutputSize, noDict, (BYTE *)dest, NULL, 0);
> +                targetOutputSize, noDict, (BYTE *)dest, NULL, 0, NoDynOffset);
>  }
>
>  int LZ4_decompress_fast(const char *source, char *dest, int originalSize)
>  {
>          return LZ4_decompress_generic(source, dest, 0, originalSize,
>                  endOnOutputSize, full, 0, withPrefix64k,
> -                (BYTE *)(dest - 64 * KB), NULL, 64 * KB);
> +                (BYTE *)(dest - 64 * KB), NULL, 64 * KB, NoDynOffset);
>  }
>
>  int LZ4_setStreamDecode(LZ4_streamDecode_t *LZ4_streamDecode,
> @@ -392,7 +412,7 @@ int LZ4_decompress_safe_continue(LZ4_streamDecode_t *LZ4_streamDecode,
>                          endOnInputSize, full, 0,
>                          usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize,
>                          lz4sd->externalDict,
> -                        lz4sd->extDictSize);
> +                        lz4sd->extDictSize, NoDynOffset);
>
>                  if (result <= 0)
>                          return result;
> @@ -406,7 +426,7 @@ int LZ4_decompress_safe_continue(LZ4_streamDecode_t *LZ4_streamDecode,
>                          compressedSize, maxOutputSize,
>                          endOnInputSize, full, 0,
>                          usingExtDict, (BYTE *)dest,
> -                        lz4sd->externalDict, lz4sd->extDictSize);
> +                        lz4sd->externalDict, lz4sd->extDictSize, NoDynOffset);
>                  if (result <= 0)
>                          return result;
>                  lz4sd->prefixSize = result;
> @@ -427,7 +447,7 @@ int LZ4_decompress_fast_continue(LZ4_streamDecode_t *LZ4_streamDecode,
>                          endOnOutputSize, full, 0,
>                          usingExtDict,
>                          lz4sd->prefixEnd - lz4sd->prefixSize,
> -                        lz4sd->externalDict, lz4sd->extDictSize);
> +                        lz4sd->externalDict, lz4sd->extDictSize, NoDynOffset);
>
>                  if (result <= 0)
>                          return result;
> @@ -440,7 +460,7 @@ int LZ4_decompress_fast_continue(LZ4_streamDecode_t *LZ4_streamDecode,
>                  result = LZ4_decompress_generic(source, dest, 0, originalSize,
>                          endOnOutputSize, full, 0,
>                          usingExtDict, (BYTE *)dest,
> -                        lz4sd->externalDict, lz4sd->extDictSize);
> +                        lz4sd->externalDict, lz4sd->extDictSize, NoDynOffset);
>                  if (result <= 0)
>                          return result;
>                  lz4sd->prefixSize = originalSize;
> @@ -463,19 +483,19 @@ static FORCE_INLINE int LZ4_decompress_usingDict_generic(const char *source,
>          if (dictSize == 0)
>                  return LZ4_decompress_generic(source, dest,
>                          compressedSize, maxOutputSize, safe, full, 0,
> -                        noDict, (BYTE *)dest, NULL, 0);
> +                        noDict, (BYTE *)dest, NULL, 0, NoDynOffset);
>          if (dictStart + dictSize == dest) {
>                  if (dictSize >= (int)(64 * KB - 1))
>                          return LZ4_decompress_generic(source, dest,
>                                  compressedSize, maxOutputSize, safe, full, 0,
> -                                withPrefix64k, (BYTE *)dest - 64 * KB, NULL, 0);
> +                                withPrefix64k, (BYTE *)dest - 64 * KB, NULL, 0, NoDynOffset);
>                  return LZ4_decompress_generic(source, dest, compressedSize,
>                          maxOutputSize, safe, full, 0, noDict,
> -                        (BYTE *)dest - dictSize, NULL, 0);
> +                        (BYTE *)dest - dictSize, NULL, 0, NoDynOffset);
>          }
>          return LZ4_decompress_generic(source, dest, compressedSize,
>                  maxOutputSize, safe, full, 0, usingExtDict,
> -                (BYTE *)dest, (const BYTE *)dictStart, dictSize);
> +                (BYTE *)dest, (const BYTE *)dictStart, dictSize, NoDynOffset);
>  }
>
>  int LZ4_decompress_safe_usingDict(const char *source, char *dest,
> diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h
> index 00a0b58..9451a73 100644
> --- a/lib/lz4/lz4defs.h
> +++ b/lib/lz4/lz4defs.h
> @@ -75,6 +75,7 @@
>  #define WILDCOPYLENGTH 8
>  #define LASTLITERALS 5
>  #define MFLIMIT (WILDCOPYLENGTH + MINMATCH)
> +#define DYN_BIT 0x1
>
>  /* Increase this value ==> compression run slower on incompressible data */
>  #define LZ4_SKIPTRIGGER 6
> @@ -87,6 +88,7 @@
>
>  #define MAXD_LOG 16
>  #define MAX_DISTANCE ((1 << MAXD_LOG) - 1)
> +#define MAX_DISTANCE_DYN ((1 << (MAXD_LOG - 1)) - 1)
>  #define STEPSIZE sizeof(size_t)
>
>  #define ML_BITS        4
> @@ -147,6 +149,13 @@ static FORCE_INLINE void LZ4_copy8(void *dst, const void *src)
>  #endif
>  }
>
> +static FORCE_INLINE void LZ4_copy4(void *dst, const void *src)
> +{
> +        U32 a = get_unaligned((const U32 *)src);
> +
> +        put_unaligned(a, (U32 *)dst);
> +}
> +
>  /*
>   * customized variant of memcpy,
>   * which can overwrite up to 7 bytes beyond dstEnd
> @@ -224,4 +233,6 @@ static FORCE_INLINE unsigned int LZ4_count(
>  typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive;
>  typedef enum { full = 0, partial = 1 } earlyEnd_directive;
>
> +typedef enum { NoDynOffset = 0, DynOffset = 1 } Dynamic_Offset;
> +
>  #endif
> --
> 1.7.1
 
 




[Index of Archives]     [Kernel]     [Gnu Classpath]     [Gnu Crypto]     [DM Crypt]     [Netfilter]     [Bugtraq]

  Powered by Linux