[PATCH] makedumpfile: Improve performance for parallel compression with zlib.

zhouwj-fnst@xxxxxxxxxxxxxx ("Zhou, Wenjian/周文剑") · Thu, 22 Oct 2015 11:11:10 +0800

Hello Kumagai,

I tested it, and it works well. The following are the results.

In a virtual machine (2 GB of memory):
with empty memory:
          version      |  num-threads  |  time(sec)
       ----------------+---------------+-------------
            devel      |       0       |    12.76
            devel      |       1       |    19.29
            devel      |       2       |    11.56
        + this patch   |       0       |    12.85
        + this patch   |       1       |     5.61
        + this patch   |       2       |     2.68

with full memory:
          version      |  num-threads  |  time(sec)
       ----------------+---------------+-------------
            devel      |       0       |    51.18
            devel      |       1       |    57.82
            devel      |       2       |    41.54
        + this patch   |       0       |    49.25
        + this patch   |       1       |    44.80
        + this patch   |       2       |    33.87

On a real machine (16 GB of memory):
with empty memory:
          version      |  num-threads  |  time(sec)
       ----------------+---------------+-------------
            devel      |       0       |    86.12
            devel      |       1       |   222.37
            devel      |       8       |    81.50
            devel      |       16      |    98.44
        + this patch   |       0       |    86.07
        + this patch   |       1       |    84.33
        + this patch   |       8       |    14.95
        + this patch   |       16      |    13.96

with full memory:
          version      |  num-threads  |  time(sec)
       ----------------+---------------+-------------
            devel      |       0       |   540.89
            devel      |       1       |   715.25
            devel      |       8       |   132.54
            devel      |       16      |   112.89
        + this patch   |       0       |   542.79
        + this patch   |       1       |   538.22
        + this patch   |       8       |   108.28
        + this patch   |       16      |   107.83

-- 
Thanks
Zhou

On 10/14/2015 01:24 PM, Atsushi Kumagai wrote:
> Hello,
> 
> I have fixed the performance issue with parallel compression
> that we faced in:
> 
>    http://lists.infradead.org/pipermail/kexec/2015-July/014137.html
> 
> The cause of the issue is that compress2() calls malloc() and free()
> for a temporary buffer in each call, which can cause many page faults
> since makedumpfile has to call compress2() for each page.
> 
> It's easy to avoid the issue: just divide compress2() into three
> functions: an initialization part, a compression part and a
> finalization part. Then we don't need to call the initialization
> function and the finalization function for each page.
> 
> In order to benchmark, I measured the execution time and the number of
> page faults by *perf stat -e page-faults* on the current devel branch(v1.5.8+).
> 
> The result is here:
> 
>    CPU:   Dual-Core AMD Opteron(tm) Processor 2218 @ 2.6GHz (4 cores)
>    Memory:  5GB
>    zlib:  1.2.3-29
>    glibc: 2.12-1.132
> 
>          version      |  num-threads  |  time(sec)  |   page-faults
>       ----------------+---------------+-------------+------------------
>            devel      |       1       |   133.96    |    21,801,120
>            devel      |       3       |    87.25    |    21,801,150
>        + this patch   |       1       |    47.80    |     1,036,408
>        + this patch   |       3       |    39.14    |     1,036,478
> 
> 
> Thanks
> Atsushi Kumagai
> 
> 
> From: Atsushi Kumagai <ats-kumagai at wm.jp.nec.com>
> Date: Thu, 8 Oct 2015 15:06:08 +0900
> Subject: [PATCH] Improve performance for parallel compression with zlib.
> 
> compress2() allocates a buffer, compresses the input data and
> deallocates the buffer in each call. makedumpfile has to call
> compress2() for each page, which can cause a big performance
> degradation due to many page faults. This issue is especially
> apparent in the case of multi-threaded compression, since a
> per-thread arena is grown and trimmed more easily than the
> main arena.
> 
> Fortunately, the zlib functions called in compress2() are global,
> so it's easy to extract the allocation and deallocation parts from
> compress2().
> 
> Signed-off-by: Atsushi Kumagai <ats-kumagai at wm.jp.nec.com>
> ---
>   makedumpfile.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>   makedumpfile.h |  4 ++++
>   2 files changed, 60 insertions(+), 1 deletion(-)
> 
> diff --git a/makedumpfile.c b/makedumpfile.c
> index 06c8baf..fa0b779 100644
> --- a/makedumpfile.c
> +++ b/makedumpfile.c
> @@ -25,6 +25,7 @@
>   #include <sys/time.h>
>   #include <limits.h>
>   #include <assert.h>
> +#include <zlib.h>
>   
>   struct symbol_table	symbol_table;
>   struct size_table	size_table;
> @@ -3538,6 +3539,11 @@ initial_for_parallel()
>   		MMAP_CACHE_PARALLEL(i)->mmap_start_offset = 0;
>   		MMAP_CACHE_PARALLEL(i)->mmap_end_offset = 0;
>   
> +		if (initialize_zlib(&ZLIB_STREAM_PARALLEL(i), Z_BEST_SPEED) == FALSE) {
> +			ERRMSG("zlib initialization failed.\n");
> +			return FALSE;
> +		}
> +
>   #ifdef USELZO
>   		if ((WRKMEM_PARALLEL(i) = malloc(LZO1X_1_MEM_COMPRESS)) == NULL) {
>   			MSG("Can't allocate memory for the working memory. %s\n",
> @@ -3628,6 +3634,7 @@ free_for_parallel()
>   
>   				free(MMAP_CACHE_PARALLEL(i));
>   			}
> +			finalize_zlib(&ZLIB_STREAM_PARALLEL(i));
>   #ifdef USELZO
>   			if (WRKMEM_PARALLEL(i) != NULL)
>   				free(WRKMEM_PARALLEL(i));
> @@ -7017,6 +7024,53 @@ write_kdump_page(struct cache_data *cd_header, struct cache_data *cd_page,
>   	return TRUE;
>   }
>   
> +int initialize_zlib(z_stream *stream, int level)
> +{
> +	int err;
> +
> +	stream->zalloc = (alloc_func)Z_NULL;
> +	stream->zfree = (free_func)Z_NULL;
> +	stream->opaque = (voidpf)Z_NULL;
> +
> +	err = deflateInit(stream, level);
> +	if (err != Z_OK) {
> +		ERRMSG("deflateInit failed: %s\n", zError(err));
> +		return FALSE;
> +	}
> +	return TRUE;
> +}
> +
> +int compress_mdf (z_stream *stream, Bytef *dest, uLongf *destLen,
> +		  const Bytef *source, uLong sourceLen, int level)
> +{
> +	int err;
> +	stream->next_in = (Bytef*)source;
> +	stream->avail_in = (uInt)sourceLen;
> +	stream->next_out = dest;
> +	stream->avail_out = (uInt)*destLen;
> +	if ((uLong)stream->avail_out != *destLen)
> +		return Z_BUF_ERROR;
> +
> +	err = deflate(stream, Z_FINISH);
> +
> +	if (err != Z_STREAM_END) {
> +		deflateReset(stream);
> +		return err == Z_OK ? Z_BUF_ERROR : err;
> +	}
> +	*destLen = stream->total_out;
> +
> +	err = deflateReset(stream);
> +	return err;
> +}
> +
> +int finalize_zlib(z_stream *stream)
> +{
> +	int err;
> +	err = deflateEnd(stream);
> +
> +	return err;
> +}
> +
>   void *
>   kdump_thread_function_cyclic(void *arg) {
>   	void *retval = PTHREAD_FAIL;
> @@ -7035,6 +7089,7 @@ kdump_thread_function_cyclic(void *arg) {
>   	struct mmap_cache *mmap_cache =
>   			MMAP_CACHE_PARALLEL(kdump_thread_args->thread_num);
>   	unsigned long size_out;
> +	z_stream *stream = &ZLIB_STREAM_PARALLEL(kdump_thread_args->thread_num);
>   #ifdef USELZO
>   	lzo_bytep wrkmem = WRKMEM_PARALLEL(kdump_thread_args->thread_num);
>   #endif
> @@ -7135,7 +7190,7 @@ kdump_thread_function_cyclic(void *arg) {
>   			size_out = kdump_thread_args->len_buf_out;
>   			if ((info->flag_compress & DUMP_DH_COMPRESSED_ZLIB)
>   			    && ((size_out = kdump_thread_args->len_buf_out),
> -				compress2(buf_out, &size_out, buf,
> +				compress_mdf(stream, buf_out, &size_out, buf,
>   					  info->page_size,
>   					  Z_BEST_SPEED) == Z_OK)
>   			    && (size_out < info->page_size)) {
> diff --git a/makedumpfile.h b/makedumpfile.h
> index 0bd6425..cb8f0f3 100644
> --- a/makedumpfile.h
> +++ b/makedumpfile.h
> @@ -438,6 +438,7 @@ do { \
>   #define BUF_PARALLEL(i)			info->parallel_info[i].buf
>   #define BUF_OUT_PARALLEL(i)		info->parallel_info[i].buf_out
>   #define MMAP_CACHE_PARALLEL(i)		info->parallel_info[i].mmap_cache
> +#define ZLIB_STREAM_PARALLEL(i)		info->parallel_info[i].zlib_stream
>   #ifdef USELZO
>   #define WRKMEM_PARALLEL(i)		info->parallel_info[i].wrkmem
>   #endif
> @@ -1050,6 +1051,7 @@ struct parallel_info {
>   	unsigned char		*buf;
>   	unsigned char 		*buf_out;
>   	struct mmap_cache	*mmap_cache;
> +	z_stream		zlib_stream;
>   #ifdef USELZO
>   	lzo_bytep		wrkmem;
>   #endif
> @@ -2051,5 +2053,7 @@ int initial_xen(void);
>   unsigned long long get_free_memory_size(void);
>   int calculate_cyclic_buffer_size(void);
>   int prepare_splitblock_table(void);
> +int initialize_zlib(z_stream *stream, int level);
> +int finalize_zlib(z_stream *stream);
>   
>   #endif /* MAKEDUMPFILE_H */
>