Hello Kumagai, I tested it, and it works well. The following are the results. In a virtual machine (memory 2G): with empty memory: version | num-threads | time(sec) ----------------+---------------+------------- devel | 0 | 12.76 devel | 1 | 19.29 devel | 2 | 11.56 + this patch | 0 | 12.85 + this patch | 1 | 5.61 + this patch | 2 | 2.68 with full memory: version | num-threads | time(sec) ----------------+---------------+------------- devel | 0 | 51.18 devel | 1 | 57.82 devel | 2 | 41.54 + this patch | 0 | 49.25 + this patch | 1 | 44.80 + this patch | 2 | 33.87 In a real machine (memory 16G): with empty memory: version | num-threads | time(sec) ----------------+---------------+------------- devel | 0 | 86.12 devel | 1 | 222.37 devel | 8 | 81.50 devel | 16 | 98.44 + this patch | 0 | 86.07 + this patch | 1 | 84.33 + this patch | 8 | 14.95 + this patch | 16 | 13.96 with full memory: version | num-threads | time(sec) ----------------+---------------+------------- devel | 0 | 540.89 devel | 1 | 715.25 devel | 8 | 132.54 devel | 16 | 112.89 + this patch | 0 | 542.79 + this patch | 1 | 538.22 + this patch | 8 | 108.28 + this patch | 16 | 107.83 -- Thanks Zhou On 10/14/2015 01:24 PM, Atsushi Kumagai wrote: > Hello, > > I have improved the performance issue of parallel compression > which we faced in: > > http://lists.infradead.org/pipermail/kexec/2015-July/014137.html > > The cause of the issue is that compress2() calls malloc() and free() > for a temp buffer in each call, which can cause many page faults since > makedumpfile has to call compress2() for each page. > > It's easy to avoid the issue: just divide compress2() into three > functions, an initialization part, a compression part and a finalization > part. Then we don't need to call the initialization function and the > finalization function for each page. > > In order to benchmark, I measured the execution time and the number of > page faults by *perf stat -e page-faults* on the current devel branch (v1.5.8+). 
> > The result is here: > > CPU: Dual-Core AMD Opteron(tm) Processor 2218 @ 2.6GHz (4 cores) > Memory: 5GB > zlib: 1.2.3-29 > glibc: 2.12-1.132 > > version | num-threads | time(sec) | page-faults > ----------------+---------------+-------------+------------------ > devel | 1 | 133.96 | 21,801,120 > devel | 3 | 87.25 | 21,801,150 > + this patch | 1 | 47.80 | 1,036,408 > + this patch | 3 | 39.14 | 1,036,478 > > > Thanks > Atsushi Kumagai > > > From: Atsushi Kumagai <ats-kumagai at wm.jp.nec.com> > Date: Thu, 8 Oct 2015 15:06:08 +0900 > Subject: [PATCH] Improve performance for parallel compression with zlib. > > compress2() allocates a buffer, compresses the input data and > deallocates the buffer in each call. makedumpfile has to call > compress2() for each page, which can cause big performance > degradation due to many page faults. This issue will be > especially apparent in the case of multi-threaded compression > since a per-thread arena is grown and trimmed more readily than > the main arena. > > Fortunately, the zlib functions called in compress2() are global, > so it's easy to extract the allocation and deallocation part from > compress2(). 
> > Signed-off-by: Atsushi Kumagai <ats-kumagai at wm.jp.nec.com> > --- > makedumpfile.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- > makedumpfile.h | 4 ++++ > 2 files changed, 60 insertions(+), 1 deletion(-) > > diff --git a/makedumpfile.c b/makedumpfile.c > index 06c8baf..fa0b779 100644 > --- a/makedumpfile.c > +++ b/makedumpfile.c > @@ -25,6 +25,7 @@ > #include <sys/time.h> > #include <limits.h> > #include <assert.h> > +#include <zlib.h> > > struct symbol_table symbol_table; > struct size_table size_table; > @@ -3538,6 +3539,11 @@ initial_for_parallel() > MMAP_CACHE_PARALLEL(i)->mmap_start_offset = 0; > MMAP_CACHE_PARALLEL(i)->mmap_end_offset = 0; > > + if (initialize_zlib(&ZLIB_STREAM_PARALLEL(i), Z_BEST_SPEED) == FALSE) { > + ERRMSG("zlib initialization failed.\n"); > + return FALSE; > + } > + > #ifdef USELZO > if ((WRKMEM_PARALLEL(i) = malloc(LZO1X_1_MEM_COMPRESS)) == NULL) { > MSG("Can't allocate memory for the working memory. %s\n", > @@ -3628,6 +3634,7 @@ free_for_parallel() > > free(MMAP_CACHE_PARALLEL(i)); > } > + finalize_zlib(&ZLIB_STREAM_PARALLEL(i)); > #ifdef USELZO > if (WRKMEM_PARALLEL(i) != NULL) > free(WRKMEM_PARALLEL(i)); > @@ -7017,6 +7024,53 @@ write_kdump_page(struct cache_data *cd_header, struct cache_data *cd_page, > return TRUE; > } > > +int initialize_zlib(z_stream *stream, int level) > +{ > + int err; > + > + stream->zalloc = (alloc_func)Z_NULL; > + stream->zfree = (free_func)Z_NULL; > + stream->opaque = (voidpf)Z_NULL; > + > + err = deflateInit(stream, level); > + if (err != Z_OK) { > + ERRMSG("deflateInit failed: %s\n", zError(err)); > + return FALSE; > + } > + return TRUE; > +} > + > +int compress_mdf (z_stream *stream, Bytef *dest, uLongf *destLen, > + const Bytef *source, uLong sourceLen, int level) > +{ > + int err; > + stream->next_in = (Bytef*)source; > + stream->avail_in = (uInt)sourceLen; > + stream->next_out = dest; > + stream->avail_out = (uInt)*destLen; > + if ((uLong)stream->avail_out != *destLen) > + 
return Z_BUF_ERROR; > + > + err = deflate(stream, Z_FINISH); > + > + if (err != Z_STREAM_END) { > + deflateReset(stream); > + return err == Z_OK ? Z_BUF_ERROR : err; > + } > + *destLen = stream->total_out; > + > + err = deflateReset(stream); > + return err; > +} > + > +int finalize_zlib(z_stream *stream) > +{ > + int err; > + err = deflateEnd(stream); > + > + return err; > +} > + > void * > kdump_thread_function_cyclic(void *arg) { > void *retval = PTHREAD_FAIL; > @@ -7035,6 +7089,7 @@ kdump_thread_function_cyclic(void *arg) { > struct mmap_cache *mmap_cache = > MMAP_CACHE_PARALLEL(kdump_thread_args->thread_num); > unsigned long size_out; > + z_stream *stream = &ZLIB_STREAM_PARALLEL(kdump_thread_args->thread_num); > #ifdef USELZO > lzo_bytep wrkmem = WRKMEM_PARALLEL(kdump_thread_args->thread_num); > #endif > @@ -7135,7 +7190,7 @@ kdump_thread_function_cyclic(void *arg) { > size_out = kdump_thread_args->len_buf_out; > if ((info->flag_compress & DUMP_DH_COMPRESSED_ZLIB) > && ((size_out = kdump_thread_args->len_buf_out), > - compress2(buf_out, &size_out, buf, > + compress_mdf(stream, buf_out, &size_out, buf, > info->page_size, > Z_BEST_SPEED) == Z_OK) > && (size_out < info->page_size)) { > diff --git a/makedumpfile.h b/makedumpfile.h > index 0bd6425..cb8f0f3 100644 > --- a/makedumpfile.h > +++ b/makedumpfile.h > @@ -438,6 +438,7 @@ do { \ > #define BUF_PARALLEL(i) info->parallel_info[i].buf > #define BUF_OUT_PARALLEL(i) info->parallel_info[i].buf_out > #define MMAP_CACHE_PARALLEL(i) info->parallel_info[i].mmap_cache > +#define ZLIB_STREAM_PARALLEL(i) info->parallel_info[i].zlib_stream > #ifdef USELZO > #define WRKMEM_PARALLEL(i) info->parallel_info[i].wrkmem > #endif > @@ -1050,6 +1051,7 @@ struct parallel_info { > unsigned char *buf; > unsigned char *buf_out; > struct mmap_cache *mmap_cache; > + z_stream zlib_stream; > #ifdef USELZO > lzo_bytep wrkmem; > #endif > @@ -2051,5 +2053,7 @@ int initial_xen(void); > unsigned long long get_free_memory_size(void); > int 
calculate_cyclic_buffer_size(void); > int prepare_splitblock_table(void); > +int initialize_zlib(z_stream *stream, int level); > +int finalize_zlib(z_stream *stream); > > #endif /* MAKEDUMPFILE_H */ >