Hello Zhou, >Currently, num_threads means the producer number. The main thread >is not included. However, num_threads is specified by the option >"--num-threads". So this may make user confused why it still has >performance degradation when the value by "--num-threads" equals >the usable cpu number. > >I think it will be more nature to make "--num-threads" be producer >number + 1. In other words, "--num-threads" should include the >main thread. The most part of cpu time will be spent for compression while the main thread(Consumer) just does I/O work. If the usable cpu number is 2 and --num-thread is also 2, your patch reserves a whole cpu for the main thread and the compression will be done by just one cpu. Is that really the multi thread processing which we wanted ? If there are 4 usable cpus, I guess 4 producers with the main thread is faster than 3 producers with the main thread. However, according to my quick test, there were no significant differences between the two cases. (This was measured on a small VM(5GB) without your patch, the number of trials is 3, so just for information.) # makedumpfile -c --num-thread [3|4] | | time[s] CPUs | -num-threads | 1st | 2nd | 3rd -------+---------------+----------+----------+---------- 4 | 3 | 29.54 30.19 28.26 4 | 4 | 29.80 31.75 28.19 Then, I noticed that the cpu usage of the main thread is almost 100% while it doesn't need a lot of cpu resources in theory. Do you have any idea why the main thread require a lot of cpu resources ? If it's an unintended behavior, I believe the case of --num-thread=4 can be faster if the cpu usage of the main thread is reduced. Thanks, Atsushi Kumagai >--- > makedumpfile.c | 108 ++++++++++++++++++++++++++++----------------------------- > makedumpfile.h | 4 +-- > 2 files changed, 56 insertions(+), 56 deletions(-) > >diff --git a/makedumpfile.c b/makedumpfile.c >index 21784e8..19019a4 100644 >--- a/makedumpfile.c >+++ b/makedumpfile.c >@@ -1407,13 +1407,13 @@ open_dump_bitmap(void) > } > } > >- if (info->num_threads) { >+ if (info->num_producers) { > /* > * Reserve file descriptors of bitmap for creating dumpfiles > * parallelly, because a bitmap file will be unlinked just after > * this and it is not possible to open a bitmap file later. > */ >- for (i = 0; i < info->num_threads; i++) { >+ for (i = 0; i < info->num_producers; i++) { > if ((fd = open(info->name_bitmap, O_RDONLY)) < 0) { > ERRMSG("Can't open the bitmap file(%s). %s\n", > info->name_bitmap, strerror(errno)); >@@ -3495,9 +3495,9 @@ initialize_bitmap_memory(void) > } > > void >-initialize_bitmap_memory_parallel(struct dump_bitmap *bitmap, int thread_num) >+initialize_bitmap_memory_parallel(struct dump_bitmap *bitmap, int producer_num) > { >- bitmap->fd = FD_BITMAP_MEMORY_PARALLEL(thread_num); >+ bitmap->fd = FD_BITMAP_MEMORY_PARALLEL(producer_num); > bitmap->file_name = info->name_memory; > bitmap->no_block = -1; > memset(bitmap->buf, 0, BUFSIZE_BITMAP); >@@ -3529,24 +3529,24 @@ initial_for_parallel() > /* > * allocate memory for threads > */ >- if ((info->threads = malloc(sizeof(pthread_t *) * info->num_threads)) >+ if ((info->threads = malloc(sizeof(pthread_t *) * info->num_producers)) > == NULL) { > MSG("Can't allocate memory for threads. %s\n", > strerror(errno)); > return FALSE; > } >- memset(info->threads, 0, sizeof(pthread_t *) * info->num_threads); >+ memset(info->threads, 0, sizeof(pthread_t *) * info->num_producers); > > if ((info->kdump_thread_args = >- malloc(sizeof(struct thread_args) * info->num_threads)) >+ malloc(sizeof(struct thread_args) * info->num_producers)) > == NULL) { > MSG("Can't allocate memory for arguments of threads. %s\n", > strerror(errno)); > return FALSE; > } >- memset(info->kdump_thread_args, 0, sizeof(struct thread_args) * info->num_threads); >+ memset(info->kdump_thread_args, 0, sizeof(struct thread_args) * info->num_producers); > >- for (i = 0; i < info->num_threads; i++) { >+ for (i = 0; i < info->num_producers; i++) { > if ((info->threads[i] = malloc(sizeof(pthread_t))) == NULL) { > MSG("Can't allocate memory for thread %d. %s", > i, strerror(errno)); >@@ -3592,7 +3592,7 @@ initial_for_parallel() > #endif > } > >- info->num_buffers = PAGE_DATA_NUM * info->num_threads; >+ info->num_buffers = PAGE_DATA_NUM * info->num_producers; > > /* > * allocate memory for page_data >@@ -3615,17 +3615,17 @@ initial_for_parallel() > } > > /* >- * initial page_flag for each thread >+ * initial page_flag for each producer thread > */ >- if ((info->page_flag_buf = malloc(sizeof(void *) * info->num_threads)) >+ if ((info->page_flag_buf = malloc(sizeof(void *) * info->num_producers)) > == NULL) { > MSG("Can't allocate memory for page_flag_buf. %s\n", > strerror(errno)); > return FALSE; > } >- memset(info->page_flag_buf, 0, sizeof(void *) * info->num_threads); >+ memset(info->page_flag_buf, 0, sizeof(void *) * info->num_producers); > >- for (i = 0; i < info->num_threads; i++) { >+ for (i = 0; i < info->num_producers; i++) { > if ((info->page_flag_buf[i] = calloc(1, sizeof(struct page_flag))) == NULL) { > MSG("Can't allocate memory for page_flag. %s\n", > strerror(errno)); >@@ -3645,9 +3645,9 @@ initial_for_parallel() > } > > /* >- * initial fd_memory for threads >+ * initial fd_memory for producer threads > */ >- for (i = 0; i < info->num_threads; i++) { >+ for (i = 0; i < info->num_producers; i++) { > if ((FD_MEMORY_PARALLEL(i) = open(info->name_memory, O_RDONLY)) > < 0) { > ERRMSG("Can't open the dump memory(%s). %s\n", >@@ -3673,7 +3673,7 @@ free_for_parallel() > struct page_flag *current; > > if (info->threads != NULL) { >- for (i = 0; i < info->num_threads; i++) { >+ for (i = 0; i < info->num_producers; i++) { > if (info->threads[i] != NULL) > free(info->threads[i]); > >@@ -3714,7 +3714,7 @@ free_for_parallel() > } > > if (info->page_flag_buf != NULL) { >- for (i = 0; i < info->num_threads; i++) { >+ for (i = 0; i < info->num_producers; i++) { > for (j = 0; j < PAGE_FLAG_NUM; j++) { > if (info->page_flag_buf[i] != NULL) { > current = info->page_flag_buf[i]; >@@ -3729,7 +3729,7 @@ free_for_parallel() > if (info->parallel_info == NULL) > return; > >- for (i = 0; i < info->num_threads; i++) { >+ for (i = 0; i < info->num_producers; i++) { > if (FD_MEMORY_PARALLEL(i) > 0) > close(FD_MEMORY_PARALLEL(i)); > >@@ -3936,7 +3936,7 @@ out: > DEBUG_MSG("Buffer size for the cyclic mode: %ld\n", info->bufsize_cyclic); > } > >- if (info->num_threads) { >+ if (info->num_producers) { > if (is_xen_memory()) { > MSG("'--num-threads' option is disable,\n"); > MSG("because %s is Xen's memory core image.\n", >@@ -4066,9 +4066,9 @@ initialize_2nd_bitmap(struct dump_bitmap *bitmap) > } > > void >-initialize_2nd_bitmap_parallel(struct dump_bitmap *bitmap, int thread_num) >+initialize_2nd_bitmap_parallel(struct dump_bitmap *bitmap, int producer_num) > { >- bitmap->fd = FD_BITMAP_PARALLEL(thread_num); >+ bitmap->fd = FD_BITMAP_PARALLEL(producer_num); > bitmap->file_name = info->name_bitmap; > bitmap->no_block = -1; > memset(bitmap->buf, 0, BUFSIZE_BITMAP); >@@ -7558,7 +7558,7 @@ kdump_thread_function_cyclic(void *arg) { > volatile struct page_flag *page_flag_buf = kdump_thread_args->page_flag_buf; > struct cycle *cycle = kdump_thread_args->cycle; > mdf_pfn_t pfn = cycle->start_pfn; >- int index = kdump_thread_args->thread_num; >+ int index = kdump_thread_args->producer_num; > int buf_ready; > int dumpable; > int fd_memory = 0; >@@ -7566,21 +7566,21 @@ kdump_thread_function_cyclic(void *arg) { > struct dump_bitmap bitmap_memory_parallel = {0}; > unsigned char *buf = NULL, *buf_out = NULL; > struct mmap_cache *mmap_cache = >- MMAP_CACHE_PARALLEL(kdump_thread_args->thread_num); >+ MMAP_CACHE_PARALLEL(kdump_thread_args->producer_num); > unsigned long size_out; >- z_stream *stream = &ZLIB_STREAM_PARALLEL(kdump_thread_args->thread_num); >+ z_stream *stream = &ZLIB_STREAM_PARALLEL(kdump_thread_args->producer_num); > #ifdef USELZO >- lzo_bytep wrkmem = WRKMEM_PARALLEL(kdump_thread_args->thread_num); >+ lzo_bytep wrkmem = WRKMEM_PARALLEL(kdump_thread_args->producer_num); > #endif > #ifdef USESNAPPY > unsigned long len_buf_out_snappy = > snappy_max_compressed_length(info->page_size); > #endif > >- buf = BUF_PARALLEL(kdump_thread_args->thread_num); >- buf_out = BUF_OUT_PARALLEL(kdump_thread_args->thread_num); >+ buf = BUF_PARALLEL(kdump_thread_args->producer_num); >+ buf_out = BUF_OUT_PARALLEL(kdump_thread_args->producer_num); > >- fd_memory = FD_MEMORY_PARALLEL(kdump_thread_args->thread_num); >+ fd_memory = FD_MEMORY_PARALLEL(kdump_thread_args->producer_num); > > if (info->fd_bitmap) { > bitmap_parallel.buf = malloc(BUFSIZE_BITMAP); >@@ -7590,7 +7590,7 @@ kdump_thread_function_cyclic(void *arg) { > goto fail; > } > initialize_2nd_bitmap_parallel(&bitmap_parallel, >- kdump_thread_args->thread_num); >+ kdump_thread_args->producer_num); > } > > if (info->flag_refiltering) { >@@ -7601,7 +7601,7 @@ kdump_thread_function_cyclic(void *arg) { > goto fail; > } > initialize_bitmap_memory_parallel(&bitmap_memory_parallel, >- kdump_thread_args->thread_num); >+ kdump_thread_args->producer_num); > } > > /* >@@ -7802,8 +7802,8 @@ write_kdump_pages_parallel_cyclic(struct cache_data *cd_header, > for (i = 0; i < page_buf_num; i++) > page_data_buf[i].used = FALSE; > >- for (i = 0; i < info->num_threads; i++) { >- kdump_thread_args[i].thread_num = i; >+ for (i = 0; i < info->num_producers; i++) { >+ kdump_thread_args[i].producer_num = i; > kdump_thread_args[i].len_buf_out = len_buf_out; > kdump_thread_args[i].page_data_buf = page_data_buf; > kdump_thread_args[i].page_flag_buf = info->page_flag_buf[i]; >@@ -7843,13 +7843,13 @@ write_kdump_pages_parallel_cyclic(struct cache_data *cd_header, > * consuming is used for recording in which thread the pfn is the smallest. > * current_pfn is used for recording the value of pfn when checking the pfn. > */ >- for (i = 0; i < info->num_threads; i++) { >+ for (i = 0; i < info->num_producers; i++) { > if (info->page_flag_buf[i]->ready == FLAG_UNUSED) > continue; > temp_pfn = info->page_flag_buf[i]->pfn; > > /* >- * count how many threads have reached the end. >+ * count how many producer threads have reached the end. > */ > if (temp_pfn >= end_pfn) { > info->page_flag_buf[i]->ready = FLAG_UNUSED; >@@ -7865,9 +7865,9 @@ write_kdump_pages_parallel_cyclic(struct cache_data *cd_header, > } > > /* >- * If all the threads have reached the end, we will finish writing. >+ * If all producer threads have reached the end, we will finish writing. > */ >- if (end_count >= info->num_threads) >+ if (end_count >= info->num_producers) > goto finish; > > /* >@@ -7930,7 +7930,7 @@ finish: > > out: > if (threads != NULL) { >- for (i = 0; i < info->num_threads; i++) { >+ for (i = 0; i < info->num_producers; i++) { > if (threads[i] != NULL) { > res = pthread_cancel(*threads[i]); > if (res != 0 && res != ESRCH) >@@ -7939,7 +7939,7 @@ out: > } > } > >- for (i = 0; i < info->num_threads; i++) { >+ for (i = 0; i < info->num_producers; i++) { > if (threads[i] != NULL) { > res = pthread_join(*threads[i], &thread_result); > if (res != 0) >@@ -8553,7 +8553,7 @@ write_kdump_pages_and_bitmap_cyclic(struct cache_data *cd_header, struct cache_d > if (!write_kdump_bitmap2(&cycle)) > return FALSE; > >- if (info->num_threads) { >+ if (info->num_producers) { > if (!write_kdump_pages_parallel_cyclic(cd_header, > cd_page, &pd_zero, > &offset_data, &cycle)) >@@ -10531,7 +10531,7 @@ check_param_for_creating_dumpfile(int argc, char *argv[]) > if (info->flag_sadump_diskset && !sadump_is_supported_arch()) > return FALSE; > >- if (info->num_threads) { >+ if (info->num_producers) { > if (info->flag_split) { > MSG("--num-threads cannot used with --split.\n"); > return FALSE; >@@ -10607,16 +10607,16 @@ check_param_for_creating_dumpfile(int argc, char *argv[]) > } else > return FALSE; > >- if (info->num_threads) { >+ if (info->num_producers) { > if ((info->parallel_info = >- malloc(sizeof(parallel_info_t) * info->num_threads)) >+ malloc(sizeof(parallel_info_t) * info->num_producers)) > == NULL) { > MSG("Can't allocate memory for parallel_info.\n"); > return FALSE; > } > > memset(info->parallel_info, 0, sizeof(parallel_info_t) >- * info->num_threads); >+ * info->num_producers); > } > > return TRUE; >@@ -10721,20 +10721,20 @@ calculate_cyclic_buffer_size(void) { > limit_size = get_free_memory_size() * 0.6; > > /* >- * Recalculate the limit_size according to num_threads. >- * And reset num_threads if there is not enough memory. >+ * Recalculate the limit_size according to num_producers. >+ * And reset --num-threads if there is not enough memory. > */ >- if (info->num_threads > 0) { >+ if (info->num_producers > 0) { > if (limit_size <= maximum_size) { > MSG("There isn't enough memory for multi-threads.\n"); >- info->num_threads = 0; >+ info->num_producers = 0; > } >- else if ((limit_size - maximum_size) / info->num_threads < THREAD_REGION) { >- MSG("There isn't enough memory for %d threads.\n", info->num_threads); >- info->num_threads = (limit_size - maximum_size) / THREAD_REGION; >- MSG("--num_threads is set to %d.\n", info->num_threads); >+ else if ((limit_size - maximum_size) / info->num_producers < THREAD_REGION) { >+ MSG("There isn't enough memory for %d threads.\n", info->num_producers + 1); >+ info->num_producers = (limit_size - maximum_size) / THREAD_REGION; >+ MSG("--num-threads is set to %d.\n", info->num_producers + 1); > >- limit_size = limit_size - THREAD_REGION * info->num_threads; >+ limit_size = limit_size - THREAD_REGION * info->num_producers; > } > } > >@@ -11103,7 +11103,7 @@ main(int argc, char *argv[]) > info->working_dir = optarg; > break; > case OPT_NUM_THREADS: >- info->num_threads = MAX(atoi(optarg), 0); >+ info->num_producers = MAX(atoi(optarg), 1) - 1; > break; > case '?': > MSG("Commandline parameter is invalid.\n"); >diff --git a/makedumpfile.h b/makedumpfile.h >index 533e5b8..6a0f51e 100644 >--- a/makedumpfile.h >+++ b/makedumpfile.h >@@ -1004,7 +1004,7 @@ struct page_data > }; > > struct thread_args { >- int thread_num; >+ int producer_num; > unsigned long len_buf_out; > struct cycle *cycle; > struct page_data *page_data_buf; >@@ -1294,7 +1294,7 @@ struct DumpInfo { > /* > * for parallel process > */ >- int num_threads; >+ int num_producers; > int num_buffers; > pthread_t **threads; > struct thread_args *kdump_thread_args; >-- >1.8.3.1 > >