On 08/04/2016 08:50 AM, Atsushi Kumagai wrote: > Hello Zhou, > >> Currently, num_threads means the producer number. The main thread >> is not included. However, num_threads is specified by the option >> "--num-threads". So this may make user confused why it still has >> performance degradation when the value by "--num-threads" equals >> the usable cpu number. >> >> I think it will be more nature to make "--num-threads" be producer >> number + 1. In other words, "--num-threads" should include the >> main thread. > > The most part of cpu time will be spent for compression while > the main thread(Consumer) just does I/O work. > If the usable cpu number is 2 and --num-thread is also 2, your > patch reserves a whole cpu for the main thread and the compression > will be done by just one cpu. Is that really the multi thread > processing which we wanted ? > > If there are 4 usable cpus, I guess 4 producers with the main thread > is faster than 3 producers with the main thread. > > However, according to my quick test, there were no significant > differences between the two cases. > (This was measured on a small VM(5GB) without your patch, the number > of trials is 3, so just for information.) > > # makedumpfile -c --num-thread [3|4] > > | | time[s] > CPUs | -num-threads | 1st | 2nd | 3rd > -------+---------------+----------+----------+---------- > 4 | 3 | 29.54 30.19 28.26 > 4 | 4 | 29.80 31.75 28.19 > > Then, I noticed that the cpu usage of the main thread is almost 100% > while it doesn't need a lot of cpu resources in theory. Do you have > any idea why the main thread require a lot of cpu resources ? Yes, it shouldn't take so much cpu time. It really need to be improved. I'll think about it. -- Thanks Zhou > If it's an unintended behavior, I believe the case of --num-thread=4 > can be faster if the cpu usage of the main thread is reduced. > > > Thanks, > Atsushi Kumagai > >> --- >> makedumpfile.c | 108 ++++++++++++++++++++++++++++----------------------------- >> makedumpfile.h | 4 +-- >> 2 files changed, 56 insertions(+), 56 deletions(-) >> >> diff --git a/makedumpfile.c b/makedumpfile.c >> index 21784e8..19019a4 100644 >> --- a/makedumpfile.c >> +++ b/makedumpfile.c >> @@ -1407,13 +1407,13 @@ open_dump_bitmap(void) >> } >> } >> >> - if (info->num_threads) { >> + if (info->num_producers) { >> /* >> * Reserve file descriptors of bitmap for creating dumpfiles >> * parallelly, because a bitmap file will be unlinked just after >> * this and it is not possible to open a bitmap file later. >> */ >> - for (i = 0; i < info->num_threads; i++) { >> + for (i = 0; i < info->num_producers; i++) { >> if ((fd = open(info->name_bitmap, O_RDONLY)) < 0) { >> ERRMSG("Can't open the bitmap file(%s). %s\n", >> info->name_bitmap, strerror(errno)); >> @@ -3495,9 +3495,9 @@ initialize_bitmap_memory(void) >> } >> >> void >> -initialize_bitmap_memory_parallel(struct dump_bitmap *bitmap, int thread_num) >> +initialize_bitmap_memory_parallel(struct dump_bitmap *bitmap, int producer_num) >> { >> - bitmap->fd = FD_BITMAP_MEMORY_PARALLEL(thread_num); >> + bitmap->fd = FD_BITMAP_MEMORY_PARALLEL(producer_num); >> bitmap->file_name = info->name_memory; >> bitmap->no_block = -1; >> memset(bitmap->buf, 0, BUFSIZE_BITMAP); >> @@ -3529,24 +3529,24 @@ initial_for_parallel() >> /* >> * allocate memory for threads >> */ >> - if ((info->threads = malloc(sizeof(pthread_t *) * info->num_threads)) >> + if ((info->threads = malloc(sizeof(pthread_t *) * info->num_producers)) >> == NULL) { >> MSG("Can't allocate memory for threads. %s\n", >> strerror(errno)); >> return FALSE; >> } >> - memset(info->threads, 0, sizeof(pthread_t *) * info->num_threads); >> + memset(info->threads, 0, sizeof(pthread_t *) * info->num_producers); >> >> if ((info->kdump_thread_args = >> - malloc(sizeof(struct thread_args) * info->num_threads)) >> + malloc(sizeof(struct thread_args) * info->num_producers)) >> == NULL) { >> MSG("Can't allocate memory for arguments of threads. %s\n", >> strerror(errno)); >> return FALSE; >> } >> - memset(info->kdump_thread_args, 0, sizeof(struct thread_args) * info->num_threads); >> + memset(info->kdump_thread_args, 0, sizeof(struct thread_args) * info->num_producers); >> >> - for (i = 0; i < info->num_threads; i++) { >> + for (i = 0; i < info->num_producers; i++) { >> if ((info->threads[i] = malloc(sizeof(pthread_t))) == NULL) { >> MSG("Can't allocate memory for thread %d. %s", >> i, strerror(errno)); >> @@ -3592,7 +3592,7 @@ initial_for_parallel() >> #endif >> } >> >> - info->num_buffers = PAGE_DATA_NUM * info->num_threads; >> + info->num_buffers = PAGE_DATA_NUM * info->num_producers; >> >> /* >> * allocate memory for page_data >> @@ -3615,17 +3615,17 @@ initial_for_parallel() >> } >> >> /* >> - * initial page_flag for each thread >> + * initial page_flag for each producer thread >> */ >> - if ((info->page_flag_buf = malloc(sizeof(void *) * info->num_threads)) >> + if ((info->page_flag_buf = malloc(sizeof(void *) * info->num_producers)) >> == NULL) { >> MSG("Can't allocate memory for page_flag_buf. %s\n", >> strerror(errno)); >> return FALSE; >> } >> - memset(info->page_flag_buf, 0, sizeof(void *) * info->num_threads); >> + memset(info->page_flag_buf, 0, sizeof(void *) * info->num_producers); >> >> - for (i = 0; i < info->num_threads; i++) { >> + for (i = 0; i < info->num_producers; i++) { >> if ((info->page_flag_buf[i] = calloc(1, sizeof(struct page_flag))) == NULL) { >> MSG("Can't allocate memory for page_flag. %s\n", >> strerror(errno)); >> @@ -3645,9 +3645,9 @@ initial_for_parallel() >> } >> >> /* >> - * initial fd_memory for threads >> + * initial fd_memory for producer threads >> */ >> - for (i = 0; i < info->num_threads; i++) { >> + for (i = 0; i < info->num_producers; i++) { >> if ((FD_MEMORY_PARALLEL(i) = open(info->name_memory, O_RDONLY)) >> < 0) { >> ERRMSG("Can't open the dump memory(%s). %s\n", >> @@ -3673,7 +3673,7 @@ free_for_parallel() >> struct page_flag *current; >> >> if (info->threads != NULL) { >> - for (i = 0; i < info->num_threads; i++) { >> + for (i = 0; i < info->num_producers; i++) { >> if (info->threads[i] != NULL) >> free(info->threads[i]); >> >> @@ -3714,7 +3714,7 @@ free_for_parallel() >> } >> >> if (info->page_flag_buf != NULL) { >> - for (i = 0; i < info->num_threads; i++) { >> + for (i = 0; i < info->num_producers; i++) { >> for (j = 0; j < PAGE_FLAG_NUM; j++) { >> if (info->page_flag_buf[i] != NULL) { >> current = info->page_flag_buf[i]; >> @@ -3729,7 +3729,7 @@ free_for_parallel() >> if (info->parallel_info == NULL) >> return; >> >> - for (i = 0; i < info->num_threads; i++) { >> + for (i = 0; i < info->num_producers; i++) { >> if (FD_MEMORY_PARALLEL(i) > 0) >> close(FD_MEMORY_PARALLEL(i)); >> >> @@ -3936,7 +3936,7 @@ out: >> DEBUG_MSG("Buffer size for the cyclic mode: %ld\n", info->bufsize_cyclic); >> } >> >> - if (info->num_threads) { >> + if (info->num_producers) { >> if (is_xen_memory()) { >> MSG("'--num-threads' option is disable,\n"); >> MSG("because %s is Xen's memory core image.\n", >> @@ -4066,9 +4066,9 @@ initialize_2nd_bitmap(struct dump_bitmap *bitmap) >> } >> >> void >> -initialize_2nd_bitmap_parallel(struct dump_bitmap *bitmap, int thread_num) >> +initialize_2nd_bitmap_parallel(struct dump_bitmap *bitmap, int producer_num) >> { >> - bitmap->fd = FD_BITMAP_PARALLEL(thread_num); >> + bitmap->fd = FD_BITMAP_PARALLEL(producer_num); >> bitmap->file_name = info->name_bitmap; >> bitmap->no_block = -1; >> memset(bitmap->buf, 0, BUFSIZE_BITMAP); >> @@ -7558,7 +7558,7 @@ kdump_thread_function_cyclic(void *arg) { >> volatile struct page_flag *page_flag_buf = kdump_thread_args->page_flag_buf; >> struct cycle *cycle = kdump_thread_args->cycle; >> mdf_pfn_t pfn = cycle->start_pfn; >> - int index = kdump_thread_args->thread_num; >> + int index = kdump_thread_args->producer_num; >> int buf_ready; >> int dumpable; >> int fd_memory = 0; >> @@ -7566,21 +7566,21 @@ kdump_thread_function_cyclic(void *arg) { >> struct dump_bitmap bitmap_memory_parallel = {0}; >> unsigned char *buf = NULL, *buf_out = NULL; >> struct mmap_cache *mmap_cache = >> - MMAP_CACHE_PARALLEL(kdump_thread_args->thread_num); >> + MMAP_CACHE_PARALLEL(kdump_thread_args->producer_num); >> unsigned long size_out; >> - z_stream *stream = &ZLIB_STREAM_PARALLEL(kdump_thread_args->thread_num); >> + z_stream *stream = &ZLIB_STREAM_PARALLEL(kdump_thread_args->producer_num); >> #ifdef USELZO >> - lzo_bytep wrkmem = WRKMEM_PARALLEL(kdump_thread_args->thread_num); >> + lzo_bytep wrkmem = WRKMEM_PARALLEL(kdump_thread_args->producer_num); >> #endif >> #ifdef USESNAPPY >> unsigned long len_buf_out_snappy = >> snappy_max_compressed_length(info->page_size); >> #endif >> >> - buf = BUF_PARALLEL(kdump_thread_args->thread_num); >> - buf_out = BUF_OUT_PARALLEL(kdump_thread_args->thread_num); >> + buf = BUF_PARALLEL(kdump_thread_args->producer_num); >> + buf_out = BUF_OUT_PARALLEL(kdump_thread_args->producer_num); >> >> - fd_memory = FD_MEMORY_PARALLEL(kdump_thread_args->thread_num); >> + fd_memory = FD_MEMORY_PARALLEL(kdump_thread_args->producer_num); >> >> if (info->fd_bitmap) { >> bitmap_parallel.buf = malloc(BUFSIZE_BITMAP); >> @@ -7590,7 +7590,7 @@ kdump_thread_function_cyclic(void *arg) { >> goto fail; >> } >> initialize_2nd_bitmap_parallel(&bitmap_parallel, >> - kdump_thread_args->thread_num); >> + kdump_thread_args->producer_num); >> } >> >> if (info->flag_refiltering) { >> @@ -7601,7 +7601,7 @@ kdump_thread_function_cyclic(void *arg) { >> goto fail; >> } >> initialize_bitmap_memory_parallel(&bitmap_memory_parallel, >> - kdump_thread_args->thread_num); >> + kdump_thread_args->producer_num); >> } >> >> /* >> @@ -7802,8 +7802,8 @@ write_kdump_pages_parallel_cyclic(struct cache_data *cd_header, >> for (i = 0; i < page_buf_num; i++) >> page_data_buf[i].used = FALSE; >> >> - for (i = 0; i < info->num_threads; i++) { >> - kdump_thread_args[i].thread_num = i; >> + for (i = 0; i < info->num_producers; i++) { >> + kdump_thread_args[i].producer_num = i; >> kdump_thread_args[i].len_buf_out = len_buf_out; >> kdump_thread_args[i].page_data_buf = page_data_buf; >> kdump_thread_args[i].page_flag_buf = info->page_flag_buf[i]; >> @@ -7843,13 +7843,13 @@ write_kdump_pages_parallel_cyclic(struct cache_data *cd_header, >> * consuming is used for recording in which thread the pfn is the smallest. >> * current_pfn is used for recording the value of pfn when checking the pfn. >> */ >> - for (i = 0; i < info->num_threads; i++) { >> + for (i = 0; i < info->num_producers; i++) { >> if (info->page_flag_buf[i]->ready == FLAG_UNUSED) >> continue; >> temp_pfn = info->page_flag_buf[i]->pfn; >> >> /* >> - * count how many threads have reached the end. >> + * count how many producer threads have reached the end. >> */ >> if (temp_pfn >= end_pfn) { >> info->page_flag_buf[i]->ready = FLAG_UNUSED; >> @@ -7865,9 +7865,9 @@ write_kdump_pages_parallel_cyclic(struct cache_data *cd_header, >> } >> >> /* >> - * If all the threads have reached the end, we will finish writing. >> + * If all producer threads have reached the end, we will finish writing. >> */ >> - if (end_count >= info->num_threads) >> + if (end_count >= info->num_producers) >> goto finish; >> >> /* >> @@ -7930,7 +7930,7 @@ finish: >> >> out: >> if (threads != NULL) { >> - for (i = 0; i < info->num_threads; i++) { >> + for (i = 0; i < info->num_producers; i++) { >> if (threads[i] != NULL) { >> res = pthread_cancel(*threads[i]); >> if (res != 0 && res != ESRCH) >> @@ -7939,7 +7939,7 @@ out: >> } >> } >> >> - for (i = 0; i < info->num_threads; i++) { >> + for (i = 0; i < info->num_producers; i++) { >> if (threads[i] != NULL) { >> res = pthread_join(*threads[i], &thread_result); >> if (res != 0) >> @@ -8553,7 +8553,7 @@ write_kdump_pages_and_bitmap_cyclic(struct cache_data *cd_header, struct cache_d >> if (!write_kdump_bitmap2(&cycle)) >> return FALSE; >> >> - if (info->num_threads) { >> + if (info->num_producers) { >> if (!write_kdump_pages_parallel_cyclic(cd_header, >> cd_page, &pd_zero, >> &offset_data, &cycle)) >> @@ -10531,7 +10531,7 @@ check_param_for_creating_dumpfile(int argc, char *argv[]) >> if (info->flag_sadump_diskset && !sadump_is_supported_arch()) >> return FALSE; >> >> - if (info->num_threads) { >> + if (info->num_producers) { >> if (info->flag_split) { >> MSG("--num-threads cannot used with --split.\n"); >> return FALSE; >> @@ -10607,16 +10607,16 @@ check_param_for_creating_dumpfile(int argc, char *argv[]) >> } else >> return FALSE; >> >> - if (info->num_threads) { >> + if (info->num_producers) { >> if ((info->parallel_info = >> - malloc(sizeof(parallel_info_t) * info->num_threads)) >> + malloc(sizeof(parallel_info_t) * info->num_producers)) >> == NULL) { >> MSG("Can't allocate memory for parallel_info.\n"); >> return FALSE; >> } >> >> memset(info->parallel_info, 0, sizeof(parallel_info_t) >> - * info->num_threads); >> + * info->num_producers); >> } >> >> return TRUE; >> @@ -10721,20 +10721,20 @@ calculate_cyclic_buffer_size(void) { >> limit_size = get_free_memory_size() * 0.6; >> >> /* >> - * Recalculate the limit_size according to num_threads. >> - * And reset num_threads if there is not enough memory. >> + * Recalculate the limit_size according to num_producers. >> + * And reset --num-threads if there is not enough memory. >> */ >> - if (info->num_threads > 0) { >> + if (info->num_producers > 0) { >> if (limit_size <= maximum_size) { >> MSG("There isn't enough memory for multi-threads.\n"); >> - info->num_threads = 0; >> + info->num_producers = 0; >> } >> - else if ((limit_size - maximum_size) / info->num_threads < THREAD_REGION) { >> - MSG("There isn't enough memory for %d threads.\n", info->num_threads); >> - info->num_threads = (limit_size - maximum_size) / THREAD_REGION; >> - MSG("--num_threads is set to %d.\n", info->num_threads); >> + else if ((limit_size - maximum_size) / info->num_producers < THREAD_REGION) { >> + MSG("There isn't enough memory for %d threads.\n", info->num_producers + 1); >> + info->num_producers = (limit_size - maximum_size) / THREAD_REGION; >> + MSG("--num-threads is set to %d.\n", info->num_producers + 1); >> >> - limit_size = limit_size - THREAD_REGION * info->num_threads; >> + limit_size = limit_size - THREAD_REGION * info->num_producers; >> } >> } >> >> @@ -11103,7 +11103,7 @@ main(int argc, char *argv[]) >> info->working_dir = optarg; >> break; >> case OPT_NUM_THREADS: >> - info->num_threads = MAX(atoi(optarg), 0); >> + info->num_producers = MAX(atoi(optarg), 1) - 1; >> break; >> case '?': >> MSG("Commandline parameter is invalid.\n"); >> diff --git a/makedumpfile.h b/makedumpfile.h >> index 533e5b8..6a0f51e 100644 >> --- a/makedumpfile.h >> +++ b/makedumpfile.h >> @@ -1004,7 +1004,7 @@ struct page_data >> }; >> >> struct thread_args { >> - int thread_num; >> + int producer_num; >> unsigned long len_buf_out; >> struct cycle *cycle; >> struct page_data *page_data_buf; >> @@ -1294,7 +1294,7 @@ struct DumpInfo { >> /* >> * for parallel process >> */ >> - int num_threads; >> + int num_producers; >> int num_buffers; >> pthread_t **threads; >> struct thread_args *kdump_thread_args; >> -- >> 1.8.3.1 >> >>