thread_data[] is frequently accessed by all threads and it's packed tight. If two threads run on two different processors, updating one "struct thread_local" may invalidate the cache line that also contains struct thread_local of the other thread, leading to cache misses (aka false sharing [1]). Pad it properly. Make sure each "struct thread_local" is on a separate cache line. This only works with real processors though. Multithreading on a single processor may be harmed by this change. Keep the tight layout in that case. [1] http://software.intel.com/en-us/articles/avoiding-and-identifying-false-sharing-among-threads/ Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@xxxxxxxxx> --- My laptop has one physical core with hyperthread, so this patch does no good for me. I won't have access to a multicore system until the middle of next week, so I'm not sure if it actually helps at the moment. builtin/index-pack.c | 37 +++++++++++++++++++++++++++++++++---- 1 files changed, 33 insertions(+), 4 deletions(-) diff --git a/builtin/index-pack.c b/builtin/index-pack.c index c1c3c81..598b18e 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -93,7 +93,8 @@ static int input_fd, output_fd, pack_fd; #ifndef NO_PTHREADS -static struct thread_local *thread_data; +static struct thread_local **thread_data; +static char *thread_data_buffer; static pthread_mutex_t read_mutex; #define read_lock() lock_mutex(&read_mutex) @@ -126,11 +127,38 @@ static inline void unlock_mutex(pthread_mutex_t *mutex) */ static void init_thread(void) { + int i, stripe; + init_recursive_mutex(&read_mutex); pthread_mutex_init(&counter_mutex, NULL); pthread_mutex_init(&work_mutex, NULL); pthread_key_create(&key, NULL); thread_data = xcalloc(nr_threads, sizeof(*thread_data)); + + + if (online_cpus() && !getenv("GIT_ONE_CPU_CORE")) { + /* + * A typical cache line is 64 byte long. 
Avoid false + * sharing by putting thread data of each thread on a + * separate cache line + */ + stripe = 64; + assert(sizeof(struct thread_local) <= stripe); + thread_data_buffer = xmalloc((nr_threads + 1) * stripe); +#define ALIGN(x, stripe) (((x) + (stripe) - 1) & ~((stripe) - 1)) + thread_data[0] = (struct thread_local *)ALIGN((unsigned int)thread_data_buffer, stripe); + } else { + /* + * On the other hand, on single physical cpu, padding + * wastes cache for nothing. Don't do it. + */ + stripe = sizeof(struct thread_local); + thread_data_buffer = xmalloc(nr_threads * stripe); + thread_data[0] = (struct thread_local *)thread_data_buffer; + } + memset(thread_data[0], 0, stripe * nr_threads); + for (i = 1; i < nr_threads; i++) + thread_data[i] = (struct thread_local*)((char*)(thread_data[0]) + i * stripe); threads_active = 1; } @@ -143,6 +171,7 @@ static void cleanup_thread(void) pthread_mutex_destroy(&counter_mutex); pthread_mutex_destroy(&work_mutex); pthread_key_delete(key); + free(thread_data_buffer); free(thread_data); } @@ -890,13 +919,13 @@ static void parse_pack_objects(unsigned char *sha1) if (nr_threads > 1 || getenv("GIT_FORCE_THREADS")) { init_thread(); for (i = 0; i < nr_threads; i++) { - int ret = pthread_create(&thread_data[i].thread, NULL, - threaded_second_pass, thread_data + i); + int ret = pthread_create(&thread_data[i]->thread, NULL, + threaded_second_pass, thread_data[i]); if (ret) die("unable to create thread: %s", strerror(ret)); } for (i = 0; i < nr_threads; i++) - pthread_join(thread_data[i].thread, NULL); + pthread_join(thread_data[i]->thread, NULL); cleanup_thread(); return; -- 1.7.8.36.g69ee2 -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html