thread_data[] is frequently accessed by all threads and it's packed tight. If two threads run on two different processors, updating one "struct thread_local" may invalidate the cache line that also contains struct thread_local of the other thread, leading to cache misses (aka false sharing [1]). Pad it properly. Make sure each "struct thread_local" is on a separate cache line. This only works with real processors though. Multithreading on a single processor may be harmed by this change. Keep the tight layout in that case. [1] http://software.intel.com/en-us/articles/avoiding-and-identifying-false-sharing-among-threads/ Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@xxxxxxxxx> --- My laptop has one physical core with hyperthread, so this patch does no good for me. I won't have access to a multicore system until the middle of next week, so I'm not sure if it actually helps at the moment. builtin/index-pack.c | 37 +++++++++++++++++++++++++++++++++---- 1 files changed, 33 insertions(+), 4 deletions(-) diff --git a/builtin/index-pack.c b/builtin/index-pack.c index c1c3c81..598b18e 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -93,7 +93,8 @@ static int input_fd, output_fd, pack_fd; #ifndef NO_PTHREADS -static struct thread_local *thread_data; +static struct thread_local **thread_data; +static char *thread_data_buffer; static pthread_mutex_t read_mutex; #define read_lock() lock_mutex(&read_mutex) @@ -126,11 +127,38 @@ static inline void unlock_mutex(pthread_mutex_t *mutex) */ static void init_thread(void) { + int i, stripe; + init_recursive_mutex(&read_mutex); pthread_mutex_init(&counter_mutex, NULL); pthread_mutex_init(&work_mutex, NULL); pthread_key_create(&key, NULL); thread_data = xcalloc(nr_threads, sizeof(*thread_data)); + + + if (online_cpus() && !getenv("GIT_ONE_CPU_CORE")) { + /* + * A typical cache line is 64 byte long. 
Avoid false + * sharing by putting thread data of each thread on a + * separate cache line + */ + stripe = 64; + assert(sizeof(struct thread_local) <= stripe); + thread_data_buffer = xmalloc((nr_threads + 1) * stripe); +#define ALIGN(x, stripe) (((x) + (stripe) - 1) & ~((stripe) - 1)) + thread_data[0] = (struct thread_local *)ALIGN((unsigned int)thread_data_buffer, stripe); + } else { + /* + * On the other hand, on single physical cpu, padding + * wastes cache for nothing. Don't do it. + */ + stripe = sizeof(struct thread_local); + thread_data_buffer = xmalloc(nr_threads * stripe); + thread_data[0] = (struct thread_local *)thread_data_buffer; + } + memset(thread_data[0], 0, stripe * nr_threads); + for (i = 1; i < nr_threads; i++) + thread_data[i] = (struct thread_local*)((char*)(thread_data[0]) + i * stripe); threads_active = 1; } @@ -143,6 +171,7 @@ static void cleanup_thread(void) pthread_mutex_destroy(&counter_mutex); pthread_mutex_destroy(&work_mutex); pthread_key_delete(key); + free(thread_data_buffer); free(thread_data); } @@ -890,13 +919,13 @@ static void parse_pack_objects(unsigned char *sha1) if (nr_threads > 1 || getenv("GIT_FORCE_THREADS")) { init_thread(); for (i = 0; i < nr_threads; i++) { - int ret = pthread_create(&thread_data[i].thread, NULL, - threaded_second_pass, thread_data + i); + int ret = pthread_create(&thread_data[i]->thread, NULL, + threaded_second_pass, thread_data[i]); if (ret) die("unable to create thread: %s", strerror(ret)); } for (i = 0; i < nr_threads; i++) - pthread_join(thread_data[i].thread, NULL); + pthread_join(thread_data[i]->thread, NULL); cleanup_thread(); return; -- 1.7.8.36.g69ee2 -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html