[RFC PATCH] Fix quadratic performance in rewrite_one.

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Parent commits are usually older than their children. Thus,
on each iteration of the loop in rewrite_one, add_parents_to_list
traverses all commits previously processed by the loop.
It performs very poorly in case of very long rewrite chains.

Signed-off-by: Alexander Gavrilov <angavrilov@xxxxxxxxx>
---
	
	While experimenting with a large Git repository (~10 years of history), I noticed that
	gitk (and git-log) is very slow when asked to show history for a recently created directory:

	$ time git log --no-color --pretty=oneline --parents --boundary -- A_5_Year_Old_Dir | wc
	1620    8042  166759

	real    0m29.969s
	user    0m28.419s
	sys     0m0.304s

	$ time git log --no-color --pretty=oneline --parents --boundary -- A_3_Year_Old_Dir | wc
	66     315    6599
	
	real    1m41.523s
	user    1m38.585s
	sys     0m0.549s

	$ time git log --no-color --pretty=oneline --parents --boundary -- A_1_Year_Old_Dir | wc
	18     108    1898
	
	real    3m33.023s
	user    3m28.817s
	sys     0m0.859s
	
	Profiling showed that most of the time is spent manipulating commit lists:
	
	%   cumulative   self              self     total           
	time   seconds   seconds    calls   s/call   s/call  name    
	98.91    206.21   206.21    76909     0.00     0.00  insert_by_date
	0.36    206.97     0.76   409268     0.00     0.00  find_pack_entry_one
	0.17    207.33     0.36   258208     0.00     0.00  unpack_object_header_gently
	
	After some research and debugging I narrowed it down to this loop in rewrite_one (revision.c):
	
		for (;;) {
			struct commit *p = *pp;
			...
				if (add_parents_to_list(revs, p, &revs->commits) < 0)
			...
			*pp = p->parents->item;
		}
	
	When git-log is called for a recently created directory, all history before its creation ends up
	being processed in this loop. Since add_parents_to_list traverses the list linearly to find the
	point of insertion, which in this case is always at the end of the list, it actually works
	in quadratical time.
	
	I made a fix for it by caching a pointer to the farthest added element. The performance gain
	is obvious, but as it's my first ever change in Git source code, I believe that there is a better
	solution, or, at least, implementation.
	
	$ time git log --no-color --pretty=oneline --parents --boundary -- A_1_Year_Old_Dir | wc
	18     108    1898
	
	real    0m4.101s
	user    0m3.897s
	sys     0m0.123s
	
	Alexander.


 revision.c |   34 ++++++++++++++++++++++++++++------
 1 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/revision.c b/revision.c
index fc66755..5bf8039 100644
--- a/revision.c
+++ b/revision.c
@@ -412,11 +412,31 @@ static void try_to_simplify_commit(struct rev_info *revs, struct commit *commit)
 	commit->object.flags |= TREESAME;
 }
 
-static int add_parents_to_list(struct rev_info *revs, struct commit *commit, struct commit_list **list)
+static void insert_by_date_cached(struct commit *p, struct commit_list **head,
+		    struct commit_list *cached_base, struct commit_list **cache)
+{
+	struct commit_list *new_entry;
+    
+	if (cached_base && p->date < cached_base->item->date) {
+		new_entry = insert_by_date(p, &cached_base->next);
+	}
+	else {
+		new_entry = insert_by_date(p, head);
+	}
+	
+	if (cache && (!*cache || p->date < (*cache)->item->date)) {
+		*cache = new_entry;
+	}
+}
+
+static int add_parents_to_list(struct rev_info *revs, struct commit *commit,
+		    struct commit_list **list, struct commit_list **cache_ptr)
 {
 	struct commit_list *parent = commit->parents;
 	unsigned left_flag;
 
+	struct commit_list *cached_base = cache_ptr ? *cache_ptr : NULL;
+
 	if (commit->object.flags & ADDED)
 		return 0;
 	commit->object.flags |= ADDED;
@@ -445,7 +465,7 @@ static int add_parents_to_list(struct rev_info *revs, struct commit *commit, str
 			if (p->object.flags & SEEN)
 				continue;
 			p->object.flags |= SEEN;
-			insert_by_date(p, list);
+			insert_by_date_cached(p, list, cached_base, cache_ptr);
 		}
 		return 0;
 	}
@@ -470,7 +490,7 @@ static int add_parents_to_list(struct rev_info *revs, struct commit *commit, str
 		p->object.flags |= left_flag;
 		if (!(p->object.flags & SEEN)) {
 			p->object.flags |= SEEN;
-			insert_by_date(p, list);
+			insert_by_date_cached(p, list, cached_base, cache_ptr);
 		}
 		if(revs->first_parent_only)
 			break;
@@ -611,7 +631,7 @@ static int limit_list(struct rev_info *revs)
 
 		if (revs->max_age != -1 && (commit->date < revs->max_age))
 			obj->flags |= UNINTERESTING;
-		if (add_parents_to_list(revs, commit, &list) < 0)
+		if (add_parents_to_list(revs, commit, &list, NULL) < 0)
 			return -1;
 		if (obj->flags & UNINTERESTING) {
 			mark_parents_uninteresting(commit);
@@ -1458,10 +1478,12 @@ enum rewrite_result {
 
 static enum rewrite_result rewrite_one(struct rev_info *revs, struct commit **pp)
 {
+	struct commit_list *cache = NULL;
+
 	for (;;) {
 		struct commit *p = *pp;
 		if (!revs->limited)
-			if (add_parents_to_list(revs, p, &revs->commits) < 0)
+			if (add_parents_to_list(revs, p, &revs->commits, &cache) < 0)
 				return rewrite_one_error;
 		if (p->parents && p->parents->next)
 			return rewrite_one_ok;
@@ -1580,7 +1602,7 @@ static struct commit *get_revision_1(struct rev_info *revs)
 			if (revs->max_age != -1 &&
 			    (commit->date < revs->max_age))
 				continue;
-			if (add_parents_to_list(revs, commit, &revs->commits) < 0)
+			if (add_parents_to_list(revs, commit, &revs->commits, NULL) < 0)
 				return NULL;
 		}
 
-- 
1.5.6.2.24.ge09c4
--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]

  Powered by Linux