Re: [PATCH] Optimize rename detection for a huge diff

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Junio C Hamano <gitster@xxxxxxxxx> writes:

> With a bit of tweak, now I am getting these numbers to the
> rename detection that used to spend 800MB (the peak I observed
> was somewhere around 430MB).
>
> (after patch, in the kernel repository, master at 96b5a46)
> $ /usr/bin/time git-diff -M -l0 --name-status d19fbe8a7 master >/var/tmp/3
> 157.20user 1.03system 2:38.72elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
> 0inputs+0outputs (32major+237315minor)pagefaults 0swaps
>
> (before patch, same diff)
> $ /usr/bin/time git-diff -M -l0 --name-status d19fbe8a7 master >/var/tmp/4
> 174.00user 2.73system 3:09.55elapsed 93%CPU (0avgtext+0avgdata 0maxresident)k
> 0inputs+0outputs (6106major+459314minor)pagefaults 0swaps
>
> So it is not that much of an improvement, but it seems to help
> somewhat.
> ...
> We would need to see where the remaining 400MB is going and try
> to shrink it, but this would be an improvement so I'll soon be
> moving this to 'next'.

I noticed that massif reported diff_queue() very high in the
list, so came up with this patch on top of the previous ones.

162.75user 1.92system 2:45.22elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+215490minor)pagefaults 0swaps

Still not much of an improvement (10%, judging from the minor
fault counts?), and it risks crashing if there is an unconvered
free() that frees a diff_filepair that was obtained from the
bulk allocator.

---
 alloc.c               |   33 ++++++++++++++++++++
 builtin-diff-tree.c   |    2 +-
 builtin-rev-parse.c   |    2 +-
 builtin-send-pack.c   |    2 +-
 builtin-show-branch.c |    2 +-
 cache.h               |    4 ++
 commit.c              |   14 ++++----
 diff.c                |    4 +-
 diffcore-break.c      |   12 ++++---
 diffcore-rename.c     |   80 +++++++++++++++++++++++++++++++++++-------------
 diffcore.h            |    4 ++
 http-push.c           |    2 +-
 revision.c            |    6 ++--
 upload-pack.c         |    2 +-
 14 files changed, 124 insertions(+), 45 deletions(-)

diff --git a/alloc.c b/alloc.c
index 216c23a..08c2591 100644
--- a/alloc.c
+++ b/alloc.c
@@ -15,6 +15,8 @@
 #include "tree.h"
 #include "commit.h"
 #include "tag.h"
+#include "diff.h"
+#include "diffcore.h"
 
 #define BLOCKING 1024
 
@@ -51,6 +53,35 @@ DEFINE_ALLOCATOR(commit, struct commit)
 DEFINE_ALLOCATOR(tag, struct tag)
 DEFINE_ALLOCATOR(object, union any_object)
 
+#define DEFINE_FREEABLE_ALLOCATOR(name, type)			\
+DEFINE_ALLOCATOR(name, type)					\
+union freeable_##name {						\
+	union freeable_##name *next;				\
+	type body;						\
+};								\
+static union freeable_##name *name##_freepool;			\
+type *alloc_freeable_##name##_node(void)			\
+{								\
+	if (name##_freepool) {					\
+		union freeable_##name *one = name##_freepool;	\
+		name##_freepool = one->next;			\
+		memset(one, 0, sizeof(*one));			\
+		return &(one->body);				\
+	}							\
+	return alloc_##name##_node();				\
+}								\
+void free_##name##_node(type *it_)				\
+{								\
+	union freeable_##name *it = (union freeable_##name *)it_; \
+	if (it) {						\
+		it->next = name##_freepool;			\
+		name##_freepool = it;				\
+	}							\
+}
+
+DEFINE_FREEABLE_ALLOCATOR(commit_list, struct commit_list)
+DEFINE_FREEABLE_ALLOCATOR(diff_filepair, struct diff_filepair)
+
 #ifdef NO_C99_FORMAT
 #define SZ_FMT "%u"
 #else
@@ -73,4 +104,6 @@ void alloc_report(void)
 	REPORT(tree);
 	REPORT(commit);
 	REPORT(tag);
+	REPORT(commit_list);
 }
+
diff --git a/builtin-diff-tree.c b/builtin-diff-tree.c
index 832797f..04f93f5 100644
--- a/builtin-diff-tree.c
+++ b/builtin-diff-tree.c
@@ -36,7 +36,7 @@ static int diff_tree_stdin(char *line)
 		/* Free the real parent list */
 		for (parents = commit->parents; parents; ) {
 			struct commit_list *tmp = parents->next;
-			free(parents);
+			free_commit_list_node(parents);
 			parents = tmp;
 		}
 		commit->parents = NULL;
diff --git a/builtin-rev-parse.c b/builtin-rev-parse.c
index b9af1a5..580ec51 100644
--- a/builtin-rev-parse.c
+++ b/builtin-rev-parse.c
@@ -227,7 +227,7 @@ static int try_difference(const char *arg)
 				struct commit_list *n = exclude->next;
 				show_rev(REVERSED,
 					 exclude->item->object.sha1,NULL);
-				free(exclude);
+				free_commit_list_node(exclude);
 				exclude = n;
 			}
 		}
diff --git a/builtin-send-pack.c b/builtin-send-pack.c
index 8afb1d0..6a83258 100644
--- a/builtin-send-pack.c
+++ b/builtin-send-pack.c
@@ -82,7 +82,7 @@ static void unmark_and_free(struct commit_list *list, unsigned int mark)
 		struct commit_list *temp = list;
 		temp->item->object.flags &= ~mark;
 		list = temp->next;
-		free(temp);
+		free_commit_list_node(temp);
 	}
 }
 
diff --git a/builtin-show-branch.c b/builtin-show-branch.c
index 019abd3..0d7cf89 100644
--- a/builtin-show-branch.c
+++ b/builtin-show-branch.c
@@ -38,7 +38,7 @@ static struct commit *pop_one_commit(struct commit_list **list_p)
 	list = *list_p;
 	commit = list->item;
 	*list_p = list->next;
-	free(list);
+	free_commit_list_node(list);
 	return commit;
 }
 
diff --git a/cache.h b/cache.h
index 3867ba7..196a0d7 100644
--- a/cache.h
+++ b/cache.h
@@ -667,6 +667,10 @@ extern void *alloc_tree_node(void);
 extern void *alloc_commit_node(void);
 extern void *alloc_tag_node(void);
 extern void *alloc_object_node(void);
+
+extern struct commit_list *alloc_freeable_commit_list_node(void);
+extern void free_commit_list_node(struct commit_list *);
+
 extern void alloc_report(void);
 
 /* trace.c */
diff --git a/commit.c b/commit.c
index 8b8fb04..6696968 100644
--- a/commit.c
+++ b/commit.c
@@ -333,7 +333,7 @@ int parse_commit(struct commit *item)
 
 struct commit_list *commit_list_insert(struct commit *item, struct commit_list **list_p)
 {
-	struct commit_list *new_list = xmalloc(sizeof(struct commit_list));
+	struct commit_list *new_list = alloc_freeable_commit_list_node();
 	new_list->item = item;
 	new_list->next = *list_p;
 	*list_p = new_list;
@@ -345,11 +345,11 @@ void free_commit_list(struct commit_list *list)
 	while (list) {
 		struct commit_list *temp = list;
 		list = temp->next;
-		free(temp);
+		free_commit_list_node(temp);
 	}
 }
 
-struct commit_list * insert_by_date(struct commit *item, struct commit_list **list)
+struct commit_list *insert_by_date(struct commit *item, struct commit_list **list)
 {
 	struct commit_list **pp = list;
 	struct commit_list *p;
@@ -381,7 +381,7 @@ struct commit *pop_most_recent_commit(struct commit_list **list,
 	struct commit_list *old = *list;
 
 	*list = (*list)->next;
-	free(old);
+	free_commit_list_node(old);
 
 	while (parents) {
 		struct commit *commit = parents->item;
@@ -423,7 +423,7 @@ struct commit *pop_commit(struct commit_list **stack)
 
 	if (top) {
 		*stack = top->next;
-		free(top);
+		free_commit_list_node(top);
 	}
 	return item;
 }
@@ -568,7 +568,7 @@ static struct commit_list *merge_bases(struct commit *one, struct commit *two)
 
 		commit = list->item;
 		n = list->next;
-		free(list);
+		free_commit_list_node(list);
 		list = n;
 
 		flags = commit->object.flags & (PARENT1 | PARENT2 | STALE);
@@ -599,7 +599,7 @@ static struct commit_list *merge_bases(struct commit *one, struct commit *two)
 		struct commit_list *n = list->next;
 		if (!(list->item->object.flags & STALE))
 			insert_by_date(list->item, &result);
-		free(list);
+		free_commit_list_node(list);
 		list = n;
 	}
 	return result;
diff --git a/diff.c b/diff.c
index cd8bc4d..63ac8db 100644
--- a/diff.c
+++ b/diff.c
@@ -2433,7 +2433,7 @@ struct diff_filepair *diff_queue(struct diff_queue_struct *queue,
 				 struct diff_filespec *one,
 				 struct diff_filespec *two)
 {
-	struct diff_filepair *dp = xcalloc(1, sizeof(*dp));
+	struct diff_filepair *dp = alloc_freeable_diff_filepair_node();
 	dp->one = one;
 	dp->two = two;
 	if (queue)
@@ -2445,7 +2445,7 @@ void diff_free_filepair(struct diff_filepair *p)
 {
 	free_filespec(p->one);
 	free_filespec(p->two);
-	free(p);
+	free_diff_filepair_node(p);
 }
 
 /* This is different from find_unique_abbrev() in that
diff --git a/diffcore-break.c b/diffcore-break.c
index 31cdcfe..debd26d 100644
--- a/diffcore-break.c
+++ b/diffcore-break.c
@@ -205,9 +205,11 @@ void diffcore_break(int break_score)
 				dp->score = score;
 				dp->broken_pair = 1;
 
-				free(p); /* not diff_free_filepair(), we are
-					  * reusing one and two here.
-					  */
+				/*
+				 * not diff_free_filepair(), we are
+				 * reusing one and two here.
+				 */
+				free_diff_filepair_node(p);
 				continue;
 			}
 		}
@@ -243,8 +245,8 @@ static void merge_broken(struct diff_filepair *p,
 	dp->score = p->score;
 	diff_free_filespec_data(d->two);
 	diff_free_filespec_data(c->one);
-	free(d);
-	free(c);
+	free_diff_filepair_node(d);
+	free_diff_filepair_node(c);
 }
 
 void diffcore_merge_broken(void)
diff --git a/diffcore-rename.c b/diffcore-rename.c
index 3d37725..5974362 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -112,8 +112,8 @@ static int basename_same(struct diff_filespec *src, struct diff_filespec *dst)
 struct diff_score {
 	int src; /* index in rename_src */
 	int dst; /* index in rename_dst */
-	int score;
-	int name_score;
+	unsigned short score;
+	short name_score;
 };
 
 static int estimate_similarity(struct diff_filespec *src,
@@ -223,6 +223,12 @@ static int score_compare(const void *a_, const void *b_)
 {
 	const struct diff_score *a = a_, *b = b_;
 
+	/* sink the unused ones to the bottom */
+	if (a->dst < 0)
+		return (0 <= b->dst);
+	else if (b->dst < 0)
+		return -1;
+
 	if (a->score == b->score)
 		return b->name_score - a->name_score;
 
@@ -387,6 +393,22 @@ static int find_exact_renames(void)
 	return i;
 }
 
+#define NUM_CANDIDATE_PER_DST 4
+static void record_if_better(struct diff_score m[], struct diff_score *o)
+{
+	int i, worst;
+
+	/* find the worst one */
+	worst = 0;
+	for (i = 1; i < NUM_CANDIDATE_PER_DST; i++)
+		if (score_compare(&m[i], &m[worst]) > 0)
+			worst = i;
+
+	/* is it better than the worst one? */
+	if (score_compare(&m[worst], o) > 0)
+		m[worst] = *o;
+}
+
 void diffcore_rename(struct diff_options *options)
 {
 	int detect_rename = options->detect_rename;
@@ -473,47 +495,61 @@ void diffcore_rename(struct diff_options *options)
 	if (num_create * num_src > rename_limit * rename_limit)
 		goto cleanup;
 
-	mx = xmalloc(sizeof(*mx) * num_create * num_src);
+	mx = xcalloc(num_create * NUM_CANDIDATE_PER_DST, sizeof(*mx));
 	for (dst_cnt = i = 0; i < rename_dst_nr; i++) {
-		int base = dst_cnt * num_src;
 		struct diff_filespec *two = rename_dst[i].two;
+		struct diff_score *m;
+
 		if (rename_dst[i].pair)
 			continue; /* dealt with exact match already. */
+
+		m = &mx[dst_cnt * NUM_CANDIDATE_PER_DST];
+		for (j = 0; j < NUM_CANDIDATE_PER_DST; j++)
+			m[j].dst = -1;
+
 		for (j = 0; j < rename_src_nr; j++) {
 			struct diff_filespec *one = rename_src[j].one;
-			struct diff_score *m = &mx[base+j];
-			m->src = j;
-			m->dst = i;
-			m->score = estimate_similarity(one, two,
-						       minimum_score);
-			m->name_score = basename_same(one, two);
+			struct diff_score this_src;
+			this_src.score = estimate_similarity(one, two,
+							     minimum_score);
+			this_src.name_score = basename_same(one, two);
+			this_src.dst = i;
+			this_src.src = j;
+			record_if_better(m, &this_src);
 			diff_free_filespec_blob(one);
 		}
 		/* We do not need the text anymore */
 		diff_free_filespec_blob(two);
 		dst_cnt++;
 	}
+
 	/* cost matrix sorted by most to least similar pair */
-	qsort(mx, num_create * num_src, sizeof(*mx), score_compare);
-	for (i = 0; i < num_create * num_src; i++) {
-		struct diff_rename_dst *dst = &rename_dst[mx[i].dst];
-		struct diff_filespec *src;
+	qsort(mx, dst_cnt * NUM_CANDIDATE_PER_DST, sizeof(*mx), score_compare);
+
+	for (i = 0; i < dst_cnt * NUM_CANDIDATE_PER_DST; i++) {
+		struct diff_rename_dst *dst;
+
+		if ((mx[i].dst < 0) ||
+		    (mx[i].score < minimum_score))
+			break; /* there is no more usable pair. */
+		dst = &rename_dst[mx[i].dst];
 		if (dst->pair)
 			continue; /* already done, either exact or fuzzy. */
-		if (mx[i].score < minimum_score)
-			break; /* there is no more usable pair. */
-		src = rename_src[mx[i].src].one;
-		if (src->rename_used)
+		if (rename_src[mx[i].src].one->rename_used)
 			continue;
 		record_rename_pair(mx[i].dst, mx[i].src, mx[i].score);
 		rename_count++;
 	}
-	for (i = 0; i < num_create * num_src; i++) {
-		struct diff_rename_dst *dst = &rename_dst[mx[i].dst];
+
+	for (i = 0; i < dst_cnt * NUM_CANDIDATE_PER_DST; i++) {
+		struct diff_rename_dst *dst;
+
+		if ((mx[i].dst < 0) ||
+		    (mx[i].score < minimum_score))
+			break; /* there is no more usable pair. */
+		dst = &rename_dst[mx[i].dst];
 		if (dst->pair)
 			continue; /* already done, either exact or fuzzy. */
-		if (mx[i].score < minimum_score)
-			break; /* there is no more usable pair. */
 		record_rename_pair(mx[i].dst, mx[i].src, mx[i].score);
 		rename_count++;
 	}
diff --git a/diffcore.h b/diffcore.h
index cc96c20..00aef4c 100644
--- a/diffcore.h
+++ b/diffcore.h
@@ -118,4 +118,8 @@ extern int diffcore_count_changes(struct diff_filespec *src,
 				  unsigned long *src_copied,
 				  unsigned long *literal_added);
 
+/* alloc.c */
+extern struct diff_filepair *alloc_freeable_diff_filepair_node(void);
+extern void free_diff_filepair_node(struct diff_filepair *);
+
 #endif
diff --git a/http-push.c b/http-push.c
index b2b410d..314141f 100644
--- a/http-push.c
+++ b/http-push.c
@@ -1817,7 +1817,7 @@ static void unmark_and_free(struct commit_list *list, unsigned int mark)
 		struct commit_list *temp = list;
 		temp->item->object.flags &= ~mark;
 		list = temp->next;
-		free(temp);
+		free_commit_list_node(temp);
 	}
 }
 
diff --git a/revision.c b/revision.c
index 6e85aaa..df9b062 100644
--- a/revision.c
+++ b/revision.c
@@ -571,7 +571,7 @@ static int limit_list(struct rev_info *revs)
 		show_early_output_fn_t show;
 
 		list = list->next;
-		free(entry);
+		free_commit_list_node(entry);
 
 		if (revs->max_age != -1 && (commit->date < revs->max_age))
 			obj->flags |= UNINTERESTING;
@@ -752,7 +752,7 @@ static void prepare_show_merge(struct rev_info *revs)
 	while (bases) {
 		struct commit *it = bases->item;
 		struct commit_list *n = bases->next;
-		free(bases);
+		free_commit_list_node(bases);
 		bases = n;
 		it->object.flags |= UNINTERESTING;
 		add_pending_object(revs, &it->object, "(merge-base)");
@@ -1472,7 +1472,7 @@ static struct commit *get_revision_1(struct rev_info *revs)
 		struct commit *commit = entry->item;
 
 		revs->commits = entry->next;
-		free(entry);
+		free_commit_list_node(entry);
 
 		if (revs->reflog_info)
 			fake_reflog_parent(revs->reflog_info, commit);
diff --git a/upload-pack.c b/upload-pack.c
index 51e3ec4..86a4e7e 100644
--- a/upload-pack.c
+++ b/upload-pack.c
@@ -331,7 +331,7 @@ static int reachable(struct commit *want)
 	while (work) {
 		struct commit_list *list = work->next;
 		struct commit *commit = work->item;
-		free(work);
+		free_commit_list_node(work);
 		work = list;
 
 		if (commit->object.flags & THEY_HAVE) {
-
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]

  Powered by Linux