[PATCH/RFC 3/3] handle renames using similarity engine

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This changes diffcore-rename to use the engine in
similarity.c rather than doing an O(m*n) loop around
diffcore_count_changes.

Signed-off-by: Jeff King <peff@xxxxxxxx>
---
 diffcore-rename.c |  215 +++++++++++++++++++----------------------------------
 1 files changed, 76 insertions(+), 139 deletions(-)

diff --git a/diffcore-rename.c b/diffcore-rename.c
index ba038af..6c0d2d0 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -5,12 +5,21 @@
 #include "diff.h"
 #include "diffcore.h"
 #include "hash.h"
+#include "similarity.h"
 
-/* Table of rename/copy destinations */
+/* Table of rename/copy src files */
+static struct diff_rename_src {
+	struct diff_filespec *one;
+	unsigned short score; /* to remember the break score */
+} *rename_src;
+static int rename_src_nr, rename_src_alloc;
 
+/* Table of rename/copy destinations */
 static struct diff_rename_dst {
 	struct diff_filespec *two;
 	struct diff_filepair *pair;
+	struct diff_rename_src *best_match;
+	unsigned score;
 } *rename_dst;
 static int rename_dst_nr, rename_dst_alloc;
 
@@ -49,16 +58,11 @@ static struct diff_rename_dst *locate_rename_dst(struct diff_filespec *two,
 	rename_dst[first].two = alloc_filespec(two->path);
 	fill_filespec(rename_dst[first].two, two->sha1, two->mode);
 	rename_dst[first].pair = NULL;
+	rename_dst[first].best_match = NULL;
+	rename_dst[first].score = 0;
 	return &(rename_dst[first]);
 }
 
-/* Table of rename/copy src files */
-static struct diff_rename_src {
-	struct diff_filespec *one;
-	unsigned short score; /* to remember the break score */
-} *rename_src;
-static int rename_src_nr, rename_src_alloc;
-
 static struct diff_rename_src *register_rename_src(struct diff_filespec *one,
 						   unsigned short score)
 {
@@ -109,88 +113,6 @@ static int basename_same(struct diff_filespec *src, struct diff_filespec *dst)
 		(!dst_len || dst->path[dst_len - 1] == '/');
 }
 
-struct diff_score {
-	int src; /* index in rename_src */
-	int dst; /* index in rename_dst */
-	int score;
-	int name_score;
-};
-
-static int estimate_similarity(struct diff_filespec *src,
-			       struct diff_filespec *dst,
-			       int minimum_score)
-{
-	/* src points at a file that existed in the original tree (or
-	 * optionally a file in the destination tree) and dst points
-	 * at a newly created file.  They may be quite similar, in which
-	 * case we want to say src is renamed to dst or src is copied into
-	 * dst, and then some edit has been applied to dst.
-	 *
-	 * Compare them and return how similar they are, representing
-	 * the score as an integer between 0 and MAX_SCORE.
-	 *
-	 * When there is an exact match, it is considered a better
-	 * match than anything else; the destination does not even
-	 * call into this function in that case.
-	 */
-	unsigned long max_size, delta_size, base_size, src_copied, literal_added;
-	unsigned long delta_limit;
-	int score;
-
-	/* We deal only with regular files.  Symlink renames are handled
-	 * only when they are exact matches --- in other words, no edits
-	 * after renaming.
-	 */
-	if (!S_ISREG(src->mode) || !S_ISREG(dst->mode))
-		return 0;
-
-	/*
-	 * Need to check that source and destination sizes are
-	 * filled in before comparing them.
-	 *
-	 * If we already have "cnt_data" filled in, we know it's
-	 * all good (avoid checking the size for zero, as that
-	 * is a possible size - we really should have a flag to
-	 * say whether the size is valid or not!)
-	 */
-	if (!src->cnt_data && diff_populate_filespec(src, 0))
-		return 0;
-	if (!dst->cnt_data && diff_populate_filespec(dst, 0))
-		return 0;
-
-	max_size = ((src->size > dst->size) ? src->size : dst->size);
-	base_size = ((src->size < dst->size) ? src->size : dst->size);
-	delta_size = max_size - base_size;
-
-	/* We would not consider edits that change the file size so
-	 * drastically.  delta_size must be smaller than
-	 * (MAX_SCORE-minimum_score)/MAX_SCORE * min(src->size, dst->size).
-	 *
-	 * Note that base_size == 0 case is handled here already
-	 * and the final score computation below would not have a
-	 * divide-by-zero issue.
-	 */
-	if (base_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE)
-		return 0;
-
-	delta_limit = (unsigned long)
-		(base_size * (MAX_SCORE-minimum_score) / MAX_SCORE);
-	if (diffcore_count_changes(src, dst,
-				   &src->cnt_data, &dst->cnt_data,
-				   delta_limit,
-				   &src_copied, &literal_added))
-		return 0;
-
-	/* How similar are they?
-	 * what percentage of material in dst are from source?
-	 */
-	if (!dst->size)
-		score = 0; /* should not happen */
-	else
-		score = (int)(src_copied * MAX_SCORE / max_size);
-	return score;
-}
-
 static void record_rename_pair(int dst_index, int src_index, int score)
 {
 	struct diff_filespec *src, *dst;
@@ -215,20 +137,6 @@ static void record_rename_pair(int dst_index, int src_index, int score)
 	rename_dst[dst_index].pair = dp;
 }
 
-/*
- * We sort the rename similarity matrix with the score, in descending
- * order (the most similar first).
- */
-static int score_compare(const void *a_, const void *b_)
-{
-	const struct diff_score *a = a_, *b = b_;
-
-	if (a->score == b->score)
-		return b->name_score - a->name_score;
-
-	return b->score - a->score;
-}
-
 struct file_similarity {
 	int src_dst, index;
 	struct diff_filespec *filespec;
@@ -376,6 +284,67 @@ static int find_exact_renames(void)
 	return i;
 }
 
+static void record_similarity(void *vsrc, void *vdst, unsigned score)
+{
+	struct diff_rename_src *src = vsrc;
+	struct diff_rename_dst *dst = vdst;
+	unsigned max_size = (src->one->size > dst->two->size) ?
+				src->one->size : dst->two->size;
+
+	score = (dst->two->size != 0) ? (score * MAX_SCORE / max_size) : 0;
+
+	/* Is there a match already that is better than we are? */
+	if (dst->best_match) {
+		if (score < dst->score)
+			return;
+		if (score == dst->score && !basename_same(src->one, dst->two))
+			return;
+	}
+
+	dst->best_match = src;
+	dst->score = score;
+}
+
+static void find_approximate_renames(int minimum_score)
+{
+	struct similarity sim;
+	int i;
+
+	similarity_init(&sim);
+
+	for (i = 0; i < rename_src_nr; i++) {
+		struct diff_rename_src *s = &rename_src[i];
+		diff_populate_filespec(s->one, 0);
+		similarity_add(&sim, SIMILARITY_SOURCE, s,
+				s->one->data, s->one->size,
+				diff_filespec_is_binary(s->one));
+		diff_free_filespec_data(s->one);
+	}
+
+	for (i = 0; i < rename_dst_nr; i++) {
+		struct diff_rename_dst *d = &rename_dst[i];
+		if (d->pair)
+			continue;
+		diff_populate_filespec(d->two, 0);
+		similarity_add(&sim, SIMILARITY_DEST, d,
+				d->two->data, d->two->size,
+				diff_filespec_is_binary(d->two));
+		diff_free_filespec_data(d->two);
+	}
+
+	similarity_score(&sim);
+	similarity_report(&sim, record_similarity);
+
+	for (i = 0 ; i < rename_dst_nr; i++) {
+		struct diff_rename_dst *d = &rename_dst[i];
+		if (d->pair)
+			continue;
+		if (d->score < minimum_score)
+			continue;
+		record_rename_pair(i, d->best_match - rename_src, d->score);
+	}
+}
+
 void diffcore_rename(struct diff_options *options)
 {
 	int detect_rename = options->detect_rename;
@@ -383,9 +352,8 @@ void diffcore_rename(struct diff_options *options)
 	int rename_limit = options->rename_limit;
 	struct diff_queue_struct *q = &diff_queued_diff;
 	struct diff_queue_struct outq;
-	struct diff_score *mx;
-	int i, j, rename_count;
-	int num_create, num_src, dst_cnt;
+	int num_create, num_src;
+	int i, rename_count;
 
 	if (!minimum_score)
 		minimum_score = DEFAULT_RENAME_SCORE;
@@ -462,38 +430,7 @@ void diffcore_rename(struct diff_options *options)
 	if (num_create * num_src > rename_limit * rename_limit)
 		goto cleanup;
 
-	mx = xmalloc(sizeof(*mx) * num_create * num_src);
-	for (dst_cnt = i = 0; i < rename_dst_nr; i++) {
-		int base = dst_cnt * num_src;
-		struct diff_filespec *two = rename_dst[i].two;
-		if (rename_dst[i].pair)
-			continue; /* dealt with exact match already. */
-		for (j = 0; j < rename_src_nr; j++) {
-			struct diff_filespec *one = rename_src[j].one;
-			struct diff_score *m = &mx[base+j];
-			m->src = j;
-			m->dst = i;
-			m->score = estimate_similarity(one, two,
-						       minimum_score);
-			m->name_score = basename_same(one, two);
-			diff_free_filespec_blob(one);
-		}
-		/* We do not need the text anymore */
-		diff_free_filespec_blob(two);
-		dst_cnt++;
-	}
-	/* cost matrix sorted by most to least similar pair */
-	qsort(mx, num_create * num_src, sizeof(*mx), score_compare);
-	for (i = 0; i < num_create * num_src; i++) {
-		struct diff_rename_dst *dst = &rename_dst[mx[i].dst];
-		if (dst->pair)
-			continue; /* already done, either exact or fuzzy. */
-		if (mx[i].score < minimum_score)
-			break; /* there is no more usable pair. */
-		record_rename_pair(mx[i].dst, mx[i].src, mx[i].score);
-		rename_count++;
-	}
-	free(mx);
+	find_approximate_renames(minimum_score);
 
  cleanup:
 	/* At this point, we have found some renames and copies and they
-- 
1.5.3.4
-
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]

  Powered by Linux