[PATCH] Introduce patch factorization in diffcore.

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



---

 diff-lib.c        |    6 +-
 diff.c            |    5 +
 diff.h            |    3 +
 diffcore-rename.c |  194 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 diffcore.h        |    1 
 tree-diff.c       |    4 +
 6 files changed, 202 insertions(+), 11 deletions(-)

diff --git a/diff-lib.c b/diff-lib.c
index ae96c64..dcc4c2c 100644
--- a/diff-lib.c
+++ b/diff-lib.c
@@ -179,7 +179,8 @@ int run_diff_files(struct rev_info *revs, unsigned int option)
 		changed = ce_match_stat(ce, &st, ce_option);
 		if (!changed) {
 			ce_mark_uptodate(ce);
-			if (!DIFF_OPT_TST(&revs->diffopt, FIND_COPIES_HARDER))
+			if (!DIFF_OPT_TST(&revs->diffopt, FIND_COPIES_HARDER) &&
+			    !DIFF_OPT_TST(&revs->diffopt, FACTORIZE_RENAMES))
 				continue;
 		}
 		oldmode = ce->ce_mode;
@@ -310,7 +311,8 @@ static int show_modified(struct oneway_unpack_data *cbdata,
 
 	oldmode = old->ce_mode;
 	if (mode == oldmode && !hashcmp(sha1, old->sha1) &&
-	    !DIFF_OPT_TST(&revs->diffopt, FIND_COPIES_HARDER))
+	    !DIFF_OPT_TST(&revs->diffopt, FIND_COPIES_HARDER) &&
+	    !DIFF_OPT_TST(&revs->diffopt, FACTORIZE_RENAMES))
 		return 0;
 
 	diff_change(&revs->diffopt, oldmode, mode,
diff --git a/diff.c b/diff.c
index 9053d4d..2315980 100644
--- a/diff.c
+++ b/diff.c
@@ -2599,6 +2599,11 @@ int diff_opt_parse(struct diff_options *options, const char **av, int ac)
 		DIFF_OPT_SET(options, REVERSE_DIFF);
 	else if (!strcmp(arg, "--find-copies-harder"))
 		DIFF_OPT_SET(options, FIND_COPIES_HARDER);
+	else if (!strcmp(arg, "--factorize-renames")) {
+		DIFF_OPT_SET(options, FACTORIZE_RENAMES);
+		if (!options->detect_rename)
+			options->detect_rename = DIFF_DETECT_RENAME;
+	}
 	else if (!strcmp(arg, "--follow"))
 		DIFF_OPT_SET(options, FOLLOW_RENAMES);
 	else if (!strcmp(arg, "--color"))
diff --git a/diff.h b/diff.h
index 1ca4f46..7b862c6 100644
--- a/diff.h
+++ b/diff.h
@@ -64,6 +64,7 @@ typedef void (*diff_format_fn_t)(struct diff_queue_struct *q,
 #define DIFF_OPT_RELATIVE_NAME       (1 << 17)
 #define DIFF_OPT_IGNORE_SUBMODULES   (1 << 18)
 #define DIFF_OPT_DIRSTAT_CUMULATIVE  (1 << 19)
+#define DIFF_OPT_FACTORIZE_RENAMES   (1 << 20)
 #define DIFF_OPT_TST(opts, flag)    ((opts)->flags & DIFF_OPT_##flag)
 #define DIFF_OPT_SET(opts, flag)    ((opts)->flags |= DIFF_OPT_##flag)
 #define DIFF_OPT_CLR(opts, flag)    ((opts)->flags &= ~DIFF_OPT_##flag)
@@ -219,6 +220,8 @@ extern void diffcore_std(struct diff_options *);
 "  -C            detect copies.\n" \
 "  --find-copies-harder\n" \
 "                try unchanged files as candidate for copy detection.\n" \
+"  --factorize_renames\n" \
+"                factorize renames of all files of a directory.\n" \
 "  -l<n>         limit rename attempts up to <n> paths.\n" \
 "  -O<file>      reorder diffs according to the <file>.\n" \
 "  -S<string>    find filepair whose only one side contains the string.\n" \
diff --git a/diffcore-rename.c b/diffcore-rename.c
index 1b2ebb4..2757204 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -6,6 +6,8 @@
 #include "diffcore.h"
 #include "hash.h"
 
+#include <libgen.h>
+
 /* Table of rename/copy destinations */
 
 static struct diff_rename_dst {
@@ -52,6 +54,32 @@ static struct diff_rename_dst *locate_rename_dst(struct diff_filespec *two,
 	return &(rename_dst[first]);
 }
 
+static struct diff_rename_dst *locate_rename_dst_dir(struct diff_filespec *dir)
+{
+	/* code mostly duplicated from locate_rename_dst - not sure we
+	 * could merge them efficiently,though
+	 */
+	int first, last;
+	int dirlength = strlen(dir->path);
+
+	first = 0;
+	last = rename_dst_nr;
+	while (last > first) {
+		int next = (last + first) >> 1;
+		struct diff_rename_dst *dst = &(rename_dst[next]);
+		int cmp = strncmp(dir->path, dst->two->path, dirlength);
+		if (!cmp)
+			return dst;
+		if (cmp < 0) {
+			last = next;
+			continue;
+		}
+		first = next+1;
+	}
+	/* not found */
+	return NULL;
+}
+
 /* Table of rename/copy src files */
 static struct diff_rename_src {
 	struct diff_filespec *one;
@@ -409,6 +437,130 @@ static void record_if_better(struct diff_score m[], struct diff_score *o)
 		m[worst] = *o;
 }
 
+struct diff_dir_rename {
+	struct diff_filespec *one;
+	struct diff_filespec *two;
+	int discarded;
+	struct diff_dir_rename* next;
+};
+
+/*
+ * Marks as such file_rename if it is made uninteresting by dir_rename.
+ * Returns -1 if the file_rename is outside of the range in which given
+ * renames concerned by dir_rename are to be found (ie. end of loop),
+ * 0 otherwise.
+ */
+static int maybe_mark_uninteresting(struct diff_rename_dst* file_rename,
+				    struct diff_dir_rename* dir_rename,
+				    int one_len, int two_len) {
+	if (strncmp(file_rename->two->path,
+		    dir_rename->two->path, two_len))
+		return -1;
+	if (strncmp(file_rename->pair->one->path,
+		    dir_rename->one->path, one_len) ||
+	    !basename_same(file_rename->pair->one, file_rename->two) ||
+	    file_rename->pair->score != MAX_SCORE)
+		return 0;
+
+	file_rename->pair->uninteresting_rename = 1;
+	printf ("[DBG] %s* -> %s* makes %s -> %s uninteresting\n",
+		dir_rename->one->path, dir_rename->two->path,
+		file_rename->pair->one->path, file_rename->two->path);
+	return 0;
+}
+
+/*
+ * FIXME: we could optimize the 100%-rename case by preventing
+ * recursion to unfold what we know we would refold here.
+ * FIXME: do we want to replace linked list with sorted array ?
+ * FIXME: this prototype only handles renaming of dirs without
+ * a subdir.
+ * FIXME: leaks like hell.
+ * FIXME: ideas to evaluate a similarity score, anyone ?
+ *  10% * tree similarity + 90% * moved files similarity ?
+ */
+static struct diff_dir_rename* factorization_candidates = NULL;
+static void diffcore_factorize_renames(void)
+{
+	char buffer[PATH_MAX], one_parent_path[PATH_MAX], two_parent_path[PATH_MAX];
+	int i;
+
+	for (i = 0; i < rename_dst_nr; i++) {
+		if (!rename_dst[i].pair)
+			continue;
+		strcpy(buffer, rename_dst[i].pair->one->path);
+		strcpy(one_parent_path, dirname(buffer));
+		strcat(one_parent_path, "/");
+
+		struct diff_filespec* one_parent = alloc_filespec(one_parent_path);
+		fill_filespec(one_parent, null_sha1 /*FIXME*/, S_IFDIR);
+
+		if (!locate_rename_dst_dir(one_parent)) {
+			// one_parent_path is empty in result tree
+
+			// already considered ?
+			struct diff_dir_rename* seen;
+			for (seen=factorization_candidates; seen; seen = seen->next)
+				if (!strcmp(seen->one->path, one_parent_path)) break;
+			if (!seen) {
+				// record potential dir rename
+				strcpy(buffer, rename_dst[i].pair->two->path);
+				strcpy(two_parent_path, dirname(buffer));
+				strcat(two_parent_path, "/");
+
+				seen = xmalloc(sizeof(*seen));
+				seen->one = one_parent;
+				seen->two = alloc_filespec(two_parent_path);
+				fill_filespec(seen->two, null_sha1 /*FIXME*/, S_IFDIR);
+				seen->discarded = 0;
+				seen->next = factorization_candidates;
+				factorization_candidates = seen;
+				printf("[DBG] possible rename from %s to %s\n",
+				       one_parent_path, two_parent_path);
+				continue;
+			}
+			if (seen->discarded)
+				continue;
+			// check that seen entry matches this rename
+			strcpy(buffer, rename_dst[i].pair->two->path);
+			strcpy(two_parent_path, dirname(buffer));
+			strcat(two_parent_path, "/");
+			if (strcmp(two_parent_path, seen->two->path)) {
+				printf("[DBG] discarding dir rename of %s, mixing %s and %s\n",
+				       one_parent_path, two_parent_path, seen->two->path);
+				seen->discarded = 1;
+			}
+
+			/* all checks ok, we keep that entry */
+		}
+	}
+
+	// turn candidates into real renames
+	struct diff_dir_rename* candidate;
+	for (candidate=factorization_candidates; candidate; candidate = candidate->next) {
+		int two_idx, i, one_len, two_len;
+		if (candidate->discarded)
+			continue;
+
+		// any entry within candidate dst dir
+		two_idx = locate_rename_dst_dir(candidate->two) - rename_dst;
+
+		// now remove extraneous 100% files inside.
+		one_len = strlen(candidate->one->path);
+		two_len = strlen(candidate->two->path);
+		for (i = two_idx; i < rename_dst_nr; i++)
+			if (maybe_mark_uninteresting (rename_dst+i, candidate,
+						      one_len, two_len) < 0)
+				break;
+		for (i = two_idx-1; i >= 0; i--)
+			if (maybe_mark_uninteresting (rename_dst+i, candidate,
+						      one_len, two_len) < 0)
+				break;
+	}
+
+	return;
+}
+
 void diffcore_rename(struct diff_options *options)
 {
 	int detect_rename = options->detect_rename;
@@ -446,13 +598,22 @@ void diffcore_rename(struct diff_options *options)
 				p->one->rename_used++;
 			register_rename_src(p->one, p->score);
 		}
-		else if (detect_rename == DIFF_DETECT_COPY) {
-			/*
-			 * Increment the "rename_used" score by
-			 * one, to indicate ourselves as a user.
-			 */
-			p->one->rename_used++;
-			register_rename_src(p->one, p->score);
+		else {
+			if (detect_rename == DIFF_DETECT_COPY) {
+				/*
+				 * Increment the "rename_used" score by
+				 * one, to indicate ourselves as a user.
+				 */
+				p->one->rename_used++;
+				register_rename_src(p->one, p->score);
+			}
+			if (DIFF_OPT_TST(options, FACTORIZE_RENAMES)) {
+				/* similarly, rename factorization needs to
+				 * see all files from second tree
+				 */
+				//p->two->rename_used++; // FIXME: would we need that ?
+				locate_rename_dst(p->two, 1);
+			}
 		}
 	}
 	if (rename_dst_nr == 0 || rename_src_nr == 0)
@@ -561,8 +722,24 @@ void diffcore_rename(struct diff_options *options)
 	/* At this point, we have found some renames and copies and they
 	 * are recorded in rename_dst.  The original list is still in *q.
 	 */
+
+	/* Now possibly factorize those renames and copies. */
+	if (DIFF_OPT_TST(options, FACTORIZE_RENAMES))
+		diffcore_factorize_renames();
+
 	outq.queue = NULL;
 	outq.nr = outq.alloc = 0;
+
+	// first, turn factorization_candidates into real renames
+	struct diff_dir_rename* candidate;
+	for (candidate=factorization_candidates; candidate; candidate = candidate->next) {
+		struct diff_filepair* pair;
+		if (candidate->discarded) continue;
+		pair = diff_queue(&outq, candidate->one, candidate->two);
+		pair->score = MAX_SCORE;
+		pair->renamed_pair = 1;
+	}
+
 	for (i = 0; i < q->nr; i++) {
 		struct diff_filepair *p = q->queue[i];
 		struct diff_filepair *pair_to_free = NULL;
@@ -577,7 +754,8 @@ void diffcore_rename(struct diff_options *options)
 			struct diff_rename_dst *dst =
 				locate_rename_dst(p->two, 0);
 			if (dst && dst->pair) {
-				diff_q(&outq, dst->pair);
+				if (!dst->pair->uninteresting_rename)
+					diff_q(&outq, dst->pair);
 				pair_to_free = p;
 			}
 			else
diff --git a/diffcore.h b/diffcore.h
index 8ae3578..e4aa099 100644
--- a/diffcore.h
+++ b/diffcore.h
@@ -62,6 +62,7 @@ struct diff_filepair {
 	unsigned broken_pair : 1;
 	unsigned renamed_pair : 1;
 	unsigned is_unmerged : 1;
+	unsigned uninteresting_rename : 1;
 };
 #define DIFF_PAIR_UNMERGED(p) ((p)->is_unmerged)
 
diff --git a/tree-diff.c b/tree-diff.c
index 9f67af6..872f757 100644
--- a/tree-diff.c
+++ b/tree-diff.c
@@ -49,7 +49,9 @@ static int compare_tree_entry(struct tree_desc *t1, struct tree_desc *t2, const
 		show_entry(opt, "+", t2, base, baselen);
 		return 1;
 	}
-	if (!DIFF_OPT_TST(opt, FIND_COPIES_HARDER) && !hashcmp(sha1, sha2) && mode1 == mode2)
+	if (!DIFF_OPT_TST(opt, FIND_COPIES_HARDER) &&
+	    !DIFF_OPT_TST(opt, FACTORIZE_RENAMES) &&
+	    !hashcmp(sha1, sha2) && mode1 == mode2)
 		return 0;
 
 	/*

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]

  Powered by Linux