[PATCH v2 1/2] diffcore-rename: support rename cache

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This patch teaches diffcore_rename() to look into
$GIT_DIR/rename-cache and make use of it to recreate diff_filepair.
With proper cache, there should be no available entry for estimation
after exact matching.

Rename caching is per commit. I don't think abitrary tree-tree caching
is worth it.

$GIT_DIR/rename-cache spans out like $GIT_DIR/objects. Each file
corresponds to one commit. Its content consists of lines like this

<Destination SHA-1> <SPC> <Source SHA-1> <SPC> <Score in decimal> <NL>

This can be used to:

 - Make --find-copies-harder pratically usable for moderate-size
   repositories. The first "git show" on a linux kernel commit was 5.3
   sec, it then went down to 0.13 sec.
 - Give git-svn a chance to (locally) import explicit renames from
   Subversion
 - People may correct rename results for better diff, if automatic
   rename detection is not good enough.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@xxxxxxxxx>
---
 >  > > Has anybody thought about interaction between that caching and pathspec
 >  > >  limited operation?
 >  > >
 >  >
 >  > I didn't. But I think all out-of-pathspec diff pairs are removed
 >  > before it reaches diffcore_rename() so the cache has nothing to do
 >  > with it (except it still loads full cache for a commit).
 >
 >
 > Well, it could be that an out-of-pathspec pair would have a better
 >  score than an in-pathspec one.  Maybe cache recording should be turned
 >  off when doing pathspec limitation ?

 Changes from v1:
  - rebased to next to avoid conflict
  - no longer generate cache if pathspec is used

 diff.h                  |    2 +
 diffcore-rename.c       |  142 ++++++++++++++++++++++++++++++++++++++++++++++-
 log-tree.c              |    2 +
 t/t4031-rename-cache.sh |   56 ++++++++++++++++++
 4 files changed, 200 insertions(+), 2 deletions(-)
 create mode 100755 t/t4031-rename-cache.sh

diff --git a/diff.h b/diff.h
index 42582ed..64a1edd 100644
--- a/diff.h
+++ b/diff.h
@@ -111,6 +111,8 @@ struct diff_options {
 	add_remove_fn_t add_remove;
 	diff_format_fn_t format_callback;
 	void *format_callback_data;
+
+	struct commit *commit;
 };
 
 enum color_diff {
diff --git a/diffcore-rename.c b/diffcore-rename.c
index 168a95b..598cc8d 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -5,6 +5,7 @@
 #include "diff.h"
 #include "diffcore.h"
 #include "hash.h"
+#include "commit.h"
 
 /* Table of rename/copy destinations */
 
@@ -409,13 +410,130 @@ static void record_if_better(struct diff_score m[], struct diff_score *o)
 		m[worst] = *o;
 }
 
+struct cached_filepair {
+	unsigned char dst[20];
+	unsigned char src[20];
+	int score;
+};
+
+static int free_cached_filepair(void *p)
+{
+	free(p);
+	return 0;
+}
+
+static void load_rename_cache(struct diff_queue_struct *q,
+			      struct diff_queue_struct *cacheq,
+			      struct diff_options *options)
+{
+	char *sha1_hex;
+	FILE *fp;
+	struct hash_table filepair_table;
+	struct hash_table src_table;
+	struct cached_filepair *pp;
+	int i, hash;
+	static int no_cache_available = -1;
+	struct stat st;
+	char *path;
+
+	if (no_cache_available == -1)
+		no_cache_available = stat(git_path("rename-cache"), &st) || !S_ISDIR(st.st_mode);
+
+	/* return soon so we don't need to waste CPU */
+	if (no_cache_available > 0)
+		return;
+
+
+	/* src_table initialization */
+	init_hash(&src_table);
+	for (i = 0; i < q->nr; i++) {
+		struct diff_filepair *p = q->queue[i];
+		if (DIFF_FILE_VALID(p->one)) {
+			unsigned int hash = hash_filespec(p->one);
+			insert_hash(hash, p, &src_table);
+		}
+	}
+
+	/* filepair_table initialization */
+	init_hash(&filepair_table);
+	sha1_hex = sha1_to_hex(options->commit->object.sha1);
+	path = git_path("rename-cache/%c%c/%s",sha1_hex[0], sha1_hex[1], sha1_hex+2);
+	if (stat(path, &st))
+		fp = NULL;
+	else
+		fp = fopen(path, "r");
+	if (fp) {
+		char src_sha1_hex[41], dst_sha1_hex[41];
+		struct cached_filepair p;
+
+		src_sha1_hex[40] = dst_sha1_hex[40] = '\0';
+		while (fscanf(fp, "%40c %40c %d\n", dst_sha1_hex, src_sha1_hex, &p.score) == 3) {
+			if (get_sha1_hex(src_sha1_hex, p.src) ||
+			    get_sha1_hex(dst_sha1_hex, p.dst))
+				break;
+
+			pp = xmalloc(sizeof(*pp));
+			memcpy(pp, &p, sizeof(*pp));
+			memcpy(&hash, p.dst, sizeof(hash));
+			insert_hash(hash, pp, &filepair_table);
+		}
+		fclose(fp);
+	}
+
+	for (i = 0; i < q->nr; i++) {
+		struct diff_filepair *p = q->queue[i];
+		struct diff_filepair *dp, *src;
+
+		/* find remote_dst */
+		if (DIFF_FILE_VALID(p->one) ||
+		    !DIFF_FILE_VALID(p->two) ||
+		    (options->single_follow && strcmp(options->single_follow, p->two->path)))
+			continue;
+
+		memcpy(&hash, p->two->sha1, sizeof(hash));
+		pp = lookup_hash(hash, &filepair_table);
+		if (!pp || memcmp(p->two->sha1, pp->dst, 20))
+			continue;
+
+		/* create pair */
+		if (is_null_sha1(pp->src)) {
+			if (DIFF_FILE_VALID(p->one))
+				continue;
+			diff_q(cacheq, p);
+			q->queue[i] = NULL;
+			continue;
+		}
+
+		memcpy(&hash, pp->src, sizeof(hash));
+		src = lookup_hash(hash, &src_table);
+		if (!src || memcmp(pp->src, src->one->sha1, 20))
+			continue;
+
+		src->one->rename_used++;
+		src->one->count++;
+		p->two->count++;
+
+		dp = diff_queue(NULL, src->one, p->two);
+		dp->renamed_pair = 1;
+		dp->score = pp->score;
+
+		diff_q(cacheq, dp);
+		q->queue[i] = NULL;
+		diff_free_filepair(p);
+	}
+
+	for_each_hash(&filepair_table, free_cached_filepair);
+	free_hash(&src_table);
+	free_hash(&filepair_table);
+}
+
 void diffcore_rename(struct diff_options *options)
 {
 	int detect_rename = options->detect_rename;
 	int minimum_score = options->rename_score;
 	int rename_limit = options->rename_limit;
 	struct diff_queue_struct *q = &diff_queued_diff;
-	struct diff_queue_struct outq;
+	struct diff_queue_struct outq, cacheq;
 	struct diff_score *mx;
 	int i, j, rename_count;
 	int num_create, num_src, dst_cnt;
@@ -423,8 +541,19 @@ void diffcore_rename(struct diff_options *options)
 	if (!minimum_score)
 		minimum_score = DEFAULT_RENAME_SCORE;
 
+	cacheq.queue = NULL;
+	cacheq.nr = cacheq.alloc = 0;
+
+	if (detect_rename && options->commit)
+		load_rename_cache(q, &cacheq, options);
+
 	for (i = 0; i < q->nr; i++) {
 		struct diff_filepair *p = q->queue[i];
+
+		/* was consumed by rename cache */
+		if (!p)
+			continue;
+
 		if (!DIFF_FILE_VALID(p->one)) {
 			if (!DIFF_FILE_VALID(p->two))
 				continue; /* unmerged */
@@ -563,10 +692,17 @@ void diffcore_rename(struct diff_options *options)
 	 */
 	outq.queue = NULL;
 	outq.nr = outq.alloc = 0;
-	for (i = 0; i < q->nr; i++) {
+	for (i = j = 0; i < q->nr; i++) {
 		struct diff_filepair *p = q->queue[i];
 		struct diff_filepair *pair_to_free = NULL;
 
+		if (!p) {
+			if (j >= cacheq.nr)
+				die("Internal error: running out of cacheq.");
+			diff_q(&outq, cacheq.queue[j++]);
+			continue;
+		}
+
 		if (!DIFF_FILE_VALID(p->one) && DIFF_FILE_VALID(p->two)) {
 			/*
 			 * Creation
@@ -635,6 +771,8 @@ void diffcore_rename(struct diff_options *options)
 	diff_debug_queue("done copying original", &outq);
 
 	free(q->queue);
+	if (cacheq.queue)
+		free(cacheq.queue);
 	*q = outq;
 	diff_debug_queue("done collapsing", q);
 
diff --git a/log-tree.c b/log-tree.c
index 5444f08..040e095 100644
--- a/log-tree.c
+++ b/log-tree.c
@@ -522,6 +522,7 @@ int log_tree_commit(struct rev_info *opt, struct commit *commit)
 	log.commit = commit;
 	log.parent = NULL;
 	opt->loginfo = &log;
+	opt->diffopt.commit = commit;
 
 	shown = log_tree_diff(opt, commit, &log);
 	if (!shown && opt->loginfo && opt->always_show_header) {
@@ -531,5 +532,6 @@ int log_tree_commit(struct rev_info *opt, struct commit *commit)
 	}
 	opt->loginfo = NULL;
 	maybe_flush_or_die(stdout, "stdout");
+	opt->diffopt.commit = NULL;
 	return shown;
 }
diff --git a/t/t4031-rename-cache.sh b/t/t4031-rename-cache.sh
new file mode 100755
index 0000000..f7c53fd
--- /dev/null
+++ b/t/t4031-rename-cache.sh
@@ -0,0 +1,56 @@
+#!/bin/sh
+#
+# Copyright (c) 2008 Nguyen Thai Ngoc Duy
+#
+
+test_description='Test diff rename cache'
+. ./test-lib.sh
+
+cat >expected <<EOF
+ create mode 100644 sub/c
+ copy a => sub/d (100%)
+EOF
+test_expect_success 'setup' '
+	echo a > a
+	echo b > b
+	mkdir sub
+	echo c > sub/c
+	cp a sub/d
+	A_SHA1=$(git hash-object a)
+	B_SHA1=$(git hash-object b)
+	C_SHA1=$(git hash-object sub/c)
+	D_SHA1=$(git hash-object sub/d)
+	git add a b
+	test_tick
+	git commit -m first
+	git add sub
+	git commit -m second
+	git show --pretty=oneline --summary -C -M --find-copies-harder HEAD|sed 1d > result
+	test_cmp expected result
+'
+
+cat >expected <<EOF
+ copy a => sub/c (100%)
+ copy a => sub/d (100%)
+EOF
+test_expect_success 'load rename pair cache' '
+	P=.git/rename-cache/$(git rev-parse HEAD|sed "s,\(..\)\(.*\),\1/\2,")
+	mkdir -p $(dirname $P)
+	echo $C_SHA1 $A_SHA1 60000 >> $P
+	git show --pretty=oneline --summary -C -M --find-copies-harder HEAD|sed 1d > result
+	test_cmp expected result
+'
+
+cat >expected <<EOF
+ copy a => sub/c (100%)
+ create mode 100644 sub/d
+EOF
+test_expect_success 'load create pair cache' '
+	P=.git/rename-cache/$(git rev-parse HEAD|sed "s,\(..\)\(.*\),\1/\2,")
+	mkdir -p $(dirname $P)
+	echo $D_SHA1 0000000000000000000000000000000000000000 0 >> $P
+	git show --pretty=oneline --summary -C -M --find-copies-harder HEAD|sed 1d > result
+	test_cmp expected result
+'
+
+test_done
-- 
1.6.0.3.892.g83538

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]

  Powered by Linux