This patch teaches diffcore_rename() to look into $GIT_DIR/rename-cache and make use of it to recreate diff_filepair. With proper cache, there should be no available entry for estimation after exact matching. Rename caching is per commit. I don't think abitrary tree-tree caching is worth it. $GIT_DIR/rename-cache spans out like $GIT_DIR/objects. Each file corresponds to one commit. Its content consists of lines like this <Destination SHA-1> <SPC> <Source SHA-1> <SPC> <Score in decimal> <NL> This can be used to: - Make --find-copies-harder pratically usable for moderate-size repositories. The first "git show" on a linux kernel commit was 5.3 sec, it then went down to 0.13 sec. - Give git-svn a chance to (locally) import explicit renames from Subversion - People may correct rename results for better diff, if automatic rename detection is not good enough. Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@xxxxxxxxx> --- diff.h | 2 + diffcore-rename.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++- log-tree.c | 2 + t/t4030-rename-cache.sh | 55 ++++++++++++++++++ 4 files changed, 199 insertions(+), 2 deletions(-) create mode 100755 t/t4030-rename-cache.sh diff --git a/diff.h b/diff.h index a49d865..8b68f6f 100644 --- a/diff.h +++ b/diff.h @@ -110,6 +110,8 @@ struct diff_options { add_remove_fn_t add_remove; diff_format_fn_t format_callback; void *format_callback_data; + + struct commit *commit; }; enum color_diff { diff --git a/diffcore-rename.c b/diffcore-rename.c index 168a95b..598cc8d 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -5,6 +5,7 @@ #include "diff.h" #include "diffcore.h" #include "hash.h" +#include "commit.h" /* Table of rename/copy destinations */ @@ -409,13 +410,130 @@ static void record_if_better(struct diff_score m[], struct diff_score *o) m[worst] = *o; } +struct cached_filepair { + unsigned char dst[20]; + unsigned char src[20]; + int score; +}; + +static int free_cached_filepair(void *p) +{ + free(p); + return 0; +} + +static void load_rename_cache(struct diff_queue_struct *q, + struct diff_queue_struct *cacheq, + struct diff_options *options) +{ + char *sha1_hex; + FILE *fp; + struct hash_table filepair_table; + struct hash_table src_table; + struct cached_filepair *pp; + int i, hash; + static int no_cache_available = -1; + struct stat st; + char *path; + + if (no_cache_available == -1) + no_cache_available = stat(git_path("rename-cache"), &st) || !S_ISDIR(st.st_mode); + + /* return soon so we don't need to waste CPU */ + if (no_cache_available > 0) + return; + + + /* src_table initialization */ + init_hash(&src_table); + for (i = 0; i < q->nr; i++) { + struct diff_filepair *p = q->queue[i]; + if (DIFF_FILE_VALID(p->one)) { + unsigned int hash = hash_filespec(p->one); + insert_hash(hash, p, &src_table); + } + } + + /* filepair_table initialization */ + init_hash(&filepair_table); + sha1_hex = sha1_to_hex(options->commit->object.sha1); + path = git_path("rename-cache/%c%c/%s",sha1_hex[0], sha1_hex[1], sha1_hex+2); + if (stat(path, &st)) + fp = NULL; + else + fp = fopen(path, "r"); + if (fp) { + char src_sha1_hex[41], dst_sha1_hex[41]; + struct cached_filepair p; + + src_sha1_hex[40] = dst_sha1_hex[40] = '\0'; + while (fscanf(fp, "%40c %40c %d\n", dst_sha1_hex, src_sha1_hex, &p.score) == 3) { + if (get_sha1_hex(src_sha1_hex, p.src) || + get_sha1_hex(dst_sha1_hex, p.dst)) + break; + + pp = xmalloc(sizeof(*pp)); + memcpy(pp, &p, sizeof(*pp)); + memcpy(&hash, p.dst, sizeof(hash)); + insert_hash(hash, pp, &filepair_table); + } + fclose(fp); + } + + for (i = 0; i < q->nr; i++) { + struct diff_filepair *p = q->queue[i]; + struct diff_filepair *dp, *src; + + /* find remote_dst */ + if (DIFF_FILE_VALID(p->one) || + !DIFF_FILE_VALID(p->two) || + (options->single_follow && strcmp(options->single_follow, p->two->path))) + continue; + + memcpy(&hash, p->two->sha1, sizeof(hash)); + pp = lookup_hash(hash, &filepair_table); + if (!pp || memcmp(p->two->sha1, pp->dst, 20)) + continue; + + /* create pair */ + if (is_null_sha1(pp->src)) { + if (DIFF_FILE_VALID(p->one)) + continue; + diff_q(cacheq, p); + q->queue[i] = NULL; + continue; + } + + memcpy(&hash, pp->src, sizeof(hash)); + src = lookup_hash(hash, &src_table); + if (!src || memcmp(pp->src, src->one->sha1, 20)) + continue; + + src->one->rename_used++; + src->one->count++; + p->two->count++; + + dp = diff_queue(NULL, src->one, p->two); + dp->renamed_pair = 1; + dp->score = pp->score; + + diff_q(cacheq, dp); + q->queue[i] = NULL; + diff_free_filepair(p); + } + + for_each_hash(&filepair_table, free_cached_filepair); + free_hash(&src_table); + free_hash(&filepair_table); +} + void diffcore_rename(struct diff_options *options) { int detect_rename = options->detect_rename; int minimum_score = options->rename_score; int rename_limit = options->rename_limit; struct diff_queue_struct *q = &diff_queued_diff; - struct diff_queue_struct outq; + struct diff_queue_struct outq, cacheq; struct diff_score *mx; int i, j, rename_count; int num_create, num_src, dst_cnt; @@ -423,8 +541,19 @@ void diffcore_rename(struct diff_options *options) if (!minimum_score) minimum_score = DEFAULT_RENAME_SCORE; + cacheq.queue = NULL; + cacheq.nr = cacheq.alloc = 0; + + if (detect_rename && options->commit) + load_rename_cache(q, &cacheq, options); + for (i = 0; i < q->nr; i++) { struct diff_filepair *p = q->queue[i]; + + /* was consumed by rename cache */ + if (!p) + continue; + if (!DIFF_FILE_VALID(p->one)) { if (!DIFF_FILE_VALID(p->two)) continue; /* unmerged */ @@ -563,10 +692,17 @@ void diffcore_rename(struct diff_options *options) */ outq.queue = NULL; outq.nr = outq.alloc = 0; - for (i = 0; i < q->nr; i++) { + for (i = j = 0; i < q->nr; i++) { struct diff_filepair *p = q->queue[i]; struct diff_filepair *pair_to_free = NULL; + if (!p) { + if (j >= cacheq.nr) + die("Internal error: running out of cacheq."); + diff_q(&outq, cacheq.queue[j++]); + continue; + } + if (!DIFF_FILE_VALID(p->one) && DIFF_FILE_VALID(p->two)) { /* * Creation @@ -635,6 +771,8 @@ void diffcore_rename(struct diff_options *options) diff_debug_queue("done copying original", &outq); free(q->queue); + if (cacheq.queue) + free(cacheq.queue); *q = outq; diff_debug_queue("done collapsing", q); diff --git a/log-tree.c b/log-tree.c index cec3c06..a67ef6d 100644 --- a/log-tree.c +++ b/log-tree.c @@ -518,6 +518,7 @@ int log_tree_commit(struct rev_info *opt, struct commit *commit) log.commit = commit; log.parent = NULL; opt->loginfo = &log; + opt->diffopt.commit = commit; shown = log_tree_diff(opt, commit, &log); if (!shown && opt->loginfo && opt->always_show_header) { @@ -527,5 +528,6 @@ int log_tree_commit(struct rev_info *opt, struct commit *commit) } opt->loginfo = NULL; maybe_flush_or_die(stdout, "stdout"); + opt->diffopt.commit = NULL; return shown; } diff --git a/t/t4030-rename-cache.sh b/t/t4030-rename-cache.sh new file mode 100755 index 0000000..0d8390c --- /dev/null +++ b/t/t4030-rename-cache.sh @@ -0,0 +1,55 @@ +#!/bin/sh +# +# Copyright (c) 2008 Nguyen Thai Ngoc Duy +# + +test_description='Test diff rename cache' +. ./test-lib.sh + +cat >expected <<EOF + create mode 100644 c + copy a => d (100%) +EOF +test_expect_success 'setup' ' + for i in a b c + do + echo $i > $i + done + cp a d + A_SHA1=$(git hash-object a) + B_SHA1=$(git hash-object b) + C_SHA1=$(git hash-object c) + D_SHA1=$(git hash-object d) + git add a b + git commit -m first + git add c d + git commit -m second + git show --pretty=oneline --summary -C -M --find-copies-harder HEAD|sed 1d > result + test_cmp expected result +' + +cat >expected <<EOF + copy a => c (100%) + copy a => d (100%) +EOF +test_expect_success 'load rename pair cache' ' + P=.git/rename-cache/$(git rev-parse HEAD|sed "s,\(..\)\(.*\),\1/\2,") && + mkdir -p $(dirname $P) + echo $C_SHA1 $A_SHA1 60000 >> $P + git show --pretty=oneline --summary -C -M --find-copies-harder HEAD|sed 1d > result + test_cmp expected result +' + +cat >expected <<EOF + copy a => c (100%) + create mode 100644 d +EOF +test_expect_success 'load create pair cache' ' + P=.git/rename-cache/$(git rev-parse HEAD|sed "s,\(..\)\(.*\),\1/\2,") && + mkdir -p $(dirname $P) + echo $D_SHA1 0000000000000000000000000000000000000000 0 >> $P + git show --pretty=oneline --summary -C -M --find-copies-harder HEAD|sed 1d > result + test_cmp expected result +' + +test_done -- 1.6.0.3.802.g47c38 -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html