This pair of patches aims at increasing performance of copy detection in blame by avoiding unnecessary comparisons. Note that since I'm new to this code, I might have misunderstood something. There are two cases than I aim to fix: 1) Copy detection is done by comparing all outstanding chunks of the target file to all blobs in the parent. After that, chunks with suitable matches are split, and comparison is repeated again, until there are no new matches. The trouble is, chunks that didn't match the first time, and weren't split, are compared against the same set of blobs again and again. I add a flag to track that. On my test case it decreased blame -C -C time from over 10min to ~6min; 4min with -C80. 2) Chunks are split only if the match scores above a certain threshold. I understand that a split of an entry cannot score more than the entry itself. Thus, it is pointless to even try doing costly comparisons for small entries. (Time goes down to 4min; 2min with -C80) Alexander Gavrilov (2): Avoid rescanning unchanged entries in search for copies. Do not try to detect move/copy for entries below threshold. builtin-blame.c | 33 ++++++++++++++++++++++++++++----- 1 files changed, 28 insertions(+), 5 deletions(-) diff --git a/builtin-blame.c b/builtin-blame.c index b451f6c..01453e2 100644 --- a/builtin-blame.c +++ b/builtin-blame.c @@ -161,6 +161,10 @@ struct blame_entry { */ char guilty; + /* true if the entry has been scanned for copies in the current parent + */ + char scanned; + /* the line number of the first line of this group in the * suspect's file; internally all line numbers are 0 based. */ @@ -1016,7 +1020,8 @@ static int find_move_in_parent(struct scoreboard *sb, while (made_progress) { made_progress = 0; for (e = sb->ent; e; e = e->next) { - if (e->guilty || !same_suspect(e->suspect, target)) + if (e->guilty || !same_suspect(e->suspect, target) || + blame_move_score >= ent_score(sb, e)) continue; find_copy_in_blob(sb, e, parent, split, &file_p); if (split[1].suspect && @@ -1041,6 +1046,7 @@ struct blame_list { */ static struct blame_list *setup_blame_list(struct scoreboard *sb, struct origin *target, + int min_score, int *num_ents_p) { struct blame_entry *e; @@ -1048,12 +1054,16 @@ static struct blame_list *setup_blame_list(struct scoreboard *sb, struct blame_list *blame_list = NULL; for (e = sb->ent, num_ents = 0; e; e = e->next) - if (!e->guilty && same_suspect(e->suspect, target)) + if (!e->scanned && !e->guilty && + same_suspect(e->suspect, target) && + ent_score(sb, e) > min_score) num_ents++; if (num_ents) { blame_list = xcalloc(num_ents, sizeof(struct blame_list)); for (e = sb->ent, i = 0; e; e = e->next) - if (!e->guilty && same_suspect(e->suspect, target)) + if (!e->scanned && !e->guilty && + same_suspect(e->suspect, target) && + ent_score(sb, e) > min_score) blame_list[i++].ent = e; } *num_ents_p = num_ents; @@ -1061,6 +1071,16 @@ static struct blame_list *setup_blame_list(struct scoreboard *sb, } /* + * Reset the scanned status on all entries. + */ +static void reset_scanned_flag(struct scoreboard *sb) +{ + struct blame_entry *e; + for (e = sb->ent; e; e = e->next) + e->scanned = 0; +} + +/* * For lines target is suspected for, see if we can find code movement * across file boundary from the parent commit. porigin is the path * in the parent we already tried. @@ -1078,7 +1098,7 @@ static int find_copy_in_parent(struct scoreboard *sb, struct blame_list *blame_list; int num_ents; - blame_list = setup_blame_list(sb, target, &num_ents); + blame_list = setup_blame_list(sb, target, blame_copy_score, &num_ents); if (!blame_list) return 1; /* nothing remains for this target */ @@ -1152,18 +1172,21 @@ static int find_copy_in_parent(struct scoreboard *sb, split_blame(sb, split, blame_list[j].ent); made_progress = 1; } + else + blame_list[j].ent->scanned = 1; decref_split(split); } free(blame_list); if (!made_progress) break; - blame_list = setup_blame_list(sb, target, &num_ents); + blame_list = setup_blame_list(sb, target, blame_copy_score, &num_ents); if (!blame_list) { retval = 1; break; } } + reset_scanned_flag(sb); diff_flush(&diff_opts); diff_tree_release_paths(&diff_opts); return retval; -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html