[PATCH v5 6/6] RFC blame: use a fingerprint heuristic to match ignored lines

Barret Rhoden <brho@xxxxxxxxxx> · Wed, 3 Apr 2019 12:02:07 -0400

This replaces the heuristic used to identify lines from ignored commits
with one that finds likely candidate lines in the parent's version of
the file.

The old heuristic simply assigned lines in the target to the same line
number (plus offset) in the parent.  The new function uses
fingerprinting based on the sum of the bitwise-ORed bits in a string.

The fingerprint code and the idea to use them for blame
came from Michael Platings <michael@xxxxxxxxx>.

For each line in the target, guess_line_blames() finds the best line in
the parent, above a magic threshold (1 byte pair, currently).  Ties are
broken by proximity of the parent line number to the target's line.

Here's an example of the difference between algorithms.  Consider a file
with four commits:

	commit-a 11) void new_func_1(void *x, void *y);
	commit-b 12) void new_func_2(void *x, void *y);
	commit-c 13) some_line_c
	commit-d 14) some_line_d

After a commit 'X', we have:

	commit-X 11) void new_func_1(void *x,
	commit-X 12)                 void *y);
	commit-X 13) void new_func_2(void *x,
	commit-X 14)                 void *y);
	commit-c 15) some_line_c
	commit-d 16) some_line_d

When we blame-ignored with the old algorithm, we get:

	commit-a 11) void new_func_1(void *x,
	commit-b 12)                 void *y);
	00000000 13) void new_func_2(void *x,
	00000000 14)                 void *y);
	commit-c 15) some_line_c
	commit-d 16) some_line_d

Where commit-b is blamed for 12 instead of 13.  With the fingerprint
algorithm, we get:

	commit-a 11) void new_func_1(void *x,
	commit-b 12)                 void *y);
	commit-b 13) void new_func_2(void *x,
	commit-b 14)                 void *y);
	commit-c 15) some_line_c
	commit-d 16) some_line_d

Note both lines 12 and 14 are given to commit b.  Their match is above
the FINGERPRINT_THRESHOLD, and they tied.  Specifically, parent lines 11
and 12 both match these lines.  The algorithm chose parent line 12,
since that was closest to the target line numbers of 12 and 14.

If we increase the threshold, say to 10, those two lines won't match,
and will be treated as 'unblamable.'

TODO:
- Is '1' a decent threshold?  Add another user option?

- Can we decrease that FINGERPRINT_LENGTH?  Or do something about the
TODO about using a set data structure?

- How about matching *outside* the parent's diff hunk?  Right now, this
just looks in the parent's hunk for a match.  But a better match may be
elsewhere in the file.  This might happen when the diff has too small of
a -M.  If we do this, then consider caching the fingerprints for a
parent's entire file, since multiple target blame_entries may check the
entire parent's space.  Also, if we do this, we probably need to sort the
parent's blame list (again), since the spliced entries point outside of
the diff hunk's range in the parent's address space.

- If we never intend to match outside the parent's diff hunk, then we
might be able to short-circuit guess_line_blames() when the parent's
chunk length is 0.  Doing this somewhat limits the algorithms, and would
be a performance optimization, which I didn't want to do without numbers
saying we needed it.

- Fix up this commit + message.  I'd be up for splitting it more,
particularly if Michael wants his contributions/fingerprinting in his
own commit.

Suggested-by: Michael Platings <michael@xxxxxxxxx>
Signed-off-by: Barret Rhoden <brho@xxxxxxxxxx>
---
 blame.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 87 insertions(+), 5 deletions(-)

diff --git a/blame.c b/blame.c
index c06cbd906658..50511a300059 100644
--- a/blame.c
+++ b/blame.c
@@ -915,27 +915,109 @@ static int are_lines_adjacent(struct blame_line_tracker *first,
 	       first->s_lno + 1 == second->s_lno;
 }
 
-/*
- * This cheap heuristic assigns lines in the chunk to their relative location in
- * the parent's chunk.  Any additional lines are left with the target.
+/* https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel */
+static int bitcount(uint32_t v)
+{
+	v = v - ((v >> 1) & 0x55555555u);
+	v = (v & 0x33333333u) + ((v >> 2) & 0x33333333u);
+	return (((v + (v >> 4)) & 0xf0f0f0fu) * 0x1010101u) >> 24;
+}
+
+#define FINGERPRINT_LENGTH (8 * 256)
+#define FINGERPRINT_THRESHOLD 1
+/* This is just a bitset indicating which byte pairs are present.
+ * e.g. the string "good goo" has pairs "go", "oo", "od", "d ", " g"
+ * String similarity is calculated as a bitwise or and counting the set bits.
+ *
+ * TODO for the string lengths we typically deal with, this would probably be
+ * implemented more efficiently with a set data structure.
  */
+struct fingerprint {
+	uint32_t bits[FINGERPRINT_LENGTH];
+};
+
+static void get_fingerprint(struct fingerprint *result, const char *line_begin,
+			    const char *line_end)
+{
+	for (const char *p = line_begin; p + 1 < line_end; ++p) {
+		unsigned int c = tolower(*p) | (tolower(*(p + 1)) << 8);
+
+		result->bits[c >> 5] |= 1u << (c & 0x1f);
+	}
+}
+
+static int fingerprint_similarity(const struct fingerprint *a,
+				  const struct fingerprint *b)
+{
+	int intersection = 0;
+
+	for (int i = 0; i < FINGERPRINT_LENGTH; ++i)
+		intersection += bitcount(a->bits[i] & b->bits[i]);
+	return intersection;
+}
+
+static void get_chunk_fingerprints(struct fingerprint *fingerprints,
+				   const char *content,
+				   const int *line_starts,
+				   long chunk_start,
+				   long chunk_length)
+{
+	line_starts += chunk_start;
+	for (int i = 0; i != chunk_length; ++i) {
+		const char *linestart = content + line_starts[i];
+		const char *lineend = content + line_starts[i + 1];
+
+		get_fingerprint(fingerprints + i, linestart, lineend);
+	}
+}
+
 static void guess_line_blames(struct blame_entry *e,
 			      struct blame_origin *parent,
 			      struct blame_origin *target,
 			      int offset, int delta,
 			      struct blame_line_tracker *line_blames)
 {
+	struct fingerprint *fp_parent, *fp_target;
 	int nr_parent_lines = e->num_lines - delta;
 
+	fp_parent = xcalloc(sizeof(struct fingerprint), nr_parent_lines);
+	fp_target = xcalloc(sizeof(struct fingerprint), e->num_lines);
+
+	get_chunk_fingerprints(fp_parent, parent->file.ptr,
+			       parent->line_starts,
+			       e->s_lno + offset, nr_parent_lines);
+	get_chunk_fingerprints(fp_target, target->file.ptr,
+			       target->line_starts,
+			       e->s_lno, e->num_lines);
+
 	for (int i = 0; i < e->num_lines; i++) {
-		if (i < nr_parent_lines) {
+		int best_sim_val = FINGERPRINT_THRESHOLD;
+		int best_sim_idx = -1;
+		int sim;
+
+		for (int j = 0; j < nr_parent_lines; j++) {
+			sim = fingerprint_similarity(&fp_target[i],
+						     &fp_parent[j]);
+			if (sim < best_sim_val)
+				continue;
+			/* Break ties with the closest-to-target line number */
+			if (sim == best_sim_val && best_sim_idx != -1 &&
+			    abs(best_sim_idx - i) < abs(j - i))
+				continue;
+			best_sim_val = sim;
+			best_sim_idx = j;
+		}
+		if (best_sim_idx >= 0) {
 			line_blames[i].is_parent = 1;
-			line_blames[i].s_lno = e->s_lno + i + offset;
+			line_blames[i].s_lno = e->s_lno + offset + best_sim_idx;
 		} else {
 			line_blames[i].is_parent = 0;
 			line_blames[i].s_lno = e->s_lno + i;
 		}
 	}
+
+	free(fp_parent);
+	free(fp_target);
 }
 
 /*
-- 
2.21.0.392.gf8f6787159e-goog