Hi Michael -
FYI, here are a few style nits that I changed in my version, based on a
quick scan. Not sure if these are all Git's style, but I think it's
mostly linux-kernel style.
I mention this mostly for future reference - specifically if we keep
separate versions of this code. Hopefully we won't. =)
I also did all the fingerprint prep in fill_origin_blob, which you can
see in an upcoming patch.
On 4/22/19 6:26 PM, michael@xxxxxxxxx wrote:
diff --git a/fuzzy.c b/fuzzy.c
new file mode 100644
index 0000000000..c5b09a0eb7
--- /dev/null
+++ b/fuzzy.c
@@ -0,0 +1,434 @@
+#include "fuzzy.h"
+#include <ctype.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "git-compat-util.h"
+#include "hashmap.h"
+
+struct fingerprint_entry {
+ struct hashmap_entry entry;
+ int count;
+};
+struct fingerprint {
+ struct hashmap map;
+ struct fingerprint_entry *entries;
+};
+
+static void get_fingerprint(struct fingerprint *result,
+ const char *line_begin,
+ const char *line_end) {
^newline here, so { is
at the start of a line. (on all of the functions)
+ unsigned hash;
^int
+ char c0, c1;
+ int map_entry_count = line_end - line_begin - 1;
+ struct fingerprint_entry *entry = xcalloc(map_entry_count,
+ sizeof(struct fingerprint_entry));
+ struct fingerprint_entry *found_entry;
Blank line here, between declarations and code. Did this for all of the
functions.
+ hashmap_init(&result->map, NULL, NULL, map_entry_count);
+ result->entries = entry;
+ for (const char *p = line_begin; p + 1 < line_end; ++p, ++entry) {
^moved this declaration outside the for loop
+ c0 = *p;
+ c1 = *(p + 1);
+ /* Ignore whitespace pairs */
+ if (isspace(c0) && isspace(c1))
+ continue;
+ hash = tolower(c0) | (tolower(c1) << 8);
+ hashmap_entry_init(entry, hash);
+
+ if ((found_entry = hashmap_get(&result->map, entry, NULL))) {
^moved this assignment outside the if ()
+ found_entry->count += 1;
+ }
+ else {
^ made this } else {.
Also in a couple other places below.
+ entry->count = 1;
+ hashmap_add(&result->map, entry);
+ }
+ }
+}
+
+static void free_fingerprint(struct fingerprint *f) {
+ hashmap_free(&f->map, 0);
+ free(f->entries);
+}
+
+static int fingerprint_similarity(struct fingerprint *a,
+ struct fingerprint *b) {
put fingerprint b on the same line as a (within 80 chars). made similar
changes wherever that was possible and looked nice.
+ int intersection = 0;
+ struct hashmap_iter iter;
+ const struct fingerprint_entry *entry_a, *entry_b;
+ hashmap_iter_init(&b->map, &iter);
+
+ while ((entry_b = hashmap_iter_next(&iter))) {
+ if ((entry_a = hashmap_get(&a->map, entry_b, NULL))) {
+ intersection += entry_a->count < entry_b->count ?
+ entry_a->count : entry_b->count;
+ }
+ }
+ return intersection;
+}
+
+static void fingerprint_subtract(struct fingerprint *a,
+ struct fingerprint *b) {
+ struct hashmap_iter iter;
+ struct fingerprint_entry *entry_a;
+ const struct fingerprint_entry *entry_b;
+ hashmap_iter_init(&b->map, &iter);
+
+ while ((entry_b = hashmap_iter_next(&iter))) {
+ if ((entry_a = hashmap_get(&a->map, entry_b, NULL))) {
+ if (entry_a->count <= entry_b->count) {
+ hashmap_remove(&a->map, entry_b, NULL);
+ }
+ else {
+ entry_a->count -= entry_b->count;
+ }
+ }
+ }
+}
+
+static void get_line_fingerprints(struct fingerprint *fingerprints,
+ const char *content,
+ const int *line_starts,
+ long chunk_start,
+ long chunk_length) {
+ int i;
+ const char *linestart, *lineend;
+ line_starts += chunk_start;
+ for (i = 0; i != chunk_length; ++i) {
^ any reason for '!=' versus '<' ?
+ linestart = content + line_starts[i];
+ lineend = content + line_starts[i + 1];
+ get_fingerprint(fingerprints + i, linestart, lineend);
+ }
+}
+
+static int get_closest_local_line(int start_a,
+ int local_line_b,
+ int closest_line_calc_offset1,
+ int closest_line_calc_offset2,
+ int closest_line_calc_numerator,
+ int closest_line_calc_denominator) {
+ return ((local_line_b + closest_line_calc_offset1) * 2 + 1) *
+ closest_line_calc_numerator /
+ (closest_line_calc_denominator * 2) +
+ closest_line_calc_offset2 - start_a;
^ i moved these three lines one space left, so that they
don't line up with the open paren. o/w it looked like they may be
inside the '('.
+}
+
+static int *get_similarity(int *similarities, int max_search_distance_a,
+ int local_line_a, int local_line_b,
+ int closest_local_line_a) {
+ assert(abs(local_line_a - closest_local_line_a) <= max_search_distance_a);
+ return similarities + local_line_a - closest_local_line_a +
+ max_search_distance_a +
+ local_line_b * (max_search_distance_a * 2 + 1);
+}
+
+#define CERTAIN_NOTHING_MATCHES -2
+#define CERTAINTY_NOT_CALCULATED -1
+
+static void find_best_line_matches(const int max_search_distance_a,
+ int start_a,
+ int length_a,
+ int local_line_b,
+ struct fingerprint *fingerprints_a,
+ struct fingerprint *fingerprints_b,
+ int *similarities,
+ int *certainties,
+ int *second_best_result,
+ int *result,
+ int closest_line_calc_offset1,
+ int closest_line_calc_offset2,
+ int closest_line_calc_numerator,
+ int closest_line_calc_denominator) {
^intense number of arguments. =)
Not sure if there's much to do about that. It does make some of the
callsites busier.
+
+ int i, search_start, search_end, closest_local_line_a, *similarity,
+ best_similarity = 0, second_best_similarity = 0,
+ best_similarity_index = 0, second_best_similarity_index = 0;
+
+ if (certainties[local_line_b] != CERTAINTY_NOT_CALCULATED)
+ return;
+
+ closest_local_line_a = get_closest_local_line(start_a,
+ local_line_b,
+ closest_line_calc_offset1,
+ closest_line_calc_offset2,
+ closest_line_calc_numerator,
+ closest_line_calc_denominator);
+
+ search_start = closest_local_line_a - max_search_distance_a;
+ if (search_start < 0)
+ search_start = 0;
+
+ search_end = closest_local_line_a + max_search_distance_a + 1;
+ if (search_end > length_a)
+ search_end = length_a;
+
+ for (i = search_start; i < search_end; ++i) {
+ similarity = get_similarity(similarities, max_search_distance_a,
+ i, local_line_b,
+ closest_local_line_a);
+ if (*similarity == -1) {
+ *similarity = fingerprint_similarity(
+ fingerprints_b + local_line_b,
+ fingerprints_a + i) *
+ (1000 - abs(i - closest_local_line_a));
+ }
+ if (*similarity > best_similarity) {
+ second_best_similarity = best_similarity;
+ second_best_similarity_index = best_similarity_index;
+ best_similarity = *similarity;
+ best_similarity_index = i;
+ }
+ else if (*similarity > second_best_similarity) {
+ second_best_similarity = *similarity;
+ second_best_similarity_index = i;
+ }
+ }
+
+ if (best_similarity == 0) {
+ certainties[local_line_b] = CERTAIN_NOTHING_MATCHES;
+ result[local_line_b] = -1;
+ }
+ else {
+ certainties[local_line_b] = best_similarity * 2 -
+ second_best_similarity;
+ result[local_line_b] = start_a + best_similarity_index;
+ second_best_result[local_line_b] =
+ start_a + second_best_similarity_index;
+ }
+}
+
+/*
+ * This finds the line that we can match with the most confidence, and
+ * uses it as a partition. It then calls itself on the lines on either side of
+ * that partition. In this way we avoid lines appearing out of order, and
+ * retain a sensible line ordering.
+ */
+static void fuzzy_find_matching_lines_recurse(
+ int max_search_distance_a,
+ int max_search_distance_b,
+ int start_a, int start_b,
+ int length_a, int length_b,
+ struct fingerprint *fingerprints_a,
+ struct fingerprint *fingerprints_b,
+ int *similarities,
+ int *certainties,
+ int *second_best_result,
+ int *result,
+ int closest_line_calc_offset1,
+ int closest_line_calc_offset2,
+ int closest_line_calc_numerator,
+ int closest_line_calc_denominator) {
+
+ int i, barrier, invalidate_min, invalidate_max, offset_b,
+ second_half_start_a, second_half_start_b,
+ second_half_length_a, second_half_length_b,
+ most_certain_line_a, most_certain_local_line_b = -1,
+ most_certain_line_certainty = -1,
+ closest_local_line_a;
+
+ for (i = 0; i < length_b; ++i) {
+ find_best_line_matches(max_search_distance_a,
+ start_a,
+ length_a,
+ i,
+ fingerprints_a,
+ fingerprints_b,
+ similarities,
+ certainties,
+ second_best_result,
+ result,
+ closest_line_calc_offset1,
+ closest_line_calc_offset2,
+ closest_line_calc_numerator,
+ closest_line_calc_denominator);
+
+ if (certainties[i] > most_certain_line_certainty) {
+ most_certain_line_certainty = certainties[i];
+ most_certain_local_line_b = i;
+ }
+ }
+
+ if (most_certain_local_line_b == -1) {
+ return;
+ }
^ removed the {} for a single-line if block.
+
+ most_certain_line_a = result[most_certain_local_line_b];
+
+ /* Subtract the most certain line's fingerprint in b from the
+ matched fingerprint in a. This means that other lines in b can't also
+ match the same parts of the line in a. */
^ multiline block commits as such:
/*
* foo
* bar
*/
+ fingerprint_subtract(fingerprints_a + most_certain_line_a - start_a,
+ fingerprints_b + most_certain_local_line_b);
+
+
+ /* Invalidate results that may be affected by the choice of pivot. */
+ invalidate_min = most_certain_local_line_b - max_search_distance_b;
+ invalidate_max = most_certain_local_line_b + max_search_distance_b + 1;
+ if (invalidate_min < 0)
+ invalidate_min = 0;
+ if (invalidate_max > length_b)
+ invalidate_max = length_b;
+
+ for (i = invalidate_min; i < invalidate_max; ++i) {
+ closest_local_line_a = get_closest_local_line(
+ start_a, i,
+ closest_line_calc_offset1,
+ closest_line_calc_offset2,
+ closest_line_calc_numerator,
+ closest_line_calc_denominator);
+ *get_similarity(similarities, max_search_distance_a,
+ most_certain_line_a - start_a, i,
+ closest_local_line_a) = -1;
+ }
+
+ barrier = most_certain_line_a;
+
+ for (i = most_certain_local_line_b - 1; i >= invalidate_min; --i) {
+ if (certainties[i] >= 0 &&
+ (result[i] >= barrier || second_best_result[i] >= barrier)) {
over 80 chars (same below)
+ certainties[i] = CERTAINTY_NOT_CALCULATED;
+ barrier = result[i];
+ invalidate_min = i - max_search_distance_b;
+ if (invalidate_min < 0)
+ invalidate_min = 0;
+ }
+ }
+
+ barrier = most_certain_line_a;
+
+ for (i = most_certain_local_line_b + 1; i < invalidate_max; ++i) {
+ if (certainties[i] >= 0 &&
+ (result[i] <= barrier || second_best_result[i] <= barrier)) {
+ certainties[i] = CERTAINTY_NOT_CALCULATED;
+ barrier = result[i];
+ invalidate_max = i + max_search_distance_b + 1;
+ if (invalidate_max > length_b)
+ invalidate_max = length_b;
+ }
+ }
+
+
+ if (most_certain_local_line_b > 0) {
+ fuzzy_find_matching_lines_recurse(
+ max_search_distance_a,
+ max_search_distance_b,
+ start_a, start_b,
+ most_certain_line_a + 1 - start_a,
+ most_certain_local_line_b,
+ fingerprints_a, fingerprints_b, similarities,
+ certainties, second_best_result, result,
+ closest_line_calc_offset1, closest_line_calc_offset2,
+ closest_line_calc_numerator,
+ closest_line_calc_denominator);
+ }
+ if (most_certain_local_line_b + 1 < length_b) {
+ second_half_start_a = most_certain_line_a;
+ offset_b = most_certain_local_line_b + 1;
+ second_half_start_b = start_b + offset_b;
+ second_half_length_a =
+ length_a + start_a - second_half_start_a;
+ second_half_length_b =
+ length_b + start_b - second_half_start_b;
+ fuzzy_find_matching_lines_recurse(
+ max_search_distance_a,
+ max_search_distance_b,
+ second_half_start_a, second_half_start_b,
+ second_half_length_a, second_half_length_b,
+ fingerprints_a + second_half_start_a - start_a,
+ fingerprints_b + offset_b,
+ similarities +
+ offset_b * (max_search_distance_a * 2 + 1),
+ certainties + offset_b,
+ second_best_result + offset_b, result + offset_b,
+ closest_line_calc_offset1 + offset_b,
+ closest_line_calc_offset2,
+ closest_line_calc_numerator,
+ closest_line_calc_denominator);
+ }
+}
+
+int *fuzzy_find_matching_lines(const char *content_a,
+ const char *content_b,
+ const int *line_starts_a,
+ const int *line_starts_b,
+ int start_a,
+ int start_b,
+ int length_a,
+ int length_b) {
+
+ int i, *result, *second_best_result,
+ *certainties, *similarities, similarity_count;
+ struct fingerprint *fingerprints_a, *fingerprints_b;
+
+ /* max_search_distance_a means that given a line in `b`, compare it to
+ the line in `a` that is closest to its position, and the lines in `a`
+ that are no greater than max_search_distance_a lines away from the
+ closest line in `a`.
+ max_search_distance_b is an upper bound on the greatest possible
+ distance between lines in `b` such that they will both be compared with
+ the same line in `a` according to max_search_distance_a. */
+ int max_search_distance_a = 10, max_search_distance_b;
+
+ if (max_search_distance_a >= length_a)
+ max_search_distance_a = length_a ? length_a - 1 : 0;
+
+ if (length_a == 0) {
+ max_search_distance_b = 0;
+ }
+ else {
+ max_search_distance_b = ((2 * max_search_distance_a + 1) *
+ length_b - 1) / length_a;
+ }
+
+ result = malloc(sizeof(int) * length_b);
+ second_best_result = malloc(sizeof(int) * length_b);
+ certainties = malloc(sizeof(int) * length_b);
+ similarity_count = length_b * (max_search_distance_a * 2 + 1);
+ similarities = malloc(sizeof(int) * similarity_count);
xcalloc(x, y) instead of malloc (x * y). or at least xmalloc.
+
+ for (i = 0; i < length_b; ++i) {
+ result[i] = -1;
+ second_best_result[i] = -1;
+ certainties[i] = CERTAINTY_NOT_CALCULATED;
+ }
+
+ for (i = 0; i < similarity_count; ++i) {
+ similarities[i] = -1;
+ }
^ removed {}, as as with the 'if' statements
Barret
+
+ fingerprints_a = xmalloc(sizeof(struct fingerprint) * length_a);
+ fingerprints_b = xmalloc(sizeof(struct fingerprint) * length_b);
+
+ get_line_fingerprints(fingerprints_a, content_a,
+ line_starts_a,
+ start_a, length_a);
+ get_line_fingerprints(fingerprints_b, content_b,
+ line_starts_b,
+ start_b, length_b);
+
+ fuzzy_find_matching_lines_recurse(max_search_distance_a,
+ max_search_distance_b,
+ start_a, start_b,
+ length_a, length_b,
+ fingerprints_a,
+ fingerprints_b,
+ similarities,
+ certainties,
+ second_best_result,
+ result,
+ 0, start_a, length_a, length_b);
+
+ for (i = 0; i < length_b; ++i) {
+ free_fingerprint(fingerprints_b + i);
+ }
+ for (i = 0; i < length_a; ++i) {
+ free_fingerprint(fingerprints_a + i);
+ }
+ free(fingerprints_b);
+ free(fingerprints_a);
+ free(similarities);
+ free(certainties);
+ free(second_best_result);
+
+ return result;
+}
+