On Wed, Apr 03, 2019 at 12:02:07PM -0400, Barret Rhoden wrote: > diff --git a/blame.c b/blame.c > index c06cbd906658..50511a300059 100644 > --- a/blame.c > +++ b/blame.c > @@ -915,27 +915,109 @@ static int are_lines_adjacent(struct blame_line_tracker *first, > first->s_lno + 1 == second->s_lno; > } > > -/* > - * This cheap heuristic assigns lines in the chunk to their relative location in > - * the parent's chunk. Any additional lines are left with the target. > +/* https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel */ > +static int bitcount(uint32_t v) > +{ > + v = v - ((v >> 1) & 0x55555555u); > + v = (v & 0x33333333u) + ((v >> 2) & 0x33333333u); > + return (((v + (v >> 4)) & 0xf0f0f0fu) * 0x1010101u) >> 24; > +} > + > +#define FINGERPRINT_LENGTH (8 * 256) > +#define FINGERPRINT_THRESHOLD 1 > +/* This is just a bitset indicating which byte pairs are present. > + * e.g. the string "good goo" has pairs "go", "oo", "od", "d ", " g" > + * String similarity is calculated as a bitwise or and counting the set bits. > + * > + * TODO for the string lengths we typically deal with, this would probably be > + * implemented more efficiently with a set data structure. > */ > +struct fingerprint { > + uint32_t bits[FINGERPRINT_LENGTH]; > +}; > + > +static void get_fingerprint(struct fingerprint *result, const char *line_begin, > + const char *line_end) > +{ > + for (const char *p = line_begin; p + 1 < line_end; ++p) { We still stick to C89, which doesn't support for loop initial declarations yet. Please declare the loop variable as a regular local variable. This also applies to the several 'for (int i = 0; ...)' loops in the functions below. > + unsigned int c = tolower(*p) | (tolower(*(p + 1)) << 8); > + > + result->bits[c >> 5] |= 1u << (c & 0x1f); > + } > +} > + > +static int fingerprint_similarity(const struct fingerprint *a, > + const struct fingerprint *b) > +{ > + int intersection = 0; > + > + for (int i = 0; i < FINGERPRINT_LENGTH; ++i) > + intersection += bitcount(a->bits[i] & b->bits[i]); > + return intersection; > +} > + > +static void get_chunk_fingerprints(struct fingerprint *fingerprints, > + const char *content, > + const int *line_starts, > + long chunk_start, > + long chunk_length) > +{ > + line_starts += chunk_start; > + for (int i = 0; i != chunk_length; ++i) { > + const char *linestart = content + line_starts[i]; > + const char *lineend = content + line_starts[i + 1]; > + > + get_fingerprint(fingerprints + i, linestart, lineend); > + } > +} > + > static void guess_line_blames(struct blame_entry *e, > struct blame_origin *parent, > struct blame_origin *target, > int offset, int delta, > struct blame_line_tracker *line_blames) > { > + struct fingerprint *fp_parent, *fp_target; > int nr_parent_lines = e->num_lines - delta; > > + fp_parent = xcalloc(sizeof(struct fingerprint), nr_parent_lines); > + fp_target = xcalloc(sizeof(struct fingerprint), e->num_lines); > + > + get_chunk_fingerprints(fp_parent, parent->file.ptr, > + parent->line_starts, > + e->s_lno + offset, nr_parent_lines); > + get_chunk_fingerprints(fp_target, target->file.ptr, > + target->line_starts, > + e->s_lno, e->num_lines); > + > for (int i = 0; i < e->num_lines; i++) { > - if (i < nr_parent_lines) { > + int best_sim_val = FINGERPRINT_THRESHOLD; > + int best_sim_idx = -1; > + int sim; > + > + for (int j = 0; j < nr_parent_lines; j++) { > + sim = fingerprint_similarity(&fp_target[i], > + &fp_parent[j]); > + if (sim < best_sim_val) > + continue; > + /* Break ties with the closest-to-target line number */ > + if (sim == best_sim_val && best_sim_idx != -1 && > + abs(best_sim_idx - i) < abs(j - i)) > + continue; > + best_sim_val = sim; > + best_sim_idx = j; > + } > + if (best_sim_idx >= 0) { > line_blames[i].is_parent = 1; > - line_blames[i].s_lno = e->s_lno + i + offset; > + line_blames[i].s_lno = e->s_lno + offset + best_sim_idx; > } else { > line_blames[i].is_parent = 0; > line_blames[i].s_lno = e->s_lno + i; > } > } > + > + free(fp_parent); > + free(fp_target); > } > > /* > -- > 2.21.0.392.gf8f6787159e-goog >