This library attempts to find similarities among items efficiently. It treats items as opaque pointers; callers are responsible for providing an item along with its contents. The algorithm is roughly: 1. for each item, create a set of fingerprints for each chunk of the item (where each chunk is either delimited by a newline or is 64 characters, whichever is smaller -- this is the same fingerprint code from diffcore-delta.c). A hash stores a mapping of fingerprints to items, with each fingerprint having at most one 'source' item and one 'dest' item. 2. for each fingerprint with a source and dest item, find the entry with key (source, dest) in a hash table and increment its value by the value of the fingerprint 3. for each (source, dest) pair that had non-zero similarity, report the pair to the caller The program test-similarity is a simple demonstration of the code. It takes two list of files on stdin, with each file separated by newlines and the two lists separated by a blank line. It prints the similarity score of each non-zero pair on stdout. There are a few "interesting" design decisions, which should probably be tweaked: - we store only one source and dest for each fingerprint item. We need to bound this list so that we don't get O(n^2) behavior for common fingerprints. This means that some files won't get "credit" for common fingerprints, which is probably OK, since those fingerprints are probably uninteresting. We could bound at some small number greater than one; one was chosen for speed and simplicity of implementation. We also don't store any overflow bit, so commonly used fingerprints will get assigned to whatever file is found with them last. - the similarity engine hands back absolute, non-normalized scores. That means that bigger items will have bigger scores, and the caller is responsible for normalizing. This also means that the engine cannot adjust normalization to account for chunks which are thrown out by the bounding mentioned above (i.e., if a file is 80 bytes, 64 of which are ignored as "common", then it can at most have 20% similarity (16/80) with another file. This means our normalized rename values (i.e., percentage similarity) will be lower than other algorithms. Signed-off-by: Jeff King <peff@xxxxxxxx> --- .gitignore | 1 + Makefile | 4 +- similarity.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++++++++ similarity.h | 24 ++++++++ test-similarity.c | 54 +++++++++++++++++++ 5 files changed, 233 insertions(+), 2 deletions(-) create mode 100644 similarity.c create mode 100644 similarity.h create mode 100644 test-similarity.c diff --git a/.gitignore b/.gitignore index 62afef2..8ce6026 100644 --- a/.gitignore +++ b/.gitignore @@ -155,6 +155,7 @@ test-dump-cache-tree test-genrandom test-match-trees test-sha1 +test-similarity common-cmds.h *.tar.gz *.dsc diff --git a/Makefile b/Makefile index 2e6fd8f..0568d0d 100644 --- a/Makefile +++ b/Makefile @@ -300,7 +300,7 @@ DIFF_OBJS = \ LIB_OBJS = \ blob.o commit.o connect.o csum-file.o cache-tree.o base85.o \ date.o diff-delta.o entry.o exec_cmd.o ident.o \ - interpolate.o hash.o \ + interpolate.o hash.o similarity.o \ lockfile.o \ patch-ids.o \ object.o pack-check.o pack-write.o patch-delta.o path.o pkt-line.o \ @@ -975,7 +975,7 @@ endif ### Testing rules -TEST_PROGRAMS = test-chmtime$X test-genrandom$X test-date$X test-delta$X test-sha1$X test-match-trees$X test-absolute-path$X +TEST_PROGRAMS = test-chmtime$X test-genrandom$X test-date$X test-delta$X test-sha1$X test-match-trees$X test-absolute-path$X test-similarity$X all:: $(TEST_PROGRAMS) diff --git a/similarity.c b/similarity.c new file mode 100644 index 0000000..0bd135c --- /dev/null +++ b/similarity.c @@ -0,0 +1,152 @@ +#include "cache.h" +#include "similarity.h" + +struct fingerprint_entry { + void *src; + void *dst; + unsigned weight; +}; + +struct score_entry { + void *src; + void *dst; + unsigned score; + struct score_entry *next; +}; + +void similarity_init(struct similarity *s) +{ + init_hash(&s->fingerprints); + init_hash(&s->scores); +} + +static int free_one_fingerprint(void *ve, void *data) +{ + struct fingerprint_entry *e = ve; + free(e); + return 0; +} + +static int free_one_score(void *ve, void *data) +{ + struct score_entry *e = ve; + while (e) { + struct score_entry *next = e->next; + free(e); + e = next; + } + return 0; +} + +void similarity_free(struct similarity *s) +{ + for_each_hash(&s->fingerprints, free_one_fingerprint, NULL); + free_hash(&s->fingerprints); + for_each_hash(&s->scores, free_one_score, NULL); + free_hash(&s->scores); +} + +static void add_fingerprint(struct hash_table *h, unsigned int fp, + int type, void *data, unsigned weight) +{ + void **pos; + struct fingerprint_entry *e; + + pos = insert_hash(fp, h); + if (!*pos) { + e = xmalloc(sizeof(*e)); + e->weight = weight; + e->src = e->dst = NULL; + *pos = e; + } + else + e = *pos; + + if (type == SIMILARITY_SOURCE) + e->src = data; + else + e->dst = data; +} + +void similarity_add(struct similarity *sim, int type, void *data, + const char *buf, unsigned long sz, int is_text) +{ + int n; + unsigned int accum1, accum2, hashval; + + n = 0; + accum1 = accum2 = 0; + while (sz) { + unsigned int c = *buf++; + unsigned int old_1 = accum1; + sz--; + + /* Ignore CR in CRLF sequence if text */ + if (!is_text && c == '\r' && sz && *buf == '\n') + continue; + + accum1 = (accum1 << 7) ^ (accum2 >> 25); + accum2 = (accum2 << 7) ^ (old_1 >> 25); + accum1 += c; + if (++n < 64 && c != '\n') + continue; + hashval = accum1 + accum2 * 0x61; + add_fingerprint(&sim->fingerprints, hashval, type, data, n); + n = 0; + accum1 = accum2 = 0; + } +} + +static unsigned hash_void_pair(void *a, void *b) +{ + return (unsigned)a + (unsigned)b; +} + +static int score_one_entry(void *vfp, void *vsim) +{ + struct fingerprint_entry *fp = vfp; + struct similarity *sim = vsim; + struct score_entry *score; + void **pos; + + if (!fp->src || !fp->dst) + return 0; + + pos = insert_hash(hash_void_pair(fp->src, fp->dst), &sim->scores); + for (score = *pos; score; score = score->next) { + if (score->src == fp->src && score->dst == fp->dst) { + score->score += fp->weight; + return 0; + } + } + + score = xmalloc(sizeof(*score)); + score->src = fp->src; + score->dst = fp->dst; + score->score = fp->weight; + score->next = *pos; + *pos = score; + + return 0; +} + +void similarity_score(struct similarity *s) +{ + for_each_hash(&s->fingerprints, score_one_entry, s); +} + +static int report_one_score(void *ve, void *vdata) +{ + struct score_entry *e; + void (*fn)(void *, void *, unsigned) = vdata; + + for (e = ve; e; e = e->next) + fn(e->src, e->dst, e->score); + return 1; +} + +void similarity_report(struct similarity *s, + void (*fn)(void *, void *, unsigned)) +{ + for_each_hash(&s->scores, report_one_score, fn); +} diff --git a/similarity.h b/similarity.h new file mode 100644 index 0000000..6409ab8 --- /dev/null +++ b/similarity.h @@ -0,0 +1,24 @@ +#ifndef SIMILARITY_H +#define SIMILARITY_H + +#include "hash.h" + +#define SIMILARITY_SOURCE 0 +#define SIMILARITY_DEST 1 + +struct similarity { + struct hash_table fingerprints; + struct hash_table scores; +}; + +void similarity_init(struct similarity *s); +void similarity_free(struct similarity *s); + +void similarity_add(struct similarity *s, int type, void *data, + const char *buf, unsigned long sz, int is_text); + +void similarity_score(struct similarity *s); +void similarity_report(struct similarity *s, + void (*fn)(void *, void *, unsigned)); + +#endif /* SIMILARITY_H */ diff --git a/test-similarity.c b/test-similarity.c new file mode 100644 index 0000000..5947420 --- /dev/null +++ b/test-similarity.c @@ -0,0 +1,54 @@ +#include <string.h> +#include <stdio.h> +#include <errno.h> + +#include "git-compat-util.h" +#include "similarity.h" +#include "strbuf.h" +#include "xdiff-interface.h" + +static void print_rename(void *vone, void *vtwo, unsigned score) { + const char *one = vone, *two = vtwo; + printf("%u %s -> %s\n", score, one, two); +} + +static void add_file(struct similarity *sim, int type, const char *fn) +{ + struct strbuf sb = STRBUF_INIT; + int len; + + len = strbuf_read_file(&sb, fn, 0); + if (len < 0) + die("unable to read %s: %s\n", fn, strerror(errno)); + + similarity_add(sim, type, strdup(fn), + sb.buf, sb.len, buffer_is_binary(sb.buf, sb.len)); + + strbuf_release(&sb); +} + +int main(int argc, char **argv) +{ + struct similarity sim; + struct strbuf line; + + similarity_init(&sim); + strbuf_init(&line, 0); + + while (strbuf_getline(&line, stdin, '\n') != EOF) { + if (!line.len) + break; + add_file(&sim, SIMILARITY_SOURCE, line.buf); + } + while (strbuf_getline(&line, stdin, '\n') != EOF) + add_file(&sim, SIMILARITY_DEST, line.buf); + + strbuf_release(&line); + + similarity_score(&sim); + similarity_report(&sim, print_rename); + + similarity_free(&sim); + + return 0; +} -- 1.5.3.4 - To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html