On Wed, May 29, 2013 at 02:20:07AM -0400, Jeff King wrote: > In the best case, we compute no patch-ids at all. And even for the > average case, I'd expect our lazy calculation to only have to compute a > handful of ids. Here is a not-well-tested version of the idea. I tried to contain the changes to patch-ids.c, though we may also be able to simplify the --cherry code, too. Here are my timings compared to stock git and to yours (all are best-of-five, "git log --cherry $revs"): revs=origin/master...origin/jk/submodule-subdirectory-ok stock | you | me ------------------------------- real 0m0.501s | 0m0.078s | 0m0.098s user 0m0.480s | 0m0.056s | 0m0.084s sys 0m0.016s | 0m0.020s | 0m0.012s revs=origin/next...origin/pu stock | you | me ------------------------------- real 0m0.857s | 0m0.847s | 0m0.519s user 0m0.828s | 0m0.812s | 0m0.488s sys 0m0.024s | 0m0.028s | 0m0.024s So it performs slightly less well on the small case, and a bit better on the large case. I think part of the problem is that when we do have a "loose" hit, we end up re-doing the tree diff to find the strict, which involves re-inflating the trees. It's unavoidable for the lazy entries unless we want to cache the whole diff_queued_diff, but for the non-lazy entries we should be able to do the strict patch-id incrementally. We just need to improve the function interfaces. --- diff.c | 15 ++++- diff.h | 2 +- patch-ids.c | 117 +++++++++++++++++++---------------- patch-ids.h | 11 ++-- 4 files changed, 82 insertions(+), 63 deletions(-) diff --git a/diff.c b/diff.c index f0b3e7c..3b55788 100644 --- a/diff.c +++ b/diff.c @@ -4233,7 +4233,8 @@ static void patch_id_consume(void *priv, char *line, unsigned long len) } /* returns 0 upon success, and writes result into sha1 */ -static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1) +static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1, + int loose) { struct diff_queue_struct *q = &diff_queued_diff; int i; @@ -4266,6 +4267,12 @@ static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1) if (DIFF_PAIR_UNMERGED(p)) continue; + if (loose) { + git_SHA1_Update(&ctx, p->one->path, strlen(p->one->path)); + git_SHA1_Update(&ctx, p->two->path, strlen(p->two->path)); + continue; + } + diff_fill_sha1_info(p->one); diff_fill_sha1_info(p->two); if (fill_mmfile(&mf1, p->one) < 0 || @@ -4323,11 +4330,13 @@ int diff_flush_patch_id(struct diff_options *options, unsigned char *sha1) return 0; } -int diff_flush_patch_id(struct diff_options *options, unsigned char *sha1) +int diff_flush_patch_id(struct diff_options *options, + unsigned char *sha1, + int loose) { struct diff_queue_struct *q = &diff_queued_diff; int i; - int result = diff_get_patch_id(options, sha1); + int result = diff_get_patch_id(options, sha1, loose); for (i = 0; i < q->nr; i++) diff_free_filepair(q->queue[i]); diff --git a/diff.h b/diff.h index 78b4091..b018aaf 100644 --- a/diff.h +++ b/diff.h @@ -320,7 +320,7 @@ extern int do_diff_cache(const unsigned char *, struct diff_options *); extern int run_diff_index(struct rev_info *revs, int cached); extern int do_diff_cache(const unsigned char *, struct diff_options *); -extern int diff_flush_patch_id(struct diff_options *, unsigned char *); +extern int diff_flush_patch_id(struct diff_options *, unsigned char *, int loose); extern int diff_result_code(struct diff_options *, int); diff --git a/patch-ids.c b/patch-ids.c index bc8a28f..3a83ee6 100644 --- a/patch-ids.c +++ b/patch-ids.c @@ -5,7 +5,7 @@ static int commit_patch_id(struct commit *commit, struct diff_options *options, #include "patch-ids.h" static int commit_patch_id(struct commit *commit, struct diff_options *options, - unsigned char *sha1) + unsigned char *sha1, int loose) { if (commit->parents) diff_tree_sha1(commit->parents->item->object.sha1, @@ -13,27 +13,9 @@ static int commit_patch_id(struct commit *commit, struct diff_options *options, else diff_root_tree_sha1(commit->object.sha1, "", options); diffcore_std(options); - return diff_flush_patch_id(options, sha1); + return diff_flush_patch_id(options, sha1, loose); } -static const unsigned char *patch_id_access(size_t index, void *table) -{ - struct patch_id **id_table = table; - return id_table[index]->patch_id; -} - -static int patch_pos(struct patch_id **table, int nr, const unsigned char *id) -{ - return sha1_pos(id, table, nr, patch_id_access); -} - -#define BUCKET_SIZE 190 /* 190 * 21 = 3990, with slop close enough to 4K */ -struct patch_id_bucket { - struct patch_id_bucket *next; - int nr; - struct patch_id bucket[BUCKET_SIZE]; -}; - int init_patch_ids(struct patch_ids *ids) { memset(ids, 0, sizeof(*ids)); @@ -43,56 +25,83 @@ static struct patch_id *add_commit(struct commit *commit, return 0; } +static int free_each_patch_id(void *patch_id, void *data) +{ + free(patch_id); + return 0; +} + int free_patch_ids(struct patch_ids *ids) { - struct patch_id_bucket *next, *patches; - - free(ids->table); - for (patches = ids->patches; patches; patches = next) { - next = patches->next; - free(patches); - } + for_each_hash(&ids->table, free_each_patch_id, NULL); + free_hash(&ids->table); return 0; } +static inline unsigned long hash_sha1(const unsigned char *sha1) +{ + unsigned int hash; + memcpy(&hash, sha1, sizeof(hash)); + return hash; +} + static struct patch_id *add_commit(struct commit *commit, struct patch_ids *ids, int no_add) { - struct patch_id_bucket *bucket; - struct patch_id *ent; - unsigned char sha1[20]; - int pos; + struct patch_id *p; + unsigned char loose[20], strict[20]; + unsigned long hash; + void **pos; - if (commit_patch_id(commit, &ids->diffopts, sha1)) + if (commit_patch_id(commit, &ids->diffopts, loose, 1)) return NULL; - pos = patch_pos(ids->table, ids->nr, sha1); - if (0 <= pos) - return ids->table[pos]; + hash = hash_sha1(loose); + + p = lookup_hash(hash, &ids->table); + for (; p; p = p->next) { + /* Skip collisions from our reduced hash. */ + if (hashcmp(loose, p->loose)) + continue; + + /* + * We have a real loose match; lazily load and cache the strict + * id to compare. + */ + if (is_null_sha1(p->strict)) { + if (commit_patch_id(p->commit, &ids->diffopts, + p->strict, 0)) + return NULL; + } + if (commit_patch_id(commit, &ids->diffopts, strict, 0)) + return NULL; + + /* + * If the strict ones match, we do not need to look farther; + * the patch-id is here. + */ + if (!hashcmp(strict, p->strict)) + return p; + } + + /* + * Otherwise, we may have a loose but not strict match, or even no + * loose match at all. Now we can add the new entry (or return failure + * to look it up). + */ if (no_add) return NULL; - pos = -1 - pos; + p = xcalloc(1, sizeof(*p)); + p->commit = commit; + hashcpy(p->loose, loose); - bucket = ids->patches; - if (!bucket || (BUCKET_SIZE <= bucket->nr)) { - bucket = xcalloc(1, sizeof(*bucket)); - bucket->next = ids->patches; - ids->patches = bucket; + pos = insert_hash(hash, p, &ids->table); + if (pos) { + p->next = *pos; + *pos = p; } - ent = &bucket->bucket[bucket->nr++]; - hashcpy(ent->patch_id, sha1); - - if (ids->alloc <= ids->nr) { - ids->alloc = alloc_nr(ids->nr); - ids->table = xrealloc(ids->table, sizeof(ent) * ids->alloc); - } - if (pos < ids->nr) - memmove(ids->table + pos + 1, ids->table + pos, - sizeof(ent) * (ids->nr - pos)); - ids->nr++; - ids->table[pos] = ent; - return ids->table[pos]; + return p; } struct patch_id *has_commit_patch_id(struct commit *commit, diff --git a/patch-ids.h b/patch-ids.h index c8c7ca1..c40ff39 100644 --- a/patch-ids.h +++ b/patch-ids.h @@ -2,15 +2,16 @@ struct patch_ids { #define PATCH_IDS_H struct patch_id { - unsigned char patch_id[20]; - char seen; + struct commit *commit; + unsigned char loose[20]; + unsigned char strict[20]; + unsigned seen:1; + struct patch_id *next; }; struct patch_ids { struct diff_options diffopts; - int nr, alloc; - struct patch_id **table; - struct patch_id_bucket *patches; + struct hash_table table; }; int init_patch_ids(struct patch_ids *); -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html