On Tue, Jan 29, 2013 at 04:16:11AM -0500, Jeff King wrote: > When we are doing a commit traversal that does not need to > look at the commit messages themselves (e.g., rev-list, > merge-base, etc), we spend a lot of time accessing, > decompressing, and parsing the commit objects just to find > the parent and timestamp information. We can make a > space-time tradeoff by caching that information on disk in a > compact, uncompressed format. And this is a (messy) patch on top that avoids storing SHA-1 directly. On my linux-2.6.git (575 MB pack, 73 MB index), .commits file is 5.2 MB and 27 MB with and without my patch respectively. Nice shrinkage. However, performance seems to suffer too. Maybe I do more lookups than necessary, I don't know. I should probably measure the cost of revindex separately. git rev-list --all --quiet on vanilla git: real 0m3.645s user 0m3.556s sys 0m0.080s commit cache without my patch: real 0m0.723s user 0m0.677s sys 0m0.045s and with my patch: real 0m1.338s user 0m1.259s sys 0m0.075s Another point, but not really important at this stage, I think we have memory leak somewhere (lookup_commit??). It used up to 800 MB RES on linux-2.6.git while generating the cache. -- 8< -- diff --git a/cache.h b/cache.h index 1f96f65..8048d5b 100644 --- a/cache.h +++ b/cache.h @@ -1069,6 +1069,7 @@ extern struct packed_git *add_packed_git(const char *, int, int); extern const unsigned char *nth_packed_object_sha1(struct packed_git *, uint32_t); extern off_t nth_packed_object_offset(const struct packed_git *, uint32_t); extern off_t find_pack_entry_one(const unsigned char *, struct packed_git *); +extern int find_pack_entry_pos(const unsigned char *, struct packed_git *); extern int is_pack_valid(struct packed_git *); extern void *unpack_entry(struct packed_git *, off_t, enum object_type *, unsigned long *); extern unsigned long unpack_object_header_buffer(const unsigned char *buf, unsigned long len, enum object_type *type, unsigned long *sizep); diff --git a/commit-metapack.c b/commit-metapack.c index 2c19f48..55f7ea9 100644 --- a/commit-metapack.c +++ b/commit-metapack.c @@ -3,11 +3,12 @@ #include "metapack.h" #include "commit.h" #include "sha1-lookup.h" +#include "pack-revindex.h" struct commit_metapack { struct metapack mp; - uint32_t nr; - unsigned char *index; + struct packed_git *pack; + uint32_t first, last; unsigned char *data; struct commit_metapack *next; }; @@ -16,7 +17,7 @@ static struct commit_metapack *commit_metapacks; static struct commit_metapack *alloc_commit_metapack(struct packed_git *pack) { struct commit_metapack *it = xcalloc(1, sizeof(*it)); - uint32_t version; + uint32_t version, nr; if (metapack_init(&it->mp, pack, "commits", &version) < 0) { free(it); @@ -39,22 +40,25 @@ static struct commit_metapack *alloc_commit_metapack(struct packed_git *pack) free(it); return NULL; } - memcpy(&it->nr, it->mp.data, 4); - it->nr = ntohl(it->nr); + memcpy(&it->first, it->mp.data, 4); + it->first = ntohl(it->first); + memcpy(&it->last, it->mp.data + 4, 4); + it->last = ntohl(it->last); + nr = it->last - it->first + 1; + it->pack = pack; /* - * We need 84 bytes for each entry: sha1(20), date(4), tree(20), - * parents(40). + * We need 16 bytes for each entry: date(4), tree index(4), + * parent indexes(8). */ - if (it->mp.len < (84 * it->nr + 4)) { + if (it->mp.len < (16 * nr + 8)) { warning("commit metapack for '%s' is truncated", pack->pack_name); metapack_close(&it->mp); free(it); return NULL; } - it->index = it->mp.data + 4; - it->data = it->index + 20 * it->nr; + it->data = it->mp.data + 8; return it; } @@ -81,31 +85,61 @@ static void prepare_commit_metapacks(void) initialized = 1; } +static const unsigned char *idx_to_sha1(struct packed_git *p, + uint32_t nth) +{ + struct revindex_entry *revindex = get_revindex(p); + if (!revindex) + return NULL; + return nth_packed_object_sha1(p, revindex[nth].nr); +} + int commit_metapack(unsigned char *sha1, uint32_t *timestamp, - unsigned char **tree, - unsigned char **parent1, - unsigned char **parent2) + const unsigned char **tree, + const unsigned char **parent1, + const unsigned char **parent2) { struct commit_metapack *p; prepare_commit_metapacks(); for (p = commit_metapacks; p; p = p->next) { unsigned char *data; - int pos = sha1_entry_pos(p->index, 20, 0, 0, p->nr, p->nr, sha1); - if (pos < 0) + uint32_t p1, p2; + struct revindex_entry *re, *base; + off_t off; + uint32_t pos; + + base = get_revindex(p->pack); + off = find_pack_entry_one(sha1, p->pack); + if (!off) + continue; + re = find_pack_revindex(p->pack, off); + if (!re) + continue; + pos = re - base; + if (pos < p->first || pos > p->last) continue; /* timestamp(4) + tree(20) + parents(40) */ - data = p->data + 64 * pos; + data = p->data + 16 * (pos - p->first); *timestamp = *(uint32_t *)data; *timestamp = ntohl(*timestamp); + if (!*timestamp) + return -1; data += 4; - *tree = data; - data += 20; - *parent1 = data; - data += 20; - *parent2 = data; + *tree = idx_to_sha1(p->pack, ntohl(*(uint32_t*)data)); + data += 4; + p1 = ntohl(*(uint32_t*)data); + *parent1 = idx_to_sha1(p->pack, p1); + data += 4; + p2 = ntohl(*(uint32_t*)data); + if (p1 == p2) + *parent2 = null_sha1; + else + *parent2 = idx_to_sha1(p->pack, p2); + if (!*tree || !*parent1 || !*parent2) + return -1; return 0; } @@ -113,63 +147,114 @@ int commit_metapack(unsigned char *sha1, return -1; } -static void get_commits(struct metapack_writer *mw, - const unsigned char *sha1, - void *data) +static int get_commits(struct metapack_writer *mw, + uint32_t nth, + int dry_run) { - struct commit_list ***tail = data; + const unsigned char *sha1 = nth_packed_object_sha1(mw->pack, nth); enum object_type type = sha1_object_info(sha1, NULL); struct commit *c; + struct revindex_entry *revindex, *ridx; + int pt, p1, p2 = -1; - if (type != OBJ_COMMIT) - return; + if (type != OBJ_COMMIT) { + if (dry_run) + return -1; + metapack_writer_add_uint32(mw, 0); /* date */ + metapack_writer_add_uint32(mw, 0); /* tree */ + metapack_writer_add_uint32(mw, 0); /* 1st parent */ + metapack_writer_add_uint32(mw, 0); /* 2nd tree */ + return 0; + } c = lookup_commit(sha1); if (!c || parse_commit(c)) die("unable to read commit %s", sha1_to_hex(sha1)); + if (c->buffer) { + free(c->buffer); + c->buffer = NULL; + } + /* * Our fixed-size parent list cannot represent root commits, nor * octopus merges. Just skip those commits, as we can fallback * in those rare cases to reading the actual commit object. */ if (!c->parents || - (c->parents && c->parents->next && c->parents->next->next)) - return; + (c->parents && c->parents->next && c->parents->next->next) || + /* edge commits are out too */ + (pt = find_pack_entry_pos(c->tree->object.sha1, mw->pack)) == -1 || + (p1 = find_pack_entry_pos(c->parents->item->object.sha1, mw->pack)) == -1 || + (c->parents->next && + (p2 = find_pack_entry_pos(c->parents->next->item->object.sha1, mw->pack)) == -1) || + /* + * we set the 2nd parent the same as 1st parent as an + * indication that 2nd parent does not exist. Normal + * commits should never have two same parents, but just in + * case.. + */ + p1 == p2 || + /* zero date is reserved to say this is not a valid entry */ + c->date == 0) { + if (dry_run) + return -1; + metapack_writer_add_uint32(mw, 0); /* date */ + metapack_writer_add_uint32(mw, 0); /* tree */ + metapack_writer_add_uint32(mw, 0); /* 1st parent */ + metapack_writer_add_uint32(mw, 0); /* 2nd tree */ + return 0; + } + + if (dry_run) + return 0; + + revindex = get_revindex(mw->pack); - *tail = &commit_list_insert(c, *tail)->next; + metapack_writer_add_uint32(mw, c->date); + ridx = find_pack_revindex(mw->pack, + nth_packed_object_offset(mw->pack, pt)); + metapack_writer_add_uint32(mw, ridx - revindex); + ridx = find_pack_revindex(mw->pack, + nth_packed_object_offset(mw->pack, p1)); + metapack_writer_add_uint32(mw, ridx - revindex); + if (p2 != -1) + ridx = find_pack_revindex(mw->pack, + nth_packed_object_offset(mw->pack, p2)); + metapack_writer_add_uint32(mw, ridx - revindex); + return 0; } void commit_metapack_write(const char *idx) { struct metapack_writer mw; - struct commit_list *commits = NULL, *p; - struct commit_list **tail = &commits; - uint32_t nr = 0; + uint32_t i, first = 0xffffffff, last = 0; + struct revindex_entry *revidx; metapack_writer_init(&mw, idx, "commits", 1); - /* Figure out how many eligible commits we've got in this pack. */ - metapack_writer_foreach(&mw, get_commits, &tail); - for (p = commits; p; p = p->next) - nr++; - metapack_writer_add_uint32(&mw, nr); + packed_git = mw.pack; + revidx = get_revindex(mw.pack); - /* Then write an index of commit sha1s */ - for (p = commits; p; p = p->next) - metapack_writer_add(&mw, p->item->object.sha1, 20); + /* + * Figure out how many eligible commits we've got in this pack. + */ + for (i = 0; i < mw.pack->num_objects; i++) { + int ret = get_commits(&mw, revidx[i].nr, 1); + if (ret == -1) /* not cached */ + continue; + if (i < first) + first = i; + if (i > last) + last = i; + } + + metapack_writer_add_uint32(&mw, first); + metapack_writer_add_uint32(&mw, last); /* Followed by the actual date/tree/parents data */ - for (p = commits; p; p = p->next) { - struct commit *c = p->item; - metapack_writer_add_uint32(&mw, c->date); - metapack_writer_add(&mw, c->tree->object.sha1, 20); - metapack_writer_add(&mw, c->parents->item->object.sha1, 20); - metapack_writer_add(&mw, - c->parents->next ? - c->parents->next->item->object.sha1 : - null_sha1, 20); - } + for (i = first; i <= last; i++) + get_commits(&mw, revidx[i].nr, 0); metapack_writer_finish(&mw); } diff --git a/commit-metapack.h b/commit-metapack.h index 4684573..caf85be 100644 --- a/commit-metapack.h +++ b/commit-metapack.h @@ -3,9 +3,9 @@ int commit_metapack(unsigned char *sha1, uint32_t *timestamp, - unsigned char **tree, - unsigned char **parent1, - unsigned char **parent2); + const unsigned char **tree, + const unsigned char **parent1, + const unsigned char **parent2); void commit_metapack_write(const char *idx_file); diff --git a/pack-revindex.c b/pack-revindex.c index 77a0465..d58dd02 100644 --- a/pack-revindex.c +++ b/pack-revindex.c @@ -111,12 +111,10 @@ static void create_pack_revindex(struct pack_revindex *rix) qsort(rix->revindex, num_ent, sizeof(*rix->revindex), cmp_offset); } -struct revindex_entry *find_pack_revindex(struct packed_git *p, off_t ofs) +struct revindex_entry *get_revindex(struct packed_git *p) { int num; - int lo, hi; struct pack_revindex *rix; - struct revindex_entry *revindex; if (!pack_revindex_hashsz) init_pack_revindex(); @@ -127,7 +125,13 @@ struct revindex_entry *find_pack_revindex(struct packed_git *p, off_t ofs) rix = &pack_revindex[num]; if (!rix->revindex) create_pack_revindex(rix); - revindex = rix->revindex; + return rix->revindex; +} + +struct revindex_entry *find_pack_revindex(struct packed_git *p, off_t ofs) +{ + int lo, hi; + struct revindex_entry *revindex = get_revindex(p); lo = 0; hi = p->num_objects + 1; diff --git a/pack-revindex.h b/pack-revindex.h index 8d5027a..cea85db 100644 --- a/pack-revindex.h +++ b/pack-revindex.h @@ -6,6 +6,7 @@ struct revindex_entry { unsigned int nr; }; +struct revindex_entry *get_revindex(struct packed_git *p); struct revindex_entry *find_pack_revindex(struct packed_git *p, off_t ofs); void discard_revindex(void); diff --git a/sha1_file.c b/sha1_file.c index 40b2329..1acab8c 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -1978,8 +1978,8 @@ off_t nth_packed_object_offset(const struct packed_git *p, uint32_t n) } } -off_t find_pack_entry_one(const unsigned char *sha1, - struct packed_git *p) +int find_pack_entry_pos(const unsigned char *sha1, + struct packed_git *p) { const uint32_t *level1_ofs = p->index_data; const unsigned char *index = p->index_data; @@ -1992,7 +1992,7 @@ off_t find_pack_entry_one(const unsigned char *sha1, if (!index) { if (open_pack_index(p)) - return 0; + return -1; level1_ofs = p->index_data; index = p->index_data; } @@ -2019,9 +2019,7 @@ off_t find_pack_entry_one(const unsigned char *sha1, if (use_lookup) { int pos = sha1_entry_pos(index, stride, 0, lo, hi, p->num_objects, sha1); - if (pos < 0) - return 0; - return nth_packed_object_offset(p, pos); + return pos; } do { @@ -2032,15 +2030,24 @@ off_t find_pack_entry_one(const unsigned char *sha1, printf("lo %u hi %u rg %u mi %u\n", lo, hi, hi - lo, mi); if (!cmp) - return nth_packed_object_offset(p, mi); + return mi; if (cmp > 0) hi = mi; else lo = mi+1; } while (lo < hi); - return 0; + return -1; } +off_t find_pack_entry_one(const unsigned char *sha1, + struct packed_git *p) +{ + int pos = find_pack_entry_pos(sha1, p); + if (pos < 0) + return 0; + else + return nth_packed_object_offset(p, pos); +} int is_pack_valid(struct packed_git *p) { /* An already open pack is known to be valid. */ -- 1.8.1.1.459.g5970e58 -- 8< -- -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html