On Wed, Jan 30, 2013 at 09:16:29PM +0700, Duy Nguyen wrote: > Perhaps we could store abbrev sha-1 instead of full sha-1. Nice > space/time trade-off. Following the on-disk format experiment yesterday, I changed the format to: - a list a _short_ SHA-1 of cached commits - a list of cache entries, each (5 uint32_t) consists of: - uint32_t for the index in .idx sha-1 table to get full SHA-1 of the commit - uint32_t for timestamp - uint32_t for tree, 1st and 2nd parents for the index in .idx table The length of SHA-1 is chosen to be able to unambiguously identify any cached commits. Full SHA-1 check is done after to catch false positives. For linux-2.6, SHA-1 length is 6 bytes, git and many moderate-sized projects are 4 bytes. So it's 26 bytes per commit for linux-2.6.git, or 8MB .commits file. Not as good as revindex approach (5.5MB) but way better than the current one (27MB). And still good enough for caching 2 parents unconditionally. Performance seems to improve a tiny bit, probably because of more compact search space: 0.6s with my patch vs 0.7s without (vs 3.4s without cache). -- 8< -- diff --git a/cache.h b/cache.h index 1f96f65..8048d5b 100644 --- a/cache.h +++ b/cache.h @@ -1069,6 +1069,7 @@ extern struct packed_git *add_packed_git(const char *, int, int); extern const unsigned char *nth_packed_object_sha1(struct packed_git *, uint32_t); extern off_t nth_packed_object_offset(const struct packed_git *, uint32_t); extern off_t find_pack_entry_one(const unsigned char *, struct packed_git *); +extern int find_pack_entry_pos(const unsigned char *, struct packed_git *); extern int is_pack_valid(struct packed_git *); extern void *unpack_entry(struct packed_git *, off_t, enum object_type *, unsigned long *); extern unsigned long unpack_object_header_buffer(const unsigned char *buf, unsigned long len, enum object_type *type, unsigned long *sizep); diff --git a/commit-metapack.c b/commit-metapack.c index 2c19f48..c984b8e 100644 --- a/commit-metapack.c +++ b/commit-metapack.c @@ -4,11 +4,21 @@ #include "commit.h" #include "sha1-lookup.h" +struct commit_entry { + uint32_t commit; /* nth_packed_object_sha1 to get own SHA-1 */ + uint32_t timestamp; + uint32_t tree; /* nth_packed_object_sha1 to get tree SHA-1 */ + uint32_t parent1; /* nth_packed_object_sha1 to get 1st parent SHA-1 */ + uint32_t parent2; /* nth_packed_object_sha1 to get 2nd parent SHA-1 */ +}; + struct commit_metapack { struct metapack mp; uint32_t nr; + uint32_t abbrev_len; + struct packed_git *pack; unsigned char *index; - unsigned char *data; + struct commit_entry *data; struct commit_metapack *next; }; static struct commit_metapack *commit_metapacks; @@ -41,20 +51,23 @@ static struct commit_metapack *alloc_commit_metapack(struct packed_git *pack) } memcpy(&it->nr, it->mp.data, 4); it->nr = ntohl(it->nr); + memcpy(&it->abbrev_len, it->mp.data + 4, 4); + it->abbrev_len = ntohl(it->abbrev_len); + it->pack = pack; /* - * We need 84 bytes for each entry: sha1(20), date(4), tree(20), - * parents(40). + * We need 20+abbrev_len bytes for each entry: abbrev sha-1, + * date(4), tree index(4), parent indexes(8). */ - if (it->mp.len < (84 * it->nr + 4)) { + if (it->mp.len < ((sizeof(*it->data) + it->abbrev_len) * it->nr + 8)) { warning("commit metapack for '%s' is truncated", pack->pack_name); metapack_close(&it->mp); free(it); return NULL; } - it->index = it->mp.data + 4; - it->data = it->index + 20 * it->nr; + it->index = it->mp.data + 8; + it->data = (struct commit_entry*)(it->index + it->abbrev_len * it->nr); return it; } @@ -83,29 +96,51 @@ static void prepare_commit_metapacks(void) int commit_metapack(unsigned char *sha1, uint32_t *timestamp, - unsigned char **tree, - unsigned char **parent1, - unsigned char **parent2) + const unsigned char **tree, + const unsigned char **parent1, + const unsigned char **parent2) { struct commit_metapack *p; prepare_commit_metapacks(); for (p = commit_metapacks; p; p = p->next) { - unsigned char *data; - int pos = sha1_entry_pos(p->index, 20, 0, 0, p->nr, p->nr, sha1); + struct commit_entry *data; + uint32_t p1, p2; + unsigned lo, hi, mi; + int pos; + + /* sha1_entry_pos does not work with abbreviated sha-1 */ + lo = 0; + hi = p->nr; + pos = -1; + do { + unsigned mi = (lo + hi) / 2; + int cmp = memcmp(p->index + mi * p->abbrev_len, sha1, p->abbrev_len); + + if (!cmp) { + pos = mi; + break; + } + if (cmp > 0) + hi = mi; + else + lo = mi+1; + } while (lo < hi); if (pos < 0) continue; - /* timestamp(4) + tree(20) + parents(40) */ - data = p->data + 64 * pos; - *timestamp = *(uint32_t *)data; - *timestamp = ntohl(*timestamp); - data += 4; - *tree = data; - data += 20; - *parent1 = data; - data += 20; - *parent2 = data; + data = p->data + pos; + + /* full sha-1 check again */ + if (hashcmp(nth_packed_object_sha1(p->pack, ntohl(data->commit)), sha1)) + continue; + + *timestamp = ntohl(data->timestamp); + *tree = nth_packed_object_sha1(p->pack, ntohl(data->tree)); + p1 = ntohl(data->parent1); + *parent1 = nth_packed_object_sha1(p->pack, p1); + p2 = ntohl(data->parent2); + *parent2 = p1 == p2 ? null_sha1 : nth_packed_object_sha1(p->pack, p2); return 0; } @@ -113,13 +148,20 @@ int commit_metapack(unsigned char *sha1, return -1; } +struct write_cb { + struct commit_list **tail; + int abbrev_len; + const unsigned char *last_sha1; +}; + static void get_commits(struct metapack_writer *mw, const unsigned char *sha1, void *data) { - struct commit_list ***tail = data; + struct write_cb *write_cb = (struct write_cb *)data; enum object_type type = sha1_object_info(sha1, NULL); struct commit *c; + int p1, p2; if (type != OBJ_COMMIT) return; @@ -128,47 +170,95 @@ static void get_commits(struct metapack_writer *mw, if (!c || parse_commit(c)) die("unable to read commit %s", sha1_to_hex(sha1)); + if (c->buffer) { + free(c->buffer); + c->buffer = NULL; + } + /* * Our fixed-size parent list cannot represent root commits, nor * octopus merges. Just skip those commits, as we can fallback * in those rare cases to reading the actual commit object. */ if (!c->parents || - (c->parents && c->parents->next && c->parents->next->next)) + (c->parents && c->parents->next && c->parents->next->next) || + /* edge commits are out too */ + find_pack_entry_pos(c->tree->object.sha1, mw->pack) == -1 || + (p1 = find_pack_entry_pos(c->parents->item->object.sha1, mw->pack)) == -1 || + (c->parents->next && + (p2 = find_pack_entry_pos(c->parents->next->item->object.sha1, mw->pack)) == -1) || + /* + * we set the 2nd parent the same as 1st parent as an + * indication that 2nd parent does not exist. Normal + * commits should never have two same parents, but just in + * case.. + */ + p1 == p2) return; - *tail = &commit_list_insert(c, *tail)->next; + /* + * Make sure we store the abbr sha-1 long enough to + * unambiguously identify any cached commits in the pack. + */ + while (write_cb->abbrev_len < 20 && + write_cb->last_sha1 && + !memcmp(write_cb->last_sha1, sha1, write_cb->abbrev_len)) + write_cb->abbrev_len++; + /* + * A bit sensitive to metapack_writer_foreach. "sha1" must not + * be changed even after this function exits. + */ + write_cb->last_sha1 = sha1; + + write_cb->tail = &commit_list_insert(c, write_cb->tail)->next; } void commit_metapack_write(const char *idx) { struct metapack_writer mw; struct commit_list *commits = NULL, *p; - struct commit_list **tail = &commits; + struct write_cb write_cb; uint32_t nr = 0; metapack_writer_init(&mw, idx, "commits", 1); + write_cb.tail = &commits; + write_cb.abbrev_len = 1; + write_cb.last_sha1 = NULL; + /* Figure out how many eligible commits we've got in this pack. */ - metapack_writer_foreach(&mw, get_commits, &tail); + metapack_writer_foreach(&mw, get_commits, &write_cb); for (p = commits; p; p = p->next) nr++; + metapack_writer_add_uint32(&mw, nr); + metapack_writer_add_uint32(&mw, write_cb.abbrev_len); /* Then write an index of commit sha1s */ for (p = commits; p; p = p->next) - metapack_writer_add(&mw, p->item->object.sha1, 20); + metapack_writer_add(&mw, p->item->object.sha1, write_cb.abbrev_len); /* Followed by the actual date/tree/parents data */ for (p = commits; p; p = p->next) { struct commit *c = p->item; + int pos; + + pos = find_pack_entry_pos(c->object.sha1, mw.pack); + metapack_writer_add_uint32(&mw, pos); + metapack_writer_add_uint32(&mw, c->date); - metapack_writer_add(&mw, c->tree->object.sha1, 20); - metapack_writer_add(&mw, c->parents->item->object.sha1, 20); - metapack_writer_add(&mw, - c->parents->next ? - c->parents->next->item->object.sha1 : - null_sha1, 20); + + pos = find_pack_entry_pos(c->tree->object.sha1, mw.pack); + metapack_writer_add_uint32(&mw, pos); + + pos = find_pack_entry_pos(c->parents->item->object.sha1, mw.pack); + metapack_writer_add_uint32(&mw, pos); + + if (c->parents->next) { + struct object *o = &c->parents->next->item->object; + pos = find_pack_entry_pos(o->sha1, mw.pack); + } + metapack_writer_add_uint32(&mw, pos); } metapack_writer_finish(&mw); diff --git a/commit-metapack.h b/commit-metapack.h index 4684573..caf85be 100644 --- a/commit-metapack.h +++ b/commit-metapack.h @@ -3,9 +3,9 @@ int commit_metapack(unsigned char *sha1, uint32_t *timestamp, - unsigned char **tree, - unsigned char **parent1, - unsigned char **parent2); + const unsigned char **tree, + const unsigned char **parent1, + const unsigned char **parent2); void commit_metapack_write(const char *idx_file); diff --git a/sha1_file.c b/sha1_file.c index 40b2329..1acab8c 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -1978,8 +1978,8 @@ off_t nth_packed_object_offset(const struct packed_git *p, uint32_t n) } } -off_t find_pack_entry_one(const unsigned char *sha1, - struct packed_git *p) +int find_pack_entry_pos(const unsigned char *sha1, + struct packed_git *p) { const uint32_t *level1_ofs = p->index_data; const unsigned char *index = p->index_data; @@ -1992,7 +1992,7 @@ off_t find_pack_entry_one(const unsigned char *sha1, if (!index) { if (open_pack_index(p)) - return 0; + return -1; level1_ofs = p->index_data; index = p->index_data; } @@ -2019,9 +2019,7 @@ off_t find_pack_entry_one(const unsigned char *sha1, if (use_lookup) { int pos = sha1_entry_pos(index, stride, 0, lo, hi, p->num_objects, sha1); - if (pos < 0) - return 0; - return nth_packed_object_offset(p, pos); + return pos; } do { @@ -2032,15 +2030,24 @@ off_t find_pack_entry_one(const unsigned char *sha1, printf("lo %u hi %u rg %u mi %u\n", lo, hi, hi - lo, mi); if (!cmp) - return nth_packed_object_offset(p, mi); + return mi; if (cmp > 0) hi = mi; else lo = mi+1; } while (lo < hi); - return 0; + return -1; } +off_t find_pack_entry_one(const unsigned char *sha1, + struct packed_git *p) +{ + int pos = find_pack_entry_pos(sha1, p); + if (pos < 0) + return 0; + else + return nth_packed_object_offset(p, pos); +} int is_pack_valid(struct packed_git *p) { /* An already open pack is known to be valid. */ -- 8< -- -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html