This goes as follows: - Tree reference: either variable length encoding of the index into the SHA1 table or the literal SHA1 prefixed by 0 (see add_sha1_ref()). - Parent count: variable length encoding of the number of parents. This is normally going to occupy a single byte but doesn't have to. - List of parent references: a list of add_sha1_ref() encoded references, or nothing if the parent count was zero. - Author reference: variable length encoding of an index into the author string dictionary table which also covers the time zone. To make the overall encoding efficient, the author table is already sorted by usage frequency so the most used names are first and require the shortest index encoding. - Author time stamp: variable length encoded. Year 2038 ready! - Committer reference: same as author reference. - Committer time stamp: same as author time stamp. The remainder of the canonical commit object content is then zlib compressed and appended to the above. Rationale: The most important commit object data is densely encoded while requiring no zlib inflate processing, and all SHA1 references are most likely to be direct indices into the pack index file requiring no SHA1 search into the pack index file. Signed-off-by: Nicolas Pitre <nico@xxxxxxxxxxx> --- packv4-create.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/packv4-create.c b/packv4-create.c index bf33d15..cedbbd9 100644 --- a/packv4-create.c +++ b/packv4-create.c @@ -13,6 +13,9 @@ #include "tree-walk.h" #include "pack.h" + +static int pack_compression_level = Z_DEFAULT_COMPRESSION; + struct data_entry { unsigned offset; unsigned size; @@ -289,6 +292,122 @@ static unsigned char *add_sha1_ref(unsigned char *dst, const unsigned char *sha1 return dst + 20; } +/* + * This converts a canonical commit object buffer into its + * tightly packed representation using the already populated + * and sorted commit_name_table dictionary. The parsing is + * strict so to ensure the canonical version may always be + * regenerated and produce the same hash. + */ +void * conv_to_dict_commit(void *buffer, unsigned long *psize) +{ + unsigned long size = *psize; + char *in, *tail, *end; + unsigned char *out; + unsigned char sha1[20]; + int nb_parents, index, tz_val; + unsigned long time; + z_stream stream; + int status; + + /* + * It is guaranteed that the output is always going to be smaller + * than the input. We could even do this conversion in place. + */ + in = buffer; + tail = in + size; + buffer = xmalloc(size); + out = buffer; + + /* parse the "tree" line */ + if (in + 46 >= tail || memcmp(in, "tree ", 5) || in[45] != '\n') + goto bad_data; + if (get_sha1_hex(in + 5, sha1) < 0) + goto bad_data; + in += 46; + out = add_sha1_ref(out, sha1); + + /* count how many "parent" lines */ + nb_parents = 0; + while (in + 48 < tail && !memcmp(in, "parent ", 7) && in[47] == '\n') { + nb_parents++; + in += 48; + } + out = add_number(out, nb_parents); + + /* rewind and parse the "parent" lines */ + in -= 48 * nb_parents; + while (nb_parents--) { + if (get_sha1_hex(in + 7, sha1)) + goto bad_data; + out = add_sha1_ref(out, sha1); + in += 48; + } + + /* parse the "author" line */ + /* it must be at least "author x <x> 0 +0000\n" i.e. 21 chars */ + if (in + 21 >= tail || memcmp(in, "author ", 7)) + goto bad_data; + in += 7; + end = get_nameend_and_tz(in, &tz_val); + if (!end) + goto bad_data; + index = dict_add_entry(commit_name_table, tz_val, in, end - in); + if (index < 0) + goto bad_dict; + out = add_number(out, index); + time = strtoul(end, &end, 10); + if (!end || end[0] != ' ' || end[6] != '\n') + goto bad_data; + out = add_number(out, time); + in = end + 7; + + /* parse the "committer" line */ + /* it must be at least "committer x <x> 0 +0000\n" i.e. 24 chars */ + if (in + 24 >= tail || memcmp(in, "committer ", 7)) + goto bad_data; + in += 10; + end = get_nameend_and_tz(in, &tz_val); + if (!end) + goto bad_data; + index = dict_add_entry(commit_name_table, tz_val, in, end - in); + if (index < 0) + goto bad_dict; + out = add_number(out, index); + time = strtoul(end, &end, 10); + if (!end || end[0] != ' ' || end[6] != '\n') + goto bad_data; + out = add_number(out, time); + in = end + 7; + + /* finally, deflate the remaining data */ + memset(&stream, 0, sizeof(stream)); + deflateInit(&stream, pack_compression_level); + stream.next_in = (unsigned char *)in; + stream.avail_in = tail - in; + stream.next_out = (unsigned char *)out; + stream.avail_out = size - (out - (unsigned char *)buffer); + status = deflate(&stream, Z_FINISH); + end = (char *)stream.next_out; + deflateEnd(&stream); + if (status != Z_STREAM_END) { + error("deflate error status %d", status); + goto bad; + } + + *psize = end - (char *)buffer; + return buffer; + +bad_data: + error("bad commit data"); + goto bad; +bad_dict: + error("bad dict entry"); +bad: + free(buffer); + return NULL; +} + static struct pack_idx_entry *get_packed_object_list(struct packed_git *p) { unsigned i, nr_objects = p->num_objects; -- 1.8.4.22.g54757b7 -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html