This goes as follows: - Number of tree entries: variable length encoded. Then for each tree entry: - Path component reference: variable length encoded index into the path string dictionary table which also covers the entry mode. To make the overall encoding efficient, the author table is already sorted by usage frequency so the most used names are first and require the shortest index encoding. - SHA1 reference: either variable length encoding of the index into the SHA1 table or the literal SHA1 prefixed by 0 (see add_sha1_ref()). Rationale: all the tree object data is densely encoded while requiring no zlib inflate processing, and all SHA1 references are most likely to be direct indices into the pack index file requiring no SHA1 search. Path filtering can be accomplished on the path index directly without any string comparison during the tree traversal. Still lacking is some kind of delta encoding for multiple tree objects with only small differences between them. But that'll come later. Signed-off-by: Nicolas Pitre <nico@xxxxxxxxxxx> --- packv4-create.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/packv4-create.c b/packv4-create.c index cedbbd9..f46fbd9 100644 --- a/packv4-create.c +++ b/packv4-create.c @@ -408,6 +408,69 @@ bad: return NULL; } +/* + * This converts a canonical tree object buffer into its + * tightly packed representation using the already populated + * and sorted tree_path_table dictionary. The parsing is + * strict so to ensure the canonical version may always be + * regenerated and produce the same hash. + */ +void * conv_to_dict_tree(void *buffer, unsigned long *psize) +{ + unsigned long size = *psize; + unsigned char *in, *out, *end; + struct tree_desc desc; + struct name_entry name_entry; + int nb_entries; + + if (!size) + return NULL; + + /* + * We can't make sure the result will always be smaller. + * The smallest possible entry is "0 x\0<40 byte SHA1>" + * or 44 bytes. The output entry may have a realistic path index + * encoding using up to 3 bytes, and a non indexable SHA1 meaning + * 41 bytes. And the output data already has the and nb_entries + * headers. In practice the output size will be significantly + * smaller but for now let's make it simple. + */ + in = buffer; + out = xmalloc(size); + end = out + size; + buffer = out; + + /* let's count how many entries there are */ + init_tree_desc(&desc, in, size); + nb_entries = 0; + while (tree_entry(&desc, &name_entry)) + nb_entries++; + out = add_number(out, nb_entries); + + init_tree_desc(&desc, in, size); + while (tree_entry(&desc, &name_entry)) { + int pathlen = tree_entry_len(&name_entry); + int index = dict_add_entry(tree_path_table, name_entry.mode, + name_entry.path, pathlen); + if (index < 0) { + error("missing tree dict entry"); + free(buffer); + return NULL; + } + if (end - out < 45) { + unsigned long sofar = out - (unsigned char *)buffer; + buffer = xrealloc(buffer, sofar + 45); + out = (unsigned char *)buffer + sofar; + end = out + 45; + } + out = add_number(out, index); + out = add_sha1_ref(out, name_entry.sha1); + } + + *psize = out - (unsigned char *)buffer; + return buffer; +} + static struct pack_idx_entry *get_packed_object_list(struct packed_git *p) { unsigned i, nr_objects = p->num_objects; -- 1.8.4.22.g54757b7 -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html