From: Derrick Stolee <stolee@xxxxxxxxx>

In order to more easily compute delta bases among objects that appear
at the exact same path, add a --path-walk option to 'git pack-objects'.

This option will use the path-walk API instead of the object walk given
by the revision machinery. Since objects will be provided in batches
representing a common path, those objects can be tested for delta bases
immediately instead of waiting for a sort of the full object list by
name-hash. This has multiple benefits, including avoiding collisions by
name-hash.

The objects marked as UNINTERESTING are included in these batches, so
we are guaranteeing some locality to find good delta bases.

After the individual passes are done on a per-path basis, the default
name-hash is used to find other opportunistic delta bases that did not
match exactly by the full path name.

RFC TODO: It is important to note that this option is inherently
incompatible with using a bitmap index. This walk probably also does
not work with other advanced features, such as delta islands.

Getting ahead of myself, this option compares well with --full-name-hash
when the packfile is large enough, but also performs at least as well as
the default in all cases that I've seen.

RFC TODO: this should probably be recording the batch locations to
another list so they could be processed in a second phase using threads.

RFC TODO: list some examples of how this outperforms previous
pack-objects strategies. (This is coming in later commits that include
performance test changes.)

Signed-off-by: Derrick Stolee <stolee@xxxxxxxxx>
---
 builtin/pack-objects.c | 209 ++++++++++++++++++++++++++++++++++-------
 path-walk.c            |   2 +-
 2 files changed, 177 insertions(+), 34 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 778be80f564..3d0bb33427d 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -39,6 +39,9 @@
 #include "promisor-remote.h"
 #include "pack-mtimes.h"
 #include "parse-options.h"
+#include "blob.h"
+#include "tree.h"
+#include "path-walk.h"
 
 /*
  * Objects we are going to pack are collected in the `to_pack` structure.
@@ -215,6 +218,7 @@ static int delta_search_threads;
 static int pack_to_stdout;
 static int sparse;
 static int thin;
+static int path_walk;
 static int num_preferred_base;
 static struct progress *progress_state;
 
@@ -3139,6 +3143,38 @@ static int add_ref_tag(const char *tag UNUSED, const char *referent UNUSED, cons
 	return 0;
 }
 
+static int should_attempt_deltas(struct object_entry *entry)
+{
+	if (DELTA(entry))
+		/* This happens if we decided to reuse existing
+		 * delta from a pack. "reuse_delta &&" is implied.
+		 */
+		return 0;
+
+	if (!entry->type_valid ||
+	    oe_size_less_than(&to_pack, entry, 50))
+		return 0;
+
+	if (entry->no_try_delta)
+		return 0;
+
+	if (!entry->preferred_base) {
+		if (oe_type(entry) < 0)
+			die(_("unable to get type of object %s"),
+			    oid_to_hex(&entry->idx.oid));
+	} else {
+		if (oe_type(entry) < 0) {
+			/*
+			 * This object is not found, but we
+			 * don't have to include it anyway.
+			 */
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
 static void prepare_pack(int window, int depth)
 {
 	struct object_entry **delta_list;
@@ -3169,33 +3205,11 @@ static void prepare_pack(int window, int depth)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		struct object_entry *entry = to_pack.objects + i;
 
-		if (DELTA(entry))
-			/* This happens if we decided to reuse existing
-			 * delta from a pack. "reuse_delta &&" is implied.
-			 */
+		if (!should_attempt_deltas(entry))
 			continue;
 
-		if (!entry->type_valid ||
-		    oe_size_less_than(&to_pack, entry, 50))
-			continue;
-
-		if (entry->no_try_delta)
-			continue;
-
-		if (!entry->preferred_base) {
+		if (!entry->preferred_base)
 			nr_deltas++;
-			if (oe_type(entry) < 0)
-				die(_("unable to get type of object %s"),
-				    oid_to_hex(&entry->idx.oid));
-		} else {
-			if (oe_type(entry) < 0) {
-				/*
-				 * This object is not found, but we
-				 * don't have to include it anyway.
-				 */
-				continue;
-			}
-		}
 
 		delta_list[n++] = entry;
 	}
@@ -4110,6 +4124,117 @@ static void mark_bitmap_preferred_tips(void)
 	}
 }
 
+static inline int is_oid_interesting(struct repository *repo,
+				     struct object_id *oid,
+				     enum object_type type)
+{
+	if (type == OBJ_TAG) {
+		struct tag *t = lookup_tag(repo, oid);
+		return t && !(t->object.flags & UNINTERESTING);
+	}
+
+	if (type == OBJ_COMMIT) {
+		struct commit *c = lookup_commit(repo, oid);
+		return c && !(c->object.flags & UNINTERESTING);
+	}
+
+	if (type == OBJ_TREE) {
+		struct tree *t = lookup_tree(repo, oid);
+		return t && !(t->object.flags & UNINTERESTING);
+	}
+
+	if (type == OBJ_BLOB) {
+		struct blob *b = lookup_blob(repo, oid);
+		return b && !(b->object.flags & UNINTERESTING);
+	}
+
+	return 0;
+}
+
+static int add_objects_by_path(const char *path,
+			       struct oid_array *oids,
+			       enum object_type type,
+			       void *data)
+{
+	struct object_entry **delta_list;
+	size_t oe_start = to_pack.nr_objects;
+	size_t oe_end;
+	unsigned int sub_list_size;
+	unsigned int *processed = data;
+
+	/*
+	 * First, add all objects to the packing data, including the ones
+	 * marked UNINTERESTING (translated to 'exclude') as they can be
+	 * used as delta bases.
+	 */
+	for (size_t i = 0; i < oids->nr; i++) {
+		struct object_id *oid = &oids->oid[i];
+		int exclude = !is_oid_interesting(the_repository, oid, type);
+		add_object_entry(oid, type, path, exclude);
+	}
+
+	oe_end = to_pack.nr_objects;
+
+	/* We can skip delta calculations if it is a no-op. */
+	if (oe_end == oe_start || !window)
+		return 0;
+
+	sub_list_size = 0;
+	ALLOC_ARRAY(delta_list, oe_end - oe_start);
+
+	for (size_t i = 0; i < oe_end - oe_start; i++) {
+		struct object_entry *entry = to_pack.objects + oe_start + i;
+
+		if (!should_attempt_deltas(entry))
+			continue;
+
+		delta_list[sub_list_size++] = entry;
+	}
+
+	/*
+	 * Find delta bases among this list of objects that all match the same
+	 * path. This causes the delta compression to be interleaved in the
+	 * object walk, which can lead to confusing progress indicators. This is
+	 * also incompatible with threaded delta calculations. In the future,
+	 * consider creating a list of regions in the full to_pack.objects array
+	 * that could be picked up by the threaded delta computation.
+	 */
+	if (sub_list_size && window) {
+		QSORT(delta_list, sub_list_size, type_size_sort);
+		find_deltas(delta_list, &sub_list_size, window, depth, processed);
+	}
+
+	free(delta_list);
+	return 0;
+}
+
+static void get_object_list_path_walk(struct rev_info *revs)
+{
+	struct path_walk_info info = PATH_WALK_INFO_INIT;
+	unsigned int processed = 0;
+
+	info.revs = revs;
+
+	info.revs->tag_objects = 1;
+	info.tags = 1;
+	info.commits = 1;
+	info.trees = 1;
+	info.blobs = 1;
+	info.path_fn = add_objects_by_path;
+	info.path_fn_data = &processed;
+
+	/*
+	 * Allow the --[no-]sparse option to be interesting here, if only
+	 * for testing purposes. Paths with no interesting objects will not
+	 * contribute to the resulting pack, but only create noisy preferred
+	 * base objects.
+	 */
+	info.prune_all_uninteresting = sparse;
+
+	if (walk_objects_by_path(&info))
+		die(_("failed to pack objects via path-walk"));
+}
+
 static void get_object_list(struct rev_info *revs, int ac, const char **av)
 {
 	struct setup_revision_opt s_r_opt = {
@@ -4156,7 +4281,7 @@ static void get_object_list(struct rev_info *revs, int ac, const char **av)
 
 	warn_on_object_refname_ambiguity = save_warning;
 
-	if (use_bitmap_index && !get_object_list_from_bitmap(revs))
+	if (use_bitmap_index && !path_walk && !get_object_list_from_bitmap(revs))
 		return;
 
 	if (use_delta_islands)
@@ -4165,15 +4290,19 @@ static void get_object_list(struct rev_info *revs, int ac, const char **av)
 	if (write_bitmap_index)
 		mark_bitmap_preferred_tips();
 
-	if (prepare_revision_walk(revs))
-		die(_("revision walk setup failed"));
-	mark_edges_uninteresting(revs, show_edge, sparse);
-
 	if (!fn_show_object)
 		fn_show_object = show_object;
-	traverse_commit_list(revs,
-			     show_commit, fn_show_object,
-			     NULL);
+
+	if (path_walk) {
+		get_object_list_path_walk(revs);
+	} else {
+		if (prepare_revision_walk(revs))
+			die(_("revision walk setup failed"));
+		mark_edges_uninteresting(revs, show_edge, sparse);
+		traverse_commit_list(revs,
+				     show_commit, fn_show_object,
+				     NULL);
+	}
 
 	if (unpack_unreachable_expiration) {
 		revs->ignore_missing_links = 1;
@@ -4368,6 +4497,8 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 			 N_("use the sparse reachability algorithm")),
 		OPT_BOOL(0, "thin", &thin,
 			 N_("create thin packs")),
+		OPT_BOOL(0, "path-walk", &path_walk,
+			 N_("use the path-walk API to walk objects when possible")),
 		OPT_BOOL(0, "shallow", &shallow,
 			 N_("create packs suitable for shallow fetches")),
 		OPT_BOOL(0, "honor-pack-keep", &ignore_packed_keep_on_disk,
@@ -4448,7 +4579,19 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 		window = 0;
 
 	strvec_push(&rp, "pack-objects");
-	if (thin) {
+
+	if (path_walk && filter_options.choice) {
+		warning(_("cannot use --filter with --path-walk"));
+		path_walk = 0;
+	}
+	if (path_walk) {
+		strvec_push(&rp, "--boundary");
+		/*
+		 * We must disable the bitmaps because we are removing
+		 * the --objects / --objects-edge[-aggressive] options.
+		 */
+		use_bitmap_index = 0;
+	} else if (thin) {
 		use_internal_rev_list = 1;
 		strvec_push(&rp, shallow
 				? "--objects-edge-aggressive"
diff --git a/path-walk.c b/path-walk.c
index 08de29614f7..9391e0579ae 100644
--- a/path-walk.c
+++ b/path-walk.c
@@ -306,7 +306,7 @@ int walk_objects_by_path(struct path_walk_info *info)
 
 	/* Track all commits. */
 	if (info->commits)
-		ret = info->path_fn("", &commit_list->oids, OBJ_COMMIT,
+		ret = info->path_fn("initial", &commit_list->oids, OBJ_COMMIT,
 				    info->path_fn_data);
 	oid_array_clear(&commit_list->oids);
 	free(commit_list);
-- 
gitgitgadget
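
For reviewers who want to poke at the option locally, a rough comparison
might look like the sketch below. This is illustrative only: it assumes a
git built with this patch applied, and the output file names are arbitrary.

    # Pack the objects reachable from HEAD with and without the new
    # option, then compare the resulting pack sizes.
    git rev-parse HEAD | git pack-objects --revs --stdout >default.pack
    git rev-parse HEAD | git pack-objects --revs --path-walk --stdout >path-walk.pack
    ls -l default.pack path-walk.pack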