On 2024.12.02 12:18, Jonathan Tan wrote:
> Commit c08589efdc (index-pack: repack local links into promisor packs,
> 2024-11-01) fixed a bug with what was believed to be a negligible
> decrease in performance [1] [2]. But at $DAYJOB, with at least one repo,
> it was found that the decrease in performance was very significant.
> 
> Looking at the patch, whenever we parse an object in the packfile to
> be indexed, we check the targets of all its outgoing links for its
> existence. However, this could be optimized by first collecting all such
> targets into an oidset (thus deduplicating them) before checking. Teach
> Git to do that.
> 
> On a certain fetch from the aforementioned repo, this improved
> performance from approximately 7 hours to 24m47.815s. This number will
> be further reduced in a subsequent patch.
> 
> [1] https://lore.kernel.org/git/CAG1j3zGiNMbri8rZNaF0w+yP+6OdMz0T8+8_Wgd1R_p1HzVasg@xxxxxxxxxxxxxx/
> [2] https://lore.kernel.org/git/20241105212849.3759572-1-jonathantanmy@xxxxxxxxxx/
> 
> Signed-off-by: Jonathan Tan <jonathantanmy@xxxxxxxxxx>
> ---
>  builtin/index-pack.c | 44 ++++++++++++++++++++++----------------------
>  1 file changed, 22 insertions(+), 22 deletions(-)
> 
> diff --git a/builtin/index-pack.c b/builtin/index-pack.c
> index 95babdc5ea..8e7d14c17e 100644
> --- a/builtin/index-pack.c
> +++ b/builtin/index-pack.c
> @@ -155,11 +155,11 @@ static int input_fd, output_fd;
>  static const char *curr_pack;
>  
>  /*
> - * local_links is guarded by read_mutex, and record_local_links is read-only in
> - * a thread.
> + * outgoing_links is guarded by read_mutex, and record_outgoing_links is
> + * read-only in a thread.
>   */
> -static struct oidset local_links = OIDSET_INIT;
> -static int record_local_links;
> +static struct oidset outgoing_links = OIDSET_INIT;
> +static int record_outgoing_links;
>  
>  static struct thread_local_data *thread_data;
>  static int nr_dispatched;

We're renaming the oidset and flag because our purpose is now more
general. OK.

> @@ -812,18 +812,12 @@ static int check_collison(struct object_entry *entry)
>  	return 0;
>  }
>  
> -static void record_if_local_object(const struct object_id *oid)
> +static void record_outgoing_link(const struct object_id *oid)
>  {
> -	struct object_info info = OBJECT_INFO_INIT;
> -	if (oid_object_info_extended(the_repository, oid, &info, 0))
> -		/* Missing; assume it is a promisor object */
> -		return;
> -	if (info.whence == OI_PACKED && info.u.packed.pack->pack_promisor)
> -		return;
> -	oidset_insert(&local_links, oid);
> +	oidset_insert(&outgoing_links, oid);
>  }

We're now unconditionally recording linked objects, and this logic has
been moved below. Looks good.
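As an aside for anyone reading along: the reason it is safe (and cheap) to
record every outgoing link unconditionally is that oidset dedups on insert,
so the expensive oid_object_info_extended() call later runs once per unique
target rather than once per link. A rough, untested sketch of that property
against git's own oidset.h; the helper and variable names are mine, not part
of the patch:

  #include "git-compat-util.h"
  #include "hash.h"
  #include "oidset.h"

  /* Count distinct targets among `nr` recorded links. */
  static int count_unique_targets(const struct object_id *targets, size_t nr)
  {
  	struct oidset seen = OIDSET_INIT;
  	int unique = 0;
  	size_t i;

  	for (i = 0; i < nr; i++)
  		if (!oidset_insert(&seen, &targets[i]))
  			unique++; /* insert returns 0 only the first time an oid is seen */

  	oidset_clear(&seen);
  	return unique;
  }

So for a pack whose objects link to the same few targets many times over, the
loop in repack_local_links() below only ever sees each target once.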
> -static void do_record_local_links(struct object *obj)
> +static void do_record_outgoing_links(struct object *obj)
>  {
>  	if (obj->type == OBJ_TREE) {
>  		struct tree *tree = (struct tree *)obj;
> @@ -837,16 +831,16 @@ static void do_record_local_links(struct object *obj)
>  			 */
>  			return;
>  		while (tree_entry_gently(&desc, &entry))
> -			record_if_local_object(&entry.oid);
> +			record_outgoing_link(&entry.oid);
>  	} else if (obj->type == OBJ_COMMIT) {
>  		struct commit *commit = (struct commit *) obj;
>  		struct commit_list *parents = commit->parents;
>  
>  		for (; parents; parents = parents->next)
> -			record_if_local_object(&parents->item->object.oid);
> +			record_outgoing_link(&parents->item->object.oid);
>  	} else if (obj->type == OBJ_TAG) {
>  		struct tag *tag = (struct tag *) obj;
> -		record_if_local_object(get_tagged_oid(tag));
> +		record_outgoing_link(get_tagged_oid(tag));
>  	}
>  }
>  
> @@ -896,7 +890,7 @@ static void sha1_object(const void *data, struct object_entry *obj_entry,
>  		free(has_data);
>  	}
>  
> -	if (strict || do_fsck_object || record_local_links) {
> +	if (strict || do_fsck_object || record_outgoing_links) {
>  		read_lock();
>  		if (type == OBJ_BLOB) {
>  			struct blob *blob = lookup_blob(the_repository, oid);
> @@ -928,8 +922,8 @@ static void sha1_object(const void *data, struct object_entry *obj_entry,
>  				die(_("fsck error in packed object"));
>  			if (strict && fsck_walk(obj, NULL, &fsck_options))
>  				die(_("Not all child objects of %s are reachable"), oid_to_hex(&obj->oid));
> -			if (record_local_links)
> -				do_record_local_links(obj);
> +			if (record_outgoing_links)
> +				do_record_outgoing_links(obj);
>  
>  			if (obj->type == OBJ_TREE) {
>  				struct tree *item = (struct tree *) obj;
> @@ -1781,7 +1775,7 @@ static void repack_local_links(void)
>  	struct object_id *oid;
>  	char *base_name;
>  
> -	if (!oidset_size(&local_links))
> +	if (!oidset_size(&outgoing_links))
>  		return;
>  
>  	base_name = mkpathdup("%s/pack/pack", repo_get_object_directory(the_repository));
> @@ -1795,8 +1789,14 @@ static void repack_local_links(void)
>  	if (start_command(&cmd))
>  		die(_("could not start pack-objects to repack local links"));
>  
> -	oidset_iter_init(&local_links, &iter);
> +	oidset_iter_init(&outgoing_links, &iter);
>  	while ((oid = oidset_iter_next(&iter))) {
> +		struct object_info info = OBJECT_INFO_INIT;
> +		if (oid_object_info_extended(the_repository, oid, &info, 0))
> +			/* Missing; assume it is a promisor object */
> +			continue;
> +		if (info.whence == OI_PACKED && info.u.packed.pack->pack_promisor)
> +			continue;
>  		if (write_in_full(cmd.in, oid_to_hex(oid), the_hash_algo->hexsz) < 0 ||
>  		    write_in_full(cmd.in, "\n", 1) < 0)
>  			die(_("failed to feed local object to pack-objects"));

We've moved our logic to skip promisor objects here, after potential
objects have been recorded to the oidset and thus deduped. Now we
`continue` the loop to skip promisor objects, whereas before we had an
early return to avoid recording them in the first place. Seems
straightforward.

> @@ -1899,7 +1899,7 @@ int cmd_index_pack(int argc,
>  	} else if (skip_to_optional_arg(arg, "--keep", &keep_msg)) {
>  		; /* nothing to do */
>  	} else if (skip_to_optional_arg(arg, "--promisor", &promisor_msg)) {
> -		record_local_links = 1;
> +		record_outgoing_links = 1;
>  	} else if (starts_with(arg, "--threads=")) {
>  		char *end;
>  		nr_threads = strtoul(arg+10, &end, 0);
> -- 
> 2.47.0.338.g60cca15819-goog
> 
> 
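One optional thought, feel free to ignore: the two new `continue` cases might
read a touch cleaner pulled out into a small predicate. Untested sketch,
reusing only the calls already present in the patch (the helper name is my
own invention):

  /*
   * Return 1 if this outgoing link does not need to be fed to
   * pack-objects: either the target is missing (assumed to be a
   * promisor object) or it already lives in a promisor pack.
   */
  static int outgoing_link_is_promisor(const struct object_id *oid)
  {
  	struct object_info info = OBJECT_INFO_INIT;

  	if (oid_object_info_extended(the_repository, oid, &info, 0))
  		return 1;
  	return info.whence == OI_PACKED && info.u.packed.pack->pack_promisor;
  }

The loop body in repack_local_links() would then shrink to a single
"if (outgoing_link_is_promisor(oid)) continue;" before the write_in_full()
calls.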