[Not sure it is legal to do this. I hope it's legal. It would be great if it were legal. But I really doubt it's legal. FWIW, at least the test suite still passes, even with GIT_TEST_COMMIT_GRAPH=1. Or we have a hole in test coverage...] Our revision walk machinery knows a thing or two about how to traverse the history in an order that is friendlier to the delta base cache. The hand-rolled revision walk in commit-graph.c:close_reachable() is unaware of any such tricks: it is just a simple loop iterating over an array of commits, appending unseen parents to the end. This didn't really matter in the past, because we only accessed commit objects while writing the commit-graph. Now, however, we have modified path Bloom filters, and to fill them we need access to tree objects as well to gather all modified paths, which puts more strain on the caches, and the order in which we traverse the commits becomes relevant. So let's use the regular revision walk machinery (i.e. 'struct rev_info', setup_revisions(), get_revision() and friends) for '--reachable' to speed up writing a commit-graph with modified path Bloom filters. However, stick to the current algorithm when scanning pack files in commits, because passing all packed commits to setup_revisions() might do more harm than good (presumably, I didn't check; scanning all packs for commits can be hopelessly inefficient anyway). '--stdin-commits', too, could benefit from using the revision walk machinery when it's fed with branch tips from 'git for-each-ref', or fare worse when it's fed all commits from 'git rev-list' (didn't check this, either)... so let's just leave it using the current algorithm for now. The table below shows the reduced runtime and max RSS of writing a commit-graph file from scratch with '--reachable' and modified path Bloom filters, i.e. that of: git -c core.modifiedPathBloomFilters=1 commit-graph write --reachable Runtime Max RSS before after before after -------------------------------------------------------------------------- android-base 57.722s 40.113s -30.5% 711804k 697624k -2.0% cmssw 27.977s 25.154s -10.1% 980884k 786196k -19.9% cpython 10.161s 8.956s -11.9% 331736k 328704k -0.9% gcc 114.980s 36.975s -67.9% 561928k 484212k -13.8% gecko-dev 132.557s 97.314s -26.6% 2203544k 2161480k -1.9% git 8.097s 5.215s -35.6% 240564k 189652k -21.2% glibc 4.693s 3.996s -14.9% 215924k 207572k -3.9% homebrew-core 56.661s 56.384s -0.5% 469668k 475596k +1.3% jdk 20.355s 18.936s -7.0% 327244k 322316k -1.5% linux 215.235s 98.991s -54.0% 1549852k 1445028k -6.8% llvm-project 48.659s 30.933s -36.4% 783740k 736988k -6.0% rails 5.804s 5.389s -7.2% 284524k 284528k 0% rust 20.151s 12.921s -35.9% 687840k 664104k -3.5% webkit 31.226s 30.341s -2.8% 2018396k 1902884k -5.7% --- commit-graph.c | 110 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 94 insertions(+), 16 deletions(-) diff --git a/commit-graph.c b/commit-graph.c index 4cbfce61bf..1677e4fb3e 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -1703,6 +1703,9 @@ static void create_modified_path_bloom_filter( * (a commit can be present in multiple pack files, or multiple * refs can point to the same commit) so check first whether we have * already created a modified path Bloom filter for it. + * + * TODO: with '--reachable' we can now be sure that we only look at + * each commit only once, so this is unnecessary in that code path. */ bfi = modified_path_bloom_filters_peek(&modified_path_bloom_filters, commit); @@ -1891,16 +1894,6 @@ static void compute_generation_numbers(struct write_commit_graph_context *ctx) stop_progress(&ctx->progress); } -static int add_ref_to_list(const char *refname, - const struct object_id *oid, - int flags, void *cb_data) -{ - struct string_list *list = (struct string_list *)cb_data; - - string_list_append(list, oid_to_hex(oid)); - return 0; -} - static int fill_oids_from_packs(struct write_commit_graph_context *ctx, struct string_list *pack_indexes) { @@ -2773,14 +2766,99 @@ int write_commit_graph_reachable(const char *obj_dir, enum commit_graph_write_flags flags, const struct split_commit_graph_opts *split_opts) { - struct string_list list = STRING_LIST_INIT_DUP; - int result; + struct write_commit_graph_context *ctx; + struct rev_info revs; + const char *revs_argv[] = { NULL, "--all", NULL }; + struct commit *commit; + int result = 0; + + ctx = init_write_commit_graph_context(obj_dir, flags, split_opts); + if (!ctx) + return 0; + + /* + * Some git commands write a commit-graph in-process at the end, + * prevent their revision walk to interfere with ours. + */ + clear_object_flags(SEEN | ADDED | SHOWN | TOPO_WALK_EXPLORED | + TOPO_WALK_INDEGREE | UNINTERESTING); + + init_revisions(&revs, NULL); + setup_revisions(ARRAY_SIZE(revs_argv) - 1, revs_argv, &revs, NULL); + if (prepare_revision_walk(&revs)) { + error(_("revision walk setup failed")); + result = -1; + goto cleanup; + } + + if (ctx->report_progress) + ctx->progress = start_delayed_progress( + _("Gathering commit info"), + 0); + while ((commit = get_revision(&revs))) { + display_progress(ctx->progress, ctx->oids.nr + 1); + ALLOC_GROW(ctx->oids.list, ctx->oids.nr + 1, ctx->oids.alloc); + oidcpy(&ctx->oids.list[ctx->oids.nr], &commit->object.oid); + ctx->oids.nr++; + + if (!commit->parents) { + create_modified_path_bloom_filter(&ctx->mpbfctx, + commit, 0); + } else { + struct commit_list *parent; + int nth_parent; + + for (parent = commit->parents, nth_parent = 0; + parent; + parent = parent->next, nth_parent++) + create_modified_path_bloom_filter(&ctx->mpbfctx, + commit, nth_parent); + } + } + stop_progress(&ctx->progress); + + if (ctx->oids.nr >= GRAPH_EDGE_LAST_MASK) { + error(_("the commit graph format cannot write %d commits"), ctx->oids.nr); + result = -1; + goto cleanup; + } - for_each_ref(add_ref_to_list, &list); - result = write_commit_graph(obj_dir, NULL, &list, - flags, split_opts); + QSORT(ctx->oids.list, ctx->oids.nr, oid_compare); + + /* + * TODO: the revision walking loop above could fill ctx->commits + * straight away, counting extra edges as well. + */ + ctx->commits.alloc = ctx->oids.nr; + ALLOC_ARRAY(ctx->commits.list, ctx->commits.alloc); + + /* + * Despite its unassuming name, this function removes duplicate + * commits/oids and, worse, it does counts extra edges as well. + */ + copy_oids_to_commits(ctx); + + if (!ctx->commits.nr) + goto cleanup; + + if (ctx->split) { + split_graph_merge_strategy(ctx); - string_list_clear(&list, 0); + merge_commit_graphs(ctx); + } else + ctx->num_commit_graphs_after = 1; + + compute_generation_numbers(ctx); + + result = write_commit_graph_file(ctx); + + if (ctx->split) + mark_commit_graphs(ctx); + + expire_commit_graphs(ctx); + +cleanup: + free_write_commit_graph_context(ctx); return result; } -- 2.27.0.rc1.431.g5c813f95dc