On Mon, Oct 23, 2023 at 06:45:06PM -0400, Taylor Blau wrote: > When using merge-tree often within a repository[^1], it is possible to > generate a relatively large number of loose objects, which can result in > degraded performance, and inode exhaustion in extreme cases. > > Building on the functionality introduced in previous commits, the > bulk-checkin machinery now has support to write arbitrary blob and tree > objects which are small enough to be held in-core. We can use this to > write any blob/tree objects generated by ORT into a separate pack > instead of writing them out individually as loose. > > This functionality is gated behind a new `--write-pack` option to > `merge-tree` that works with the (non-deprecated) `--write-tree` mode. > > The implementation is relatively straightforward. There are two spots > within the ORT mechanism where we call `write_object_file()`, one for > content differences within blobs, and another to assemble any new trees > necessary to construct the merge. In each of those locations, > conditionally replace calls to `write_object_file()` with > `index_blob_bulk_checkin_incore()` or `index_tree_bulk_checkin_incore()` > depending on which kind of object we are writing. > > The only remaining task is to begin and end the transaction necessary to > initialize the bulk-checkin machinery, and move any new pack(s) it > created into the main object store. > > [^1]: Such is the case at GitHub, where we run presumptive "test merges" > on open pull requests to see whether or not we can light up the merge > button green depending on whether or not the presumptive merge was > conflicted. > > This is done in response to a number of user-initiated events, > including viewing an open pull request whose last test merge is stale > with respect to the current base and tip of the pull request. As a > result, merge-tree can be run very frequently on large, active > repositories. 
> > Signed-off-by: Taylor Blau <me@xxxxxxxxxxxx> > --- > Documentation/git-merge-tree.txt | 4 ++ > builtin/merge-tree.c | 5 ++ > merge-ort.c | 42 +++++++++++---- > merge-recursive.h | 1 + > t/t4301-merge-tree-write-tree.sh | 93 ++++++++++++++++++++++++++++++++ > 5 files changed, 136 insertions(+), 9 deletions(-) > > diff --git a/Documentation/git-merge-tree.txt b/Documentation/git-merge-tree.txt > index ffc4fbf7e8..9d37609ef1 100644 > --- a/Documentation/git-merge-tree.txt > +++ b/Documentation/git-merge-tree.txt > @@ -69,6 +69,10 @@ OPTIONS > specify a merge-base for the merge, and specifying multiple bases is > currently not supported. This option is incompatible with `--stdin`. > > +--write-pack:: > + Write any new objects into a separate packfile instead of as > + individual loose objects. > + > [[OUTPUT]] > OUTPUT > ------ > diff --git a/builtin/merge-tree.c b/builtin/merge-tree.c > index a35e0452d6..218442ac9b 100644 > --- a/builtin/merge-tree.c > +++ b/builtin/merge-tree.c > @@ -19,6 +19,7 @@ > #include "tree.h" > #include "config.h" > #include "strvec.h" > +#include "bulk-checkin.h" > > static int line_termination = '\n'; > > @@ -416,6 +417,7 @@ struct merge_tree_options { > int name_only; > int use_stdin; > struct merge_options merge_options; > + int write_pack; > }; > > static int real_merge(struct merge_tree_options *o, > @@ -441,6 +443,7 @@ static int real_merge(struct merge_tree_options *o, > _("not something we can merge")); > > opt.show_rename_progress = 0; > + opt.write_pack = o->write_pack; > > opt.branch1 = branch1; > opt.branch2 = branch2; > @@ -553,6 +556,8 @@ int cmd_merge_tree(int argc, const char **argv, const char *prefix) > N_("specify a merge-base for the merge")), > OPT_STRVEC('X', "strategy-option", &xopts, N_("option=value"), > N_("option for selected merge strategy")), > + OPT_BOOL(0, "write-pack", &o.write_pack, > + N_("write new objects to a pack instead of as loose")), > OPT_END() > }; > > diff --git a/merge-ort.c b/merge-ort.c > 
index 3653725661..523577d71e 100644 > --- a/merge-ort.c > +++ b/merge-ort.c > @@ -48,6 +48,7 @@ > #include "tree.h" > #include "unpack-trees.h" > #include "xdiff-interface.h" > +#include "bulk-checkin.h" > > /* > * We have many arrays of size 3. Whenever we have such an array, the > @@ -2108,10 +2109,19 @@ static int handle_content_merge(struct merge_options *opt, > if ((merge_status < 0) || !result_buf.ptr) > ret = error(_("failed to execute internal merge")); > > - if (!ret && > - write_object_file(result_buf.ptr, result_buf.size, > - OBJ_BLOB, &result->oid)) > - ret = error(_("unable to add %s to database"), path); > + if (!ret) { > + ret = opt->write_pack > + ? index_blob_bulk_checkin_incore(&result->oid, > + result_buf.ptr, > + result_buf.size, > + path, 1) > + : write_object_file(result_buf.ptr, > + result_buf.size, > + OBJ_BLOB, &result->oid); > + if (ret) > + ret = error(_("unable to add %s to database"), > + path); > + } > > free(result_buf.ptr); > if (ret) > @@ -3597,7 +3607,8 @@ static int tree_entry_order(const void *a_, const void *b_) > b->string, strlen(b->string), bmi->result.mode); > } > > -static int write_tree(struct object_id *result_oid, > +static int write_tree(struct merge_options *opt, > + struct object_id *result_oid, > struct string_list *versions, > unsigned int offset, > size_t hash_size) > @@ -3631,8 +3642,14 @@ static int write_tree(struct object_id *result_oid, > } > > /* Write this object file out, and record in result_oid */ > - if (write_object_file(buf.buf, buf.len, OBJ_TREE, result_oid)) > + ret = opt->write_pack > + ? 
index_tree_bulk_checkin_incore(result_oid, > + buf.buf, buf.len, "", 1) > + : write_object_file(buf.buf, buf.len, OBJ_TREE, result_oid); > + > + if (ret) > ret = -1; > + > strbuf_release(&buf); > return ret; > } > @@ -3797,8 +3814,8 @@ static int write_completed_directory(struct merge_options *opt, > */ > dir_info->is_null = 0; > dir_info->result.mode = S_IFDIR; > - if (write_tree(&dir_info->result.oid, &info->versions, offset, > - opt->repo->hash_algo->rawsz) < 0) > + if (write_tree(opt, &dir_info->result.oid, &info->versions, > + offset, opt->repo->hash_algo->rawsz) < 0) > ret = -1; > } > > @@ -4332,9 +4349,13 @@ static int process_entries(struct merge_options *opt, > fflush(stdout); > BUG("dir_metadata accounting completely off; shouldn't happen"); > } > - if (write_tree(result_oid, &dir_metadata.versions, 0, > + if (write_tree(opt, result_oid, &dir_metadata.versions, 0, > opt->repo->hash_algo->rawsz) < 0) > ret = -1; > + > + if (opt->write_pack) > + end_odb_transaction(); > + > cleanup: > string_list_clear(&plist, 0); > string_list_clear(&dir_metadata.versions, 0); > @@ -4878,6 +4899,9 @@ static void merge_start(struct merge_options *opt, struct merge_result *result) > */ > strmap_init(&opt->priv->conflicts); > > + if (opt->write_pack) > + begin_odb_transaction(); > + > trace2_region_leave("merge", "allocate/init", opt->repo); > } > > diff --git a/merge-recursive.h b/merge-recursive.h > index 3d3b3e3c29..5c5ff380a8 100644 > --- a/merge-recursive.h > +++ b/merge-recursive.h > @@ -48,6 +48,7 @@ struct merge_options { > unsigned renormalize : 1; > unsigned record_conflict_msgs_as_headers : 1; > const char *msg_header_prefix; > + unsigned write_pack : 1; > > /* internal fields used by the implementation */ > struct merge_options_internal *priv; > diff --git a/t/t4301-merge-tree-write-tree.sh b/t/t4301-merge-tree-write-tree.sh > index b2c8a43fce..d2a8634523 100755 > --- a/t/t4301-merge-tree-write-tree.sh > +++ b/t/t4301-merge-tree-write-tree.sh > @@ -945,4 +945,97 
@@ test_expect_success 'check the input format when --stdin is passed' ' > test_cmp expect actual > ' > > +packdir=".git/objects/pack" > + > +test_expect_success 'merge-tree can pack its result with --write-pack' ' > + test_when_finished "rm -rf repo" && > + git init repo && > + > + # base has lines [3, 4, 5] > + # - side adds to the beginning, resulting in [1, 2, 3, 4, 5] > + # - other adds to the end, resulting in [3, 4, 5, 6, 7] > + # > + # merging the two should result in a new blob object containing > + # [1, 2, 3, 4, 5, 6, 7], along with a new tree. > + test_commit -C repo base file "$(test_seq 3 5)" && > + git -C repo branch -M main && > + git -C repo checkout -b side main && > + test_commit -C repo side file "$(test_seq 1 5)" && > + git -C repo checkout -b other main && > + test_commit -C repo other file "$(test_seq 3 7)" && > + > + find repo/$packdir -type f -name "pack-*.idx" >packs.before && > + tree="$(git -C repo merge-tree --write-pack \ > + refs/tags/side refs/tags/other)" && > + blob="$(git -C repo rev-parse $tree:file)" && > + find repo/$packdir -type f -name "pack-*.idx" >packs.after && While we do assert that we write a new packfile, we don't assert that none of the newly written objects were also written out as loose objects. Do we want to tighten the checks to verify that? Patrick > + test_must_be_empty packs.before && > + test_line_count = 1 packs.after && > + > + git show-index <$(cat packs.after) >objects && > + test_line_count = 2 objects && > + grep "^[1-9][0-9]* $tree" objects && > + grep "^[1-9][0-9]* $blob" objects > +' > + > +test_expect_success 'merge-tree can write multiple packs with --write-pack' ' > + test_when_finished "rm -rf repo" && > + git init repo && > + ( > + cd repo && > + > + git config pack.packSizeLimit 512 && > + > + test_seq 512 >f && > + > + # "f" contains roughly ~2,000 bytes. > + # > + # Each side ("foo" and "bar") adds a small amount of data at the > + # beginning and end of "base", respectively. 
> + git add f && > + test_tick && > + git commit -m base && > + git branch -M main && > + > + git checkout -b foo main && > + { > + echo foo && cat f > + } >f.tmp && > + mv f.tmp f && > + git add f && > + test_tick && > + git commit -m foo && > + > + git checkout -b bar main && > + echo bar >>f && > + git add f && > + test_tick && > + git commit -m bar && > + > + find $packdir -type f -name "pack-*.idx" >packs.before && > + # Merging either side should result in a new object which is > + # larger than 1M, thus the result should be split into two > + # separate packs. > + tree="$(git merge-tree --write-pack \ > + refs/heads/foo refs/heads/bar)" && > + blob="$(git rev-parse $tree:f)" && > + find $packdir -type f -name "pack-*.idx" >packs.after && > + > + test_must_be_empty packs.before && > + test_line_count = 2 packs.after && > + for idx in $(cat packs.after) > + do > + git show-index <$idx || return 1 > + done >objects && > + > + # The resulting set of packs should contain one copy of both > + # objects, each in a separate pack. > + test_line_count = 2 objects && > + grep "^[1-9][0-9]* $tree" objects && > + grep "^[1-9][0-9]* $blob" objects > + > + ) > +' > + > test_done > -- > 2.42.0.425.g963d08ddb3.dirty
Attachment:
signature.asc
Description: PGP signature