(Previously based on 'eb/limit-bulk-checkin-to-blobs', which has since been merged. This series is now based on the tip of 'master', which is a9ecda2788 (The eighteenth batch, 2023-10-13) at the time of writing). This series implements support for a new merge-tree option, `--write-pack`, which causes any newly-written objects to be packed together instead of being stored individually as loose. Much is unchanged since last time, except for a small tweak to one of the commit messages in response to feedback from Eric W. Biederman. The series has also been rebased onto 'master', which had a couple of conflicts that I resolved pertaining to: - 9eb5419799 (bulk-checkin: only support blobs in index_bulk_checkin, 2023-09-26) - e0b8c84240 (treewide: fix various bugs w/ OpenSSL 3+ EVP API, 2023-09-01) They were mostly trivial resolutions, and the results can be viewed in the range-diff included below. (From last time: the motivating use-case behind these changes is to better support repositories who invoke merge-tree frequently, generating a potentially large number of loose objects, resulting in a possible adverse effect on performance.) Thanks in advance for any review! Taylor Blau (7): bulk-checkin: factor out `format_object_header_hash()` bulk-checkin: factor out `prepare_checkpoint()` bulk-checkin: factor out `truncate_checkpoint()` bulk-checkin: factor our `finalize_checkpoint()` bulk-checkin: introduce `index_blob_bulk_checkin_incore()` bulk-checkin: introduce `index_tree_bulk_checkin_incore()` builtin/merge-tree.c: implement support for `--write-pack` Documentation/git-merge-tree.txt | 4 + builtin/merge-tree.c | 5 + bulk-checkin.c | 258 ++++++++++++++++++++++++++----- bulk-checkin.h | 8 + merge-ort.c | 42 +++-- merge-recursive.h | 1 + t/t4301-merge-tree-write-tree.sh | 93 +++++++++++ 7 files changed, 363 insertions(+), 48 deletions(-) Range-diff against v1: 1: 37f4072815 ! 1: edf1cbafc1 bulk-checkin: factor out `format_object_header_hash()` @@ bulk-checkin.c: static void prepare_to_stream(struct bulk_checkin_packfile *stat } +static void format_object_header_hash(const struct git_hash_algo *algop, -+ git_hash_ctx *ctx, enum object_type type, ++ git_hash_ctx *ctx, ++ struct hashfile_checkpoint *checkpoint, ++ enum object_type type, + size_t size) +{ + unsigned char header[16384]; @@ bulk-checkin.c: static void prepare_to_stream(struct bulk_checkin_packfile *stat + + algop->init_fn(ctx); + algop->update_fn(ctx, header, header_len); ++ algop->init_fn(&checkpoint->ctx); +} + static int deflate_blob_to_pack(struct bulk_checkin_packfile *state, @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *st - OBJ_BLOB, size); - the_hash_algo->init_fn(&ctx); - the_hash_algo->update_fn(&ctx, obuf, header_len); -+ format_object_header_hash(the_hash_algo, &ctx, OBJ_BLOB, size); +- the_hash_algo->init_fn(&checkpoint.ctx); ++ format_object_header_hash(the_hash_algo, &ctx, &checkpoint, OBJ_BLOB, ++ size); /* Note: idx is non-NULL when we are writing */ if ((flags & HASH_WRITE_OBJECT) != 0) 2: 9cc1f3014a ! 2: b3f89d5853 bulk-checkin: factor out `prepare_checkpoint()` @@ Commit message ## bulk-checkin.c ## @@ bulk-checkin.c: static void format_object_header_hash(const struct git_hash_algo *algop, - algop->update_fn(ctx, header, header_len); + algop->init_fn(&checkpoint->ctx); } +static void prepare_checkpoint(struct bulk_checkin_packfile *state, 3: f392ed2211 = 3: abe4fb0a59 bulk-checkin: factor out `truncate_checkpoint()` 4: 9c6ca564ad = 4: 0b855a6eb7 bulk-checkin: factor our `finalize_checkpoint()` 5: 30ca7334c7 ! 5: 239bf39bfb bulk-checkin: introduce `index_blob_bulk_checkin_incore()` @@ bulk-checkin.c: static void finalize_checkpoint(struct bulk_checkin_packfile *st +static int deflate_obj_contents_to_pack_incore(struct bulk_checkin_packfile *state, + git_hash_ctx *ctx, ++ struct hashfile_checkpoint *checkpoint, + struct object_id *result_oid, + const void *buf, size_t size, + enum object_type type, + const char *path, unsigned flags) +{ -+ struct hashfile_checkpoint checkpoint = {0}; + struct pack_idx_entry *idx = NULL; + off_t already_hashed_to = 0; + @@ bulk-checkin.c: static void finalize_checkpoint(struct bulk_checkin_packfile *st + CALLOC_ARRAY(idx, 1); + + while (1) { -+ prepare_checkpoint(state, &checkpoint, idx, flags); ++ prepare_checkpoint(state, checkpoint, idx, flags); + if (!stream_obj_to_pack_incore(state, ctx, &already_hashed_to, + buf, size, type, path, flags)) + break; -+ truncate_checkpoint(state, &checkpoint, idx); ++ truncate_checkpoint(state, checkpoint, idx); + } + -+ finalize_checkpoint(state, ctx, &checkpoint, idx, result_oid); ++ finalize_checkpoint(state, ctx, checkpoint, idx, result_oid); + + return 0; +} @@ bulk-checkin.c: static void finalize_checkpoint(struct bulk_checkin_packfile *st + const char *path, unsigned flags) +{ + git_hash_ctx ctx; ++ struct hashfile_checkpoint checkpoint = {0}; + -+ format_object_header_hash(the_hash_algo, &ctx, OBJ_BLOB, size); ++ format_object_header_hash(the_hash_algo, &ctx, &checkpoint, OBJ_BLOB, ++ size); + -+ return deflate_obj_contents_to_pack_incore(state, &ctx, result_oid, -+ buf, size, OBJ_BLOB, path, -+ flags); ++ return deflate_obj_contents_to_pack_incore(state, &ctx, &checkpoint, ++ result_oid, buf, size, ++ OBJ_BLOB, path, flags); +} + static int deflate_blob_to_pack(struct bulk_checkin_packfile *state, 6: cb0f79cabb ! 6: 57613807d8 bulk-checkin: introduce `index_tree_bulk_checkin_incore()` @@ Commit message Within `deflate_tree_to_pack_incore()`, the changes should be limited to something like: + struct strbuf converted = STRBUF_INIT; if (the_repository->compat_hash_algo) { - struct strbuf converted = STRBUF_INIT; if (convert_object_file(&compat_obj, the_repository->hash_algo, the_repository->compat_hash_algo, ...) < 0) @@ Commit message format_object_header_hash(the_repository->compat_hash_algo, OBJ_TREE, size); - - strbuf_release(&converted); } + /* compute the converted tree's hash using the compat algorithm */ + strbuf_release(&converted); , assuming related changes throughout the rest of the bulk-checkin machinery necessary to update the hash of the converted object, which @@ Commit message ## bulk-checkin.c ## @@ bulk-checkin.c: static int deflate_blob_to_pack_incore(struct bulk_checkin_packfile *state, - flags); + OBJ_BLOB, path, flags); } +static int deflate_tree_to_pack_incore(struct bulk_checkin_packfile *state, @@ bulk-checkin.c: static int deflate_blob_to_pack_incore(struct bulk_checkin_packf + const char *path, unsigned flags) +{ + git_hash_ctx ctx; ++ struct hashfile_checkpoint checkpoint = {0}; + -+ format_object_header_hash(the_hash_algo, &ctx, OBJ_TREE, size); ++ format_object_header_hash(the_hash_algo, &ctx, &checkpoint, OBJ_TREE, ++ size); + -+ return deflate_obj_contents_to_pack_incore(state, &ctx, result_oid, -+ buf, size, OBJ_TREE, path, -+ flags); ++ return deflate_obj_contents_to_pack_incore(state, &ctx, &checkpoint, ++ result_oid, buf, size, ++ OBJ_TREE, path, flags); +} + static int deflate_blob_to_pack(struct bulk_checkin_packfile *state, 7: e969210145 ! 7: f21400f56c builtin/merge-tree.c: implement support for `--write-pack` @@ merge-ort.c * We have many arrays of size 3. Whenever we have such an array, the @@ merge-ort.c: static int handle_content_merge(struct merge_options *opt, if ((merge_status < 0) || !result_buf.ptr) - ret = err(opt, _("Failed to execute internal merge")); + ret = error(_("failed to execute internal merge")); - if (!ret && - write_object_file(result_buf.ptr, result_buf.size, - OBJ_BLOB, &result->oid)) -- ret = err(opt, _("Unable to add %s to database"), -- path); +- ret = error(_("unable to add %s to database"), path); + if (!ret) { + ret = opt->write_pack + ? index_blob_bulk_checkin_incore(&result->oid, @@ merge-ort.c: static int handle_content_merge(struct merge_options *opt, + result_buf.size, + OBJ_BLOB, &result->oid); + if (ret) -+ ret = err(opt, _("Unable to add %s to database"), -+ path); ++ ret = error(_("unable to add %s to database"), ++ path); + } free(result_buf.ptr); -- 2.42.0.405.gdb2a2f287e