From: Han Xin <hanxin.hx@xxxxxxxxxxxxxxx>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all available memory and cause OOM.

By implementing a zstream version of the input_stream interface, we can
use a small fixed buffer for "unpack_non_delta_entry()". However,
unpacking non-delta objects from a stream instead of from an entire
buffer carries a performance penalty of about 10%:

  $ hyperfine \
    --setup \
    'if ! test -d scalar.git; then git clone --bare https://github.com/microsoft/scalar.git; cp scalar.git/objects/pack/*.pack small.pack; fi' \
    --prepare 'rm -rf dest.git && git init --bare dest.git' \
    ...

  Summary
    './git -C dest.git -c core.bigFileThreshold=512m unpack-objects <small.pack' in 'origin/master'
      1.01 ± 0.04 times faster than './git -C dest.git -c core.bigFileThreshold=512m unpack-objects <small.pack' in 'HEAD~1'
      1.01 ± 0.04 times faster than './git -C dest.git -c core.bigFileThreshold=512m unpack-objects <small.pack' in 'HEAD~0'
      1.03 ± 0.10 times faster than './git -C dest.git -c core.bigFileThreshold=16k unpack-objects <small.pack' in 'origin/master'
      1.02 ± 0.07 times faster than './git -C dest.git -c core.bigFileThreshold=16k unpack-objects <small.pack' in 'HEAD~0'
      1.10 ± 0.04 times faster than './git -C dest.git -c core.bigFileThreshold=16k unpack-objects <small.pack' in 'HEAD~1'

Therefore, only unpack objects larger than "core.bigFileThreshold" in
this streaming fashion. Until now, the config variable has been used in
the following cases; our new case belongs to the packfile category.

 * Archive:

   + archive.c: write_entry(): write large blob entries to the archive
     in a stream.

 * Loose objects:

   + object-file.c: index_fd(): when hashing large files in the
     worktree, read them in a stream, and create one packfile per large
     blob when the file is to be saved to the git object store.

   + object-file.c: read_loose_object(): when checking loose objects
     with "git-fsck", do not read the full content of large loose
     objects.

 * Packfile:

   + fast-import.c: parse_and_store_blob(): stream large blobs from a
     foreign source into a packfile.

   + index-pack.c: check_collision(): read and check large blobs in a
     stream.

   + index-pack.c: unpack_entry_data(): do not return the entire
     contents of a big blob from the packfile; instead, use a fixed
     buffer to perform some integrity checks on the object.

   + pack-check.c: verify_packfile(): used by "git-fsck"; calls
     check_object_signature() to check large blobs in the pack via the
     streaming interface.

   + pack-objects.c: get_object_details(): set "no_try_delta" for large
     blobs when counting objects.

   + pack-objects.c: write_no_reuse_object(): stream large blobs into
     the pack.

   + unpack-objects.c: unpack_non_delta_entry(): unpack large blobs
     from the packfile in a stream.

 * Others:

   + diff.c: diff_populate_filespec(): treat large blob files as
     binary.

   + streaming.c: istream_source(): helper of "open_istream()" that
     selects the proper streaming interface to read large blobs from a
     packfile.
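The streaming path added below builds on the "input_stream" callback
interface and "stream_loose_object()" introduced earlier in this
series. As a rough sketch (not part of this patch; field and parameter
names are inferred from the call sites in the diff below), the shape
this patch codes against looks like:

  struct input_stream {
          /*
           * Return the next chunk of object content, with its length
           * in *len; NULL with *len == 0 means end of stream.
           */
          const void *(*read)(struct input_stream *, unsigned long *len);
          void *data;        /* reader-private state */
          int is_finished;   /* set by the reader at end of input */
  };

  /*
   * Read "len" bytes of object content from "in_stream" and write them
   * out as a single loose object, storing its object id in "oid"
   * (git's hash type from hash.h); returns 0 on success.
   */
  int stream_loose_object(struct input_stream *in_stream, size_t len,
                          struct object_id *oid);

"feed_input_zstream()" below is the zstream implementation of that
"read" callback: each call inflates at most 8192 bytes into a fixed
buffer, so the full blob never has to be held in memory at once.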
Helped-by: Ævar Arnfjörð Bjarmason <avarab@xxxxxxxxx>
Helped-by: Derrick Stolee <stolee@xxxxxxxxx>
Helped-by: Jiang Xin <zhiyou.jx@xxxxxxxxxxxxxxx>
Signed-off-by: Han Xin <hanxin.hx@xxxxxxxxxxxxxxx>
---
 builtin/unpack-objects.c        | 71 ++++++++++++++++++++++++++++++++-
 t/t5328-unpack-large-objects.sh | 23 +++++++++--
 2 files changed, 90 insertions(+), 4 deletions(-)

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index c6d6c17072..e9ec2b349d 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -343,11 +343,80 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[8192];
+	int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream,
+				      unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (in_stream->is_finished) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+
+	in_stream->is_finished = data->status != Z_OK;
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void write_stream_blob(unsigned nr, size_t size)
+{
+	git_zstream zstream = { 0 };
+	struct input_zstream_data data = { 0 };
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+	};
+
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	if (stream_loose_object(&in_stream, size, &obj_list[nr].oid))
+		die(_("failed to write object in stream"));
+
+	if (data.status != Z_STREAM_END)
+		die(_("inflate returned (%d)"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict) {
+		struct blob *blob =
+			lookup_blob(the_repository, &obj_list[nr].oid);
+		if (blob)
+			blob->object.flags |= FLAG_WRITTEN;
+		else
+			die(_("invalid blob object from stream"));
+	}
+	obj_list[nr].obj = NULL;
+}
+
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size);
+	void *buf;
+
+	/* Write large blob in stream without allocating full buffer. */
+	if (!dry_run && type == OBJ_BLOB && size > big_file_threshold) {
+		write_stream_blob(nr, size);
+		return;
+	}
 
+	buf = get_data(size);
 	if (buf)
 		write_object(nr, type, buf, size);
 }
diff --git a/t/t5328-unpack-large-objects.sh b/t/t5328-unpack-large-objects.sh
index 45a3316e06..f4129979f9 100755
--- a/t/t5328-unpack-large-objects.sh
+++ b/t/t5328-unpack-large-objects.sh
@@ -9,7 +9,11 @@ test_description='git unpack-objects with large objects'
 
 prepare_dest () {
 	test_when_finished "rm -rf dest.git" &&
-	git init --bare dest.git
+	git init --bare dest.git &&
+	if test -n "$1"
+	then
+		git -C dest.git config core.bigFileThreshold $1
+	fi
 }
 
 test_no_loose () {
@@ -33,16 +37,29 @@ test_expect_success 'set memory limitation to 1MB' '
 '
 
 test_expect_success 'unpack-objects failed under memory limitation' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
 	grep "fatal: attempting to allocate" err
 '
 
 test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	git -C dest.git unpack-objects -n <test-$PACK.pack &&
 	test_no_loose &&
 	test_dir_is_empty dest.git/objects/pack
 '
 
+test_expect_success 'unpack big object in stream' '
+	prepare_dest 1m &&
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	test_dir_is_empty dest.git/objects/pack
+'
+
+test_expect_success 'do not unpack existing large objects' '
+	prepare_dest 1m &&
+	git -C dest.git index-pack --stdin <test-$PACK.pack &&
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	test_no_loose
+'
+
 test_done
-- 
2.34.1.52.gc288e771b4.agit.6.5.6