From: Han Xin <hanxin.hx@xxxxxxxxxxxxxxx>

Make use of the stream_loose_object() function introduced in the
preceding commit to unpack large objects. Before this we'd need to
malloc() the size of the blob before unpacking it, which could cause
OOM with very large blobs.

We could use this new interface to unpack all blobs, but doing so
would result in a performance penalty of around 10%, as the
"hyperfine" benchmark below shows. We therefore limit this to files
larger than "core.bigFileThreshold":

	$ hyperfine \
	  --setup \
	  'if ! test -d scalar.git; then git clone --bare
	   https://github.com/microsoft/scalar.git;
	   cp scalar.git/objects/pack/*.pack small.pack; fi' \
	  --prepare 'rm -rf dest.git && git init --bare dest.git' \
	  ...

	Summary
	  './git -C dest.git -c core.bigFileThreshold=512m
	  unpack-objects <small.pack' in 'origin/master' ran
	    1.01 ± 0.04 times faster than './git -C dest.git -c
	    core.bigFileThreshold=512m unpack-objects <small.pack'
	    in 'HEAD~1'
	    1.01 ± 0.04 times faster than './git -C dest.git -c
	    core.bigFileThreshold=512m unpack-objects <small.pack'
	    in 'HEAD~0'
	    1.03 ± 0.10 times faster than './git -C dest.git -c
	    core.bigFileThreshold=16k unpack-objects <small.pack'
	    in 'origin/master'
	    1.02 ± 0.07 times faster than './git -C dest.git -c
	    core.bigFileThreshold=16k unpack-objects <small.pack'
	    in 'HEAD~0'
	    1.10 ± 0.04 times faster than './git -C dest.git -c
	    core.bigFileThreshold=16k unpack-objects <small.pack'
	    in 'HEAD~1'

An earlier version of this patch introduced a new
"core.bigFileStreamingThreshold" instead of re-using the existing
"core.bigFileThreshold" variable[1]. As noted in a detailed overview
of its users in [2], the existing variable is already used with
several different meanings.

Still, we consider it good enough to simply re-use it. While it's
possible that someone might want to, e.g., consider objects "small"
for the purposes of diffing but "big" for the purposes of writing
them, such use-cases are probably too obscure to worry about. We can
always split up "core.bigFileThreshold" in the future if there's a
need for that.

1. https://lore.kernel.org/git/20211210103435.83656-1-chiyutianyi@xxxxxxxxx/
2. https://lore.kernel.org/git/20220120112114.47618-5-chiyutianyi@xxxxxxxxx/

Helped-by: Ævar Arnfjörð Bjarmason <avarab@xxxxxxxxx>
Helped-by: Derrick Stolee <stolee@xxxxxxxxx>
Helped-by: Jiang Xin <zhiyou.jx@xxxxxxxxxxxxxxx>
Signed-off-by: Han Xin <hanxin.hx@xxxxxxxxxxxxxxx>
---
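Reviewer note (not meant for the commit message): a minimal sketch of
how to exercise the new streaming path by hand, mirroring what the
t5328 changes below do. The pack name "big.pack" and the sizes are
illustrative only; any pack containing a blob larger than the
configured threshold will do:

	git init --bare dest.git
	# stream blobs larger than 1m instead of buffering them whole
	git -C dest.git config core.bigFileThreshold 1m

	# GIT_ALLOC_LIMIT (also used by t5328) caps any single
	# allocation; without this patch unpack-objects would die
	# trying to allocate the whole blob, with it the blob is
	# inflated in 8k chunks and streamed to a loose object
	GIT_ALLOC_LIMIT=3m git -C dest.git unpack-objects <big.pack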
 Documentation/config/core.txt   |  4 +-
 builtin/unpack-objects.c        | 71 ++++++++++++++++++++++++++++++++-
 t/t5328-unpack-large-objects.sh | 23 +++++++++--
 3 files changed, 92 insertions(+), 6 deletions(-)

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index b6a12218665..5aca987632c 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -436,8 +436,8 @@ usage, at the slight expense of increased disk usage.
 * Will be generally be streamed when written, which avoids excessive
 memory usage, at the cost of some fixed overhead. Commands that make
 use of this include linkgit:git-archive[1],
-linkgit:git-fast-import[1], linkgit:git-index-pack[1] and
-linkgit:git-fsck[1].
+linkgit:git-fast-import[1], linkgit:git-index-pack[1],
+linkgit:git-unpack-objects[1] and linkgit:git-fsck[1].
 
 core.excludesFile::
 	Specifies the pathname to the file that contains patterns to
diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 896ea8aceb4..7ce3cb61086 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -343,11 +343,80 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[8192];
+	int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream,
+				      unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (in_stream->is_finished) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+
+	in_stream->is_finished = data->status != Z_OK;
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void write_stream_blob(unsigned nr, size_t size)
+{
+	git_zstream zstream = { 0 };
+	struct input_zstream_data data = { 0 };
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+	};
+
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	if (stream_loose_object(&in_stream, size, &obj_list[nr].oid))
+		die(_("failed to write object in stream"));
+
+	if (data.status != Z_STREAM_END)
+		die(_("inflate returned (%d)"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict) {
+		struct blob *blob =
+			lookup_blob(the_repository, &obj_list[nr].oid);
+		if (blob)
+			blob->object.flags |= FLAG_WRITTEN;
+		else
+			die(_("invalid blob object from stream"));
+	}
+	obj_list[nr].obj = NULL;
+}
+
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size);
+	void *buf;
+
+	/* Write large blob in stream without allocating full buffer. */
+	if (!dry_run && type == OBJ_BLOB && size > big_file_threshold) {
+		write_stream_blob(nr, size);
+		return;
+	}
+	buf = get_data(size);
 
 	if (buf)
 		write_object(nr, type, buf, size);
 }
diff --git a/t/t5328-unpack-large-objects.sh b/t/t5328-unpack-large-objects.sh
index 1432dfc8386..5c1042b4d91 100755
--- a/t/t5328-unpack-large-objects.sh
+++ b/t/t5328-unpack-large-objects.sh
@@ -9,7 +9,11 @@ test_description='git unpack-objects with large objects'
 
 prepare_dest () {
 	test_when_finished "rm -rf dest.git" &&
-	git init --bare dest.git
+	git init --bare dest.git &&
+	if test -n "$1"
+	then
+		git -C dest.git config core.bigFileThreshold $1
+	fi
 }
 
 test_no_loose () {
@@ -30,16 +34,29 @@ test_expect_success 'set memory limitation to 1MB' '
 '
 
 test_expect_success 'unpack-objects failed under memory limitation' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
 	grep "fatal: attempting to allocate" err
 '
 
 test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	git -C dest.git unpack-objects -n <test-$PACK.pack &&
 	test_no_loose &&
 	test_dir_is_empty dest.git/objects/pack
 '
 
+test_expect_success 'unpack big object in stream' '
+	prepare_dest 1m &&
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	test_dir_is_empty dest.git/objects/pack
+'
+
+test_expect_success 'do not unpack existing large objects' '
+	prepare_dest 1m &&
+	git -C dest.git index-pack --stdin <test-$PACK.pack &&
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	test_no_loose
+'
+
 test_done
-- 
2.35.1.940.ge7a5b4b05f2