[PATCH v8 5/6] unpack-objects: unpack_non_delta_entry() read data in a stream

Han Xin <chiyutianyi@xxxxxxxxx> · Sat, 8 Jan 2022 16:54:18 +0800

From: Han Xin <hanxin.hx@xxxxxxxxxxxxxxx>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

By implementing a zstream version of input_stream interface, we can use
a small fixed buffer for "unpack_non_delta_entry()". However, unpack
non-delta objects from a stream instead of from an entrie buffer will
have 10% performance penalty.

    $ hyperfine \
      --setup \
      'if ! test -d scalar.git; then git clone --bare
       https://github.com/microsoft/scalar.git;
       cp scalar.git/objects/pack/*.pack small.pack; fi' \
      --prepare 'rm -rf dest.git && git init --bare dest.git' \
      ...

    Summary
      './git -C dest.git -c core.bigFileThreshold=512m
      unpack-objects <small.pack' in 'origin/master'
        1.01 ± 0.04 times faster than './git -C dest.git
                -c core.bigFileThreshold=512m unpack-objects
                <small.pack' in 'HEAD~1'
        1.01 ± 0.04 times faster than './git -C dest.git
                -c core.bigFileThreshold=512m unpack-objects
                <small.pack' in 'HEAD~0'
        1.03 ± 0.10 times faster than './git -C dest.git
                -c core.bigFileThreshold=16k unpack-objects
                <small.pack' in 'origin/master'
        1.02 ± 0.07 times faster than './git -C dest.git
                -c core.bigFileThreshold=16k unpack-objects
                <small.pack' in 'HEAD~0'
        1.10 ± 0.04 times faster than './git -C dest.git
                -c core.bigFileThreshold=16k unpack-objects
                <small.pack' in 'HEAD~1'

Therefore, only unpack objects larger than the "core.bigFileThreshold"
in zstream. Until now, the config variable has been used in the
following cases, and our new case belongs to the packfile category.

 * Archive:

   + archive.c: write_entry(): write large blob entries to archive in
     stream.

 * Loose objects:

   + object-file.c: index_fd(): when hashing large files in worktree,
     read files in a stream, and create one packfile per large blob if
     want to save files to git object store.

   + object-file.c: read_loose_object(): when checking loose objects
     using "git-fsck", do not read full content of large loose objects.

 * Packfile:

   + fast-import.c: parse_and_store_blob(): streaming large blob from
     foreign source to packfile.

   + index-pack.c: check_collison(): read and check large blob in stream.

   + index-pack.c: unpack_entry_data(): do not return the entire
     contents of the big blob from packfile, but uses a fixed buf to
     perform some integrity checks on the object.

   + pack-check.c: verify_packfile(): used by "git-fsck" and will call
     check_object_signature() to check large blob in pack with the
     streaming interface.

   + pack-objects.c: get_object_details(): set "no_try_delta" for large
     blobs when counting objects.

   + pack-objects.c: write_no_reuse_object(): streaming large blob to
     pack.

   + unpack-objects.c: unpack_non_delta_entry(): unpack large blob in
     stream from packfile.

 * Others:

   + diff.c: diff_populate_filespec(): treat large blob file as binary.

   + streaming.c: istream_source(): as a helper of "open_istream()" to
     select proper streaming interface to read large blob from packfile.

Helped-by: Ævar Arnfjörð Bjarmason <avarab@xxxxxxxxx>
Helped-by: Derrick Stolee <stolee@xxxxxxxxx>
Helped-by: Jiang Xin <zhiyou.jx@xxxxxxxxxxxxxxx>
Signed-off-by: Han Xin <hanxin.hx@xxxxxxxxxxxxxxx>
---
 builtin/unpack-objects.c        | 71 ++++++++++++++++++++++++++++++++-
 t/t5329-unpack-large-objects.sh | 23 +++++++++--
 2 files changed, 90 insertions(+), 4 deletions(-)

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index c6d6c17072..e9ec2b349d 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -343,11 +343,80 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[8192];
+	int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream,
+				      unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (in_stream->is_finished) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+
+	in_stream->is_finished = data->status != Z_OK;
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void write_stream_blob(unsigned nr, size_t size)
+{
+	git_zstream zstream = { 0 };
+	struct input_zstream_data data = { 0 };
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+	};
+
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	if (stream_loose_object(&in_stream, size, &obj_list[nr].oid))
+		die(_("failed to write object in stream"));
+
+	if (data.status != Z_STREAM_END)
+		die(_("inflate returned (%d)"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict) {
+		struct blob *blob =
+			lookup_blob(the_repository, &obj_list[nr].oid);
+		if (blob)
+			blob->object.flags |= FLAG_WRITTEN;
+		else
+			die(_("invalid blob object from stream"));
+	}
+	obj_list[nr].obj = NULL;
+}
+
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size);
+	void *buf;
+
+	/* Write large blob in stream without allocating full buffer. */
+	if (!dry_run && type == OBJ_BLOB && size > big_file_threshold) {
+		write_stream_blob(nr, size);
+		return;
+	}
 
+	buf = get_data(size);
 	if (buf)
 		write_object(nr, type, buf, size);
 }
diff --git a/t/t5329-unpack-large-objects.sh b/t/t5329-unpack-large-objects.sh
index 39c7a62d94..6f3bfb3df7 100755
--- a/t/t5329-unpack-large-objects.sh
+++ b/t/t5329-unpack-large-objects.sh
@@ -9,7 +9,11 @@ test_description='git unpack-objects with large objects'
 
 prepare_dest () {
 	test_when_finished "rm -rf dest.git" &&
-	git init --bare dest.git
+	git init --bare dest.git &&
+	if test -n "$1"
+	then
+		git -C dest.git config core.bigFileThreshold $1
+	fi
 }
 
 assert_no_loose () {
@@ -37,16 +41,29 @@ test_expect_success 'set memory limitation to 1MB' '
 '
 
 test_expect_success 'unpack-objects failed under memory limitation' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
 	grep "fatal: attempting to allocate" err
 '
 
 test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	git -C dest.git unpack-objects -n <test-$PACK.pack &&
 	assert_no_loose &&
 	assert_no_pack
 '
 
+test_expect_success 'unpack big object in stream' '
+	prepare_dest 1m &&
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	assert_no_pack
+'
+
+test_expect_success 'do not unpack existing large objects' '
+	prepare_dest 1m &&
+	git -C dest.git index-pack --stdin <test-$PACK.pack &&
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	assert_no_loose
+'
+
 test_done
-- 
2.34.1.52.gc288e771b4.agit.6.5.6