From: Han Xin <hanxin.hx@xxxxxxxxxxxxxxx> Changes since v7: * Use functions "assert_no_loose()" and "assert_no_pack()" to do tests instead of "find" sugguseted by Ævar Arnfjörð Bjarmason[1]. * "get_data()" now use the global "dry_run" and it will release the buf before returning. * Add a new commit "object-file.c: remove the slash for directory_size()" sugguseted by Ævar Arnfjörð Bjarmason[2]. * Add "int is_finished" to "struct input_stream" who will tell us if there is next buffer in the stream. * Remove the config "core.bigFileStreamingThreshold" introduced in v5, and keep using "core.bigFileThreshold". Until now, the config variable has been used in the cases listed in "unpack-objects: unpack_non_delta_entry() read data in a stream", this new case belongs to the packfile category. * Remove unnecessary explicit cast in "object-file API: add a format_object_header() function" sugguseted by René Scharfe[3]. 1. https://lore.kernel.org/git/211221.86bl1arqls.gmgdl@xxxxxxxxxxxxxxxxxxx/ 2. https://lore.kernel.org/git/211221.8635mmrpps.gmgdl@xxxxxxxxxxxxxxxxxxx/ 3. https://lore.kernel.org/git/b2dee243-1a38-531e-02b1-ffd66c465fa5@xxxxxx/ Han Xin (5): unpack-objects: low memory footprint for get_data() in dry_run mode object-file.c: refactor write_loose_object() to several steps object-file.c: remove the slash for directory_size() object-file.c: add "stream_loose_object()" to handle large object unpack-objects: unpack_non_delta_entry() read data in a stream Ævar Arnfjörð Bjarmason (1): object-file API: add a format_object_header() function builtin/index-pack.c | 3 +- builtin/unpack-objects.c | 110 +++++++++++-- bulk-checkin.c | 4 +- cache.h | 21 +++ http-push.c | 2 +- object-file.c | 272 ++++++++++++++++++++++++++------ object-store.h | 9 ++ t/t5329-unpack-large-objects.sh | 69 ++++++++ 8 files changed, 422 insertions(+), 68 deletions(-) create mode 100755 t/t5329-unpack-large-objects.sh Range-diff against v7: 1: a8f232f553 < -: ---------- unpack-objects.c: add dry_run mode for get_data() -: ---------- > 1: bd34da5816 unpack-objects: low memory footprint for get_data() in dry_run mode 3: a571b8f16c ! 2: f9a4365a7d object-file.c: refactor write_loose_object() to reuse in stream version @@ Metadata Author: Han Xin <hanxin.hx@xxxxxxxxxxxxxxx> ## Commit message ## - object-file.c: refactor write_loose_object() to reuse in stream version + object-file.c: refactor write_loose_object() to several steps - We used to call "get_data()" in "unpack_non_delta_entry()" to read the - entire contents of a blob object, no matter how big it is. This - implementation may consume all the memory and cause OOM. + When writing a large blob using "write_loose_object()", we have to pass + a buffer with the whole content of the blob, and this behavior will + consume lots of memory and may cause OOM. We will introduce a stream + version function ("stream_loose_object()") in latter commit to resolve + this issue. - This can be improved by feeding data to "stream_loose_object()" in - stream instead of read into the whole buf. + Before introducing a stream vesion function for writing loose object, + do some refactoring on "write_loose_object()" to reuse code for both + versions. - As this new method "stream_loose_object()" has many similarities with - "write_loose_object()", we split up "write_loose_object()" into some - steps: - 1. Figuring out a path for the (temp) object file. - 2. Creating the tempfile. - 3. Setting up zlib and write header. - 4. Write object data and handle errors. - 5. Optionally, do someting after write, maybe force a loose object if - "mtime". + Rewrite "write_loose_object()" as follows: + + 1. Figure out a path for the (temp) object file. This step is only + used in "write_loose_object()". + + 2. Move common steps for starting to write loose objects into a new + function "start_loose_object_common()". + + 3. Compress data. + + 4. Move common steps for ending zlib stream into a new funciton + "end_loose_object_common()". + + 5. Close fd and finalize the object file. Helped-by: Ævar Arnfjörð Bjarmason <avarab@xxxxxxxxx> + Helped-by: Jiang Xin <zhiyou.jx@xxxxxxxxxxxxxxx> Signed-off-by: Han Xin <hanxin.hx@xxxxxxxxxxxxxxx> ## object-file.c ## @@ object-file.c: static int create_tmpfile(struct strbuf *tmp, const char *filenam return fd; } -+static void setup_stream_and_header(git_zstream *stream, -+ unsigned char *compressed, -+ unsigned long compressed_size, -+ git_hash_ctx *c, -+ char *hdr, -+ int hdrlen) ++static int start_loose_object_common(struct strbuf *tmp_file, ++ const char *filename, unsigned flags, ++ git_zstream *stream, ++ unsigned char *buf, size_t buflen, ++ git_hash_ctx *c, ++ enum object_type type, size_t len, ++ char *hdr, int hdrlen) +{ -+ /* Set it up */ ++ int fd; ++ ++ fd = create_tmpfile(tmp_file, filename, flags); ++ if (fd < 0) ++ return -1; ++ ++ /* Setup zlib stream for compression */ + git_deflate_init(stream, zlib_compression_level); -+ stream->next_out = compressed; -+ stream->avail_out = compressed_size; ++ stream->next_out = buf; ++ stream->avail_out = buflen; + the_hash_algo->init_fn(c); + -+ /* First header.. */ ++ /* Start to feed header to zlib stream */ + stream->next_in = (unsigned char *)hdr; + stream->avail_in = hdrlen; + while (git_deflate(stream, 0) == Z_OK) + ; /* nothing */ + the_hash_algo->update_fn(c, hdr, hdrlen); ++ ++ return fd; ++} ++ ++static void end_loose_object_common(int ret, git_hash_ctx *c, ++ git_zstream *stream, ++ struct object_id *parano_oid, ++ const struct object_id *expected_oid, ++ const char *die_msg1_fmt, ++ const char *die_msg2_fmt) ++{ ++ if (ret != Z_STREAM_END) ++ die(_(die_msg1_fmt), ret, expected_oid); ++ ret = git_deflate_end_gently(stream); ++ if (ret != Z_OK) ++ die(_(die_msg2_fmt), ret, expected_oid); ++ the_hash_algo->final_oid_fn(parano_oid, c); +} + static int write_loose_object(const struct object_id *oid, char *hdr, @@ object-file.c: static int write_loose_object(const struct object_id *oid, char * - stream.next_out = compressed; - stream.avail_out = sizeof(compressed); - the_hash_algo->init_fn(&c); -+ fd = create_tmpfile(&tmp_file, filename.buf, flags); -+ if (fd < 0) -+ return -1; - +- - /* First header.. */ - stream.next_in = (unsigned char *)hdr; - stream.avail_in = hdrlen; - while (git_deflate(&stream, 0) == Z_OK) - ; /* nothing */ - the_hash_algo->update_fn(&c, hdr, hdrlen); -+ /* Set it up and write header */ -+ setup_stream_and_header(&stream, compressed, sizeof(compressed), -+ &c, hdr, hdrlen); ++ /* Common steps for write_loose_object and stream_loose_object to ++ * start writing loose oject: ++ * ++ * - Create tmpfile for the loose object. ++ * - Setup zlib stream for compression. ++ * - Start to feed header to zlib stream. ++ */ ++ fd = start_loose_object_common(&tmp_file, filename.buf, flags, ++ &stream, compressed, sizeof(compressed), ++ &c, OBJ_NONE, 0, hdr, hdrlen); ++ if (fd < 0) ++ return -1; /* Then the data itself.. */ stream.next_in = (void *)buf; @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *hdr, + stream.avail_out = sizeof(compressed); + } while (ret == Z_OK); + +- if (ret != Z_STREAM_END) +- die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid), +- ret); +- ret = git_deflate_end_gently(&stream); +- if (ret != Z_OK) +- die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid), +- ret); +- the_hash_algo->final_oid_fn(¶no_oid, &c); ++ /* Common steps for write_loose_object and stream_loose_object to ++ * end writing loose oject: ++ * ++ * - End the compression of zlib stream. ++ * - Get the calculated oid to "parano_oid". ++ */ ++ end_loose_object_common(ret, &c, &stream, ¶no_oid, oid, ++ N_("unable to deflate new object %s (%d)"), ++ N_("deflateEnd on object %s failed (%d)")); ++ + if (!oideq(oid, ¶no_oid)) + die(_("confused by unstable object source data for %s"), + oid_to_hex(oid)); close_loose_object(fd); -: ---------- > 3: 18dd21122d object-file.c: remove the slash for directory_size() -: ---------- > 4: 964715451b object-file.c: add "stream_loose_object()" to handle large object -: ---------- > 5: 3f620466fe unpack-objects: unpack_non_delta_entry() read data in a stream 2: 0d2e0f3a00 ! 6: 8073a3888d object-file API: add a format_object_header() function @@ builtin/index-pack.c: static void *unpack_entry_data(off_t offset, unsigned long if (!is_delta_type(type)) { - hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, - type_name(type),(uintmax_t)size) + 1; -+ hdrlen = format_object_header(hdr, sizeof(hdr), type, (uintmax_t)size); ++ hdrlen = format_object_header(hdr, sizeof(hdr), type, size); the_hash_algo->init_fn(&c); the_hash_algo->update_fn(&c, hdr, hdrlen); } else @@ bulk-checkin.c: static int deflate_to_pack(struct bulk_checkin_state *state, - header_len = xsnprintf((char *)obuf, sizeof(obuf), "%s %" PRIuMAX, - type_name(type), (uintmax_t)size) + 1; + header_len = format_object_header((char *)obuf, sizeof(obuf), -+ type, (uintmax_t)size); ++ type, size); the_hash_algo->init_fn(&ctx); the_hash_algo->update_fn(&ctx, obuf, header_len); @@ http-push.c: static void start_put(struct transfer_request *request) unpacked = read_object_file(&request->obj->oid, &type, &len); - hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1; -+ hdrlen = format_object_header(hdr, sizeof(hdr), type, (uintmax_t)len); ++ hdrlen = format_object_header(hdr, sizeof(hdr), type, len); /* Set it up */ git_deflate_init(&stream, zlib_compression_level); @@ object-file.c: static void write_object_file_prepare(const struct git_hash_algo /* Sha1.. */ algo->init_fn(&c); +@@ object-file.c: int stream_loose_object(struct input_stream *in_stream, size_t len, + + /* Since oid is not determined, save tmp file to odb path. */ + strbuf_addf(&filename, "%s/", get_object_directory()); +- hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), len) + 1; ++ hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len); + + /* Common steps for write_loose_object and stream_loose_object to + * start writing loose oject: @@ object-file.c: int force_object_loose(const struct object_id *oid, time_t mtime) buf = read_object(the_repository, oid, &type, &len); if (!buf) 4: 1de06a8f5c < -: ---------- object-file.c: add "write_stream_object_file()" to support read in stream 5: e7b4e426ef < -: ---------- unpack-objects: unpack_non_delta_entry() read data in a stream -- 2.34.1.52.gc288e771b4.agit.6.5.6