On Fri, Nov 12, 2021 at 5:42 PM Han Xin <chiyutianyi@xxxxxxxxx> wrote:
>
> From: Han Xin <hanxin.hx@xxxxxxxxxxxxxxx>
>
> When calling "unpack_non_delta_entry()", we allocate memory for the
> whole size of the unpacked object and write the buffer to a loose
> file on disk. This may lead to OOM for the git-unpack-objects
> process when unpacking a very large object.
>
> The function "unpack_delta_entry()" will also allocate memory to
> buffer the whole delta, but since there will be no delta for an
> object larger than "core.bigFileThreshold", that issue is moderate.
>
> To resolve the OOM issue in "git-unpack-objects", we can unpack
> large objects to loose files in a streaming fashion, using
> "core.bigFileThreshold" as the limit above which we avoid buffering
> the whole object with "get_data()".
>
> Signed-off-by: Han Xin <hanxin.hx@xxxxxxxxxxxxxxx>
> ---
>  builtin/unpack-objects.c          | 76 ++++++++++++++++++++++++-
>  t/t5590-receive-unpack-objects.sh | 92 +++++++++++++++++++++++++++++++
>  2 files changed, 167 insertions(+), 1 deletion(-)
>  create mode 100755 t/t5590-receive-unpack-objects.sh
>
> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> index 4a9466295b..6c757d823b 100644
> --- a/builtin/unpack-objects.c
> +++ b/builtin/unpack-objects.c
> @@ -320,11 +320,85 @@ static void added_object(unsigned nr, enum object_type type,
>  	}
>  }
>
> +struct input_data_from_zstream {
> +	git_zstream *zstream;
> +	unsigned char buf[4096];
> +	int status;
> +};
> +
> +static const char *read_inflate_in_stream(void *data, unsigned long *readlen)
> +{
> +	struct input_data_from_zstream *input = data;
> +	git_zstream *zstream = input->zstream;
> +	void *in = fill(1);
> +
> +	if (!len || input->status == Z_STREAM_END) {
> +		*readlen = 0;
> +		return NULL;
> +	}
> +
> +	zstream->next_out = input->buf;
> +	zstream->avail_out = sizeof(input->buf);
> +	zstream->next_in = in;
> +	zstream->avail_in = len;
> +
> +	input->status = git_inflate(zstream, 0);
> +	use(len - zstream->avail_in);
> +	*readlen = sizeof(input->buf) - zstream->avail_out;
> +
> +	return (const char *)input->buf;
> +}
> +
> +static void write_stream_blob(unsigned nr, unsigned long size)
> +{
> +	char hdr[32];
> +	int hdrlen;
> +	git_zstream zstream;
> +	struct input_data_from_zstream data;
> +	struct input_stream in_stream = {
> +		.read = read_inflate_in_stream,
> +		.data = &data,
> +	};
> +	struct object_id *oid = &obj_list[nr].oid;
> +	int ret;
> +
> +	memset(&zstream, 0, sizeof(zstream));
> +	memset(&data, 0, sizeof(data));
> +	data.zstream = &zstream;
> +	git_inflate_init(&zstream);
> +
> +	/* Generate the header */
> +	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), (uintmax_t)size) + 1;
> +
> +	if ((ret = write_loose_object(oid, hdr, hdrlen, &in_stream, dry_run, 0, 0)))
> +		die(_("failed to write object in stream %d"), ret);
> +
> +	if (zstream.total_out != size || data.status != Z_STREAM_END)
> +		die(_("inflate returned %d"), data.status);
> +	git_inflate_end(&zstream);
> +
> +	if (strict && !dry_run) {
> +		struct blob *blob = lookup_blob(the_repository, oid);
> +		if (blob)
> +			blob->object.flags |= FLAG_WRITTEN;
> +		else
> +			die("invalid blob object from stream");
> +	}
> +	obj_list[nr].obj = NULL;
> +}
> +
>  static void unpack_non_delta_entry(enum object_type type, unsigned long size,
>  				   unsigned nr)
>  {
> -	void *buf = get_data(size);
> +	void *buf;
> +
> +	/* Write large blob in stream without allocating full buffer. */
> +	if (type == OBJ_BLOB && size > big_file_threshold) {

The default size of big_file_threshold is 512m.
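For reference, I believe the default and the config knob come from
lines roughly like these (quoting from memory, so the exact code in
the current tree may differ):

	/* environment.c */
	unsigned long big_file_threshold = 512 * 1024 * 1024;

	/* config.c, in git_default_core_config() */
	if (!strcmp(var, "core.bigfilethreshold")) {
		big_file_threshold = git_config_ulong(var, value);
		return 0;
	}

Also, to make sure I am reading the new "input_stream" contract
correctly: write_loose_object() itself is added in another patch of
this series, so the consumer loop below is my guess at the intended
protocol, and the in-memory reader is invented purely for
illustration. As I understand it, read() hands back up to one
buffer of bytes per call and signals end-of-stream by returning
NULL with *readlen set to 0:

	#include <stdio.h>
	#include <string.h>

	/*
	 * Toy model of the input_stream contract in this patch: the
	 * consumer keeps calling read() until it returns NULL with
	 * *readlen set to 0.  The in-memory reader is invented for
	 * illustration; the real producer is read_inflate_in_stream().
	 */
	struct input_stream {
		const char *(*read)(void *data, unsigned long *readlen);
		void *data;
	};

	struct mem_data {
		const char *buf;
		unsigned long len;
		unsigned long pos;
	};

	static const char *read_from_mem(void *data, unsigned long *readlen)
	{
		struct mem_data *mem = data;
		unsigned long chunk = mem->len - mem->pos;

		if (!chunk) {
			/* End of stream, like Z_STREAM_END in the patch. */
			*readlen = 0;
			return NULL;
		}
		if (chunk > 4096)
			chunk = 4096;	/* mirrors the 4096-byte buf */
		mem->pos += chunk;
		*readlen = chunk;
		return mem->buf + mem->pos - chunk;
	}

	int main(void)
	{
		const char *msg = "stream me to a loose object";
		struct mem_data mem = { msg, strlen(msg), 0 };
		struct input_stream in = {
			.read = read_from_mem,
			.data = &mem,
		};
		const char *p;
		unsigned long n;

		/* The loop I expect the consumer to run internally: */
		while ((p = in.read(in.data, &n)))
			fwrite(p, 1, n, stdout);
		putchar('\n');
		return 0;
	}

If that matches your intent, it may be worth documenting, next to
where "struct input_stream" is declared, that a read() implementation
must return NULL and set *readlen to 0 at end of stream.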
Can we use "write_stream_blob" for all objects? Can we get a more
suitable threshold through some benchmark data?

> +		write_stream_blob(nr, size);
> +		return;
> +	}

--
Jiang Xin