On Fri, Nov 12, 2021 at 5:42 PM Han Xin <chiyutianyi@xxxxxxxxx> wrote:
>
> From: Han Xin <hanxin.hx@xxxxxxxxxxxxxxx>
>
> When calling "unpack_non_delta_entry()", we allocate memory for the
> whole size of the unpacked object and write the buffer to a loose
> file on disk. This may lead to OOM for the git-unpack-objects
> process when unpacking a very large object.
>
> The function "unpack_delta_entry()" will also allocate memory to
> buffer the whole delta, but since there will be no delta for an
> object larger than "core.bigFileThreshold", that issue is moderate.
>
> To resolve the OOM issue in "git-unpack-objects", we can unpack
> large objects to loose files in a streaming fashion, using
> "core.bigFileThreshold" as the limit above which we avoid buffering
> the whole object with "get_data()".
>
> Signed-off-by: Han Xin <hanxin.hx@xxxxxxxxxxxxxxx>
> ---
>  builtin/unpack-objects.c          | 76 ++++++++++++++++++++++++-
>  t/t5590-receive-unpack-objects.sh | 92 +++++++++++++++++++++++++++++++
>  2 files changed, 167 insertions(+), 1 deletion(-)
>  create mode 100755 t/t5590-receive-unpack-objects.sh
>
> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> index 4a9466295b..6c757d823b 100644
> --- a/builtin/unpack-objects.c
> +++ b/builtin/unpack-objects.c
> @@ -320,11 +320,85 @@ static void added_object(unsigned nr, enum object_type type,
>  	}
>  }
>
> +struct input_data_from_zstream {
> +	git_zstream *zstream;
> +	unsigned char buf[4096];
> +	int status;
> +};
> +
> +static const char *read_inflate_in_stream(void *data, unsigned long *readlen)
> +{
> +	struct input_data_from_zstream *input = data;
> +	git_zstream *zstream = input->zstream;
> +	void *in = fill(1);
> +
> +	if (!len || input->status == Z_STREAM_END) {
> +		*readlen = 0;
> +		return NULL;
> +	}
> +
> +	zstream->next_out = input->buf;
> +	zstream->avail_out = sizeof(input->buf);
> +	zstream->next_in = in;
> +	zstream->avail_in = len;
> +
> +	input->status = git_inflate(zstream, 0);
> +	use(len - zstream->avail_in);
> +	*readlen = sizeof(input->buf) - zstream->avail_out;
> +
> +	return (const char *)input->buf;
> +}
> +
> +static void write_stream_blob(unsigned nr, unsigned long size)
> +{
> +	char hdr[32];
> +	int hdrlen;
> +	git_zstream zstream;
> +	struct input_data_from_zstream data;
> +	struct input_stream in_stream = {
> +		.read = read_inflate_in_stream,
> +		.data = &data,
> +	};
> +	struct object_id *oid = &obj_list[nr].oid;
> +	int ret;
> +
> +	memset(&zstream, 0, sizeof(zstream));
> +	memset(&data, 0, sizeof(data));
> +	data.zstream = &zstream;
> +	git_inflate_init(&zstream);
> +
> +	/* Generate the header */
> +	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), (uintmax_t)size) + 1;
> +
> +	if ((ret = write_loose_object(oid, hdr, hdrlen, &in_stream, dry_run, 0, 0)))
> +		die(_("failed to write object in stream %d"), ret);
> +
> +	if (zstream.total_out != size || data.status != Z_STREAM_END)
> +		die(_("inflate returned %d"), data.status);
> +	git_inflate_end(&zstream);
> +
> +	if (strict && !dry_run) {
> +		struct blob *blob = lookup_blob(the_repository, oid);
> +		if (blob)
> +			blob->object.flags |= FLAG_WRITTEN;
> +		else
> +			die("invalid blob object from stream");
> +	}
> +	obj_list[nr].obj = NULL;
> +}
> +
>  static void unpack_non_delta_entry(enum object_type type, unsigned long size,
>  				   unsigned nr)
>  {
> -	void *buf = get_data(size);
> +	void *buf;
> +
> +	/* Write large blob in stream without allocating full buffer. */
> +	if (type == OBJ_BLOB && size > big_file_threshold) {

The default size of big_file_threshold is 512m.
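For reference, I believe the default and the config knob come from
lines roughly like these (quoting from memory, so the exact code in
the current tree may differ):

	/* environment.c */
	unsigned long big_file_threshold = 512 * 1024 * 1024;

	/* config.c, in git_default_core_config() */
	if (!strcmp(var, "core.bigfilethreshold")) {
		big_file_threshold = git_config_ulong(var, value);
		return 0;
	}

Also, to make sure I am reading the new "input_stream" contract
correctly: write_loose_object() itself is added in another patch of
this series, so the consumer loop below is my guess at the intended
protocol, and the in-memory reader is invented purely for
illustration. As I understand it, read() hands back up to one
buffer of bytes per call and signals end-of-stream by returning
NULL with *readlen set to 0:

	#include <stdio.h>
	#include <string.h>

	/*
	 * Toy model of the input_stream contract in this patch: the
	 * consumer keeps calling read() until it returns NULL with
	 * *readlen set to 0.  The in-memory reader is invented for
	 * illustration; the real producer is read_inflate_in_stream().
	 */
	struct input_stream {
		const char *(*read)(void *data, unsigned long *readlen);
		void *data;
	};

	struct mem_data {
		const char *buf;
		unsigned long len;
		unsigned long pos;
	};

	static const char *read_from_mem(void *data, unsigned long *readlen)
	{
		struct mem_data *mem = data;
		unsigned long chunk = mem->len - mem->pos;

		if (!chunk) {
			/* End of stream, like Z_STREAM_END in the patch. */
			*readlen = 0;
			return NULL;
		}
		if (chunk > 4096)
			chunk = 4096;	/* mirrors the 4096-byte buf */
		mem->pos += chunk;
		*readlen = chunk;
		return mem->buf + mem->pos - chunk;
	}

	int main(void)
	{
		const char *msg = "stream me to a loose object";
		struct mem_data mem = { msg, strlen(msg), 0 };
		struct input_stream in = {
			.read = read_from_mem,
			.data = &mem,
		};
		const char *p;
		unsigned long n;

		/* The loop I expect the consumer to run internally: */
		while ((p = in.read(in.data, &n)))
			fwrite(p, 1, n, stdout);
		putchar('\n');
		return 0;
	}

If that matches your intent, it may be worth documenting, next to
where "struct input_stream" is declared, that a read() implementation
must return NULL and set *readlen to 0 at end of stream.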
Can we use "write_stream_blob" for all objects? Can we get a more
suitable threshold through some benchmark data?

> +		write_stream_blob(nr, size);
> +		return;
> +	}

--
Jiang Xin