Re: [PATCH v3 1/5] object-file: refactor write_loose_object() to read buffer from stream

Han Xin <chiyutianyi@xxxxxxxxx> · Wed, 24 Nov 2021 17:00:00 +0800

Junio C Hamano <gitster@xxxxxxxxx> writes:

>
> Han Xin <chiyutianyi@xxxxxxxxx> writes:
>
> > From: Han Xin <hanxin.hx@xxxxxxxxxxxxxxx>
> >
> > We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> > entire contents of a blob object, no matter how big it is. This
> > implementation may consume all the memory and cause OOM.
> >
> > This can be improved by feeding data to "write_loose_object()" in a
> > stream. The input stream is implemented as an interface. In the first
> > step, we make a simple implementation, feeding the entire buffer in the
> > "stream" to "write_loose_object()" as a refactor.
>
> Possibly a stupid question (not a review).
>
> How does this compare with "struct git_istream" implemented for a
> few existing codepaths?  It seems that the existing users are
> pack-objects, index-pack and archive and all of them use the
> interface to obtain data given an object name without having to grab
> everything in core at once.
>
> If we are adding a new streaming interface to go in the opposite
> direction, i.e. from the working tree data to object store, I would
> understand it as a complementary interface (but then I suspect there
> is a half of it already in bulk-checkin API), but I am not sure how
> this new thing fits in the larger picture.
>

Thank you for your reply.

Before starting to make this patch, I did consider whether I should
reuse "struct  git_istream" to solve the problem, but I found that in the
process of git unpack-objects, the data comes from stdin, and we
cannot get an oid in advance until the whole object data is read.
Also, we can't do "lseek()“ on stdin to change the data reading position.

I compared the implementation of "bulk-checkin", and they do have
some similarities.
I think the difference in the reverse implementation is that we do not
always clearly know where the boundary of the target data is. For
example, in the process of "unpack-objects", the "buffer" has been
partially read after calling "fill()". And the "buffer" remaining after
reading cannot be discarded because it is the beginning of the next
object.
Perhaps "struct input_stream" can make some improvements to
"index_bulk_checkin()", so that it can read from an inner buffer in
addition to reading from "fd" if necessary.

>
>
> > Helped-by: Jiang Xin <zhiyou.jx@xxxxxxxxxxxxxxx>
> > Signed-off-by: Han Xin <hanxin.hx@xxxxxxxxxxxxxxx>
> > ---
> >  object-file.c  | 50 ++++++++++++++++++++++++++++++++++++++++++++++----
> >  object-store.h |  5 +++++
> >  2 files changed, 51 insertions(+), 4 deletions(-)
> >
> > diff --git a/object-file.c b/object-file.c
> > index c3d866a287..227f53a0de 100644
> > --- a/object-file.c
> > +++ b/object-file.c
> > @@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
> >       return fd;
> >  }
> >
> > +struct simple_input_stream_data {
> > +     const void *buf;
> > +     unsigned long len;
> > +};
> > +
> > +static const void *feed_simple_input_stream(struct input_stream *in_stream, unsigned long *len)
> > +{
> > +     struct simple_input_stream_data *data = in_stream->data;
> > +
> > +     if (data->len == 0) {
> > +             *len = 0;
> > +             return NULL;
> > +     }
> > +     *len = data->len;
> > +     data->len = 0;
> > +     return data->buf;
> > +}
> > +
> >  static int write_loose_object(const struct object_id *oid, char *hdr,
> > -                           int hdrlen, const void *buf, unsigned long len,
> > +                           int hdrlen, struct input_stream *in_stream,
> >                             time_t mtime, unsigned flags)
> >  {
> >       int fd, ret;
> > @@ -1871,6 +1889,8 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
> >       struct object_id parano_oid;
> >       static struct strbuf tmp_file = STRBUF_INIT;
> >       static struct strbuf filename = STRBUF_INIT;
> > +     const void *buf;
> > +     unsigned long len;
> >
> >       loose_object_path(the_repository, &filename, oid);
> >
> > @@ -1898,6 +1918,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
> >       the_hash_algo->update_fn(&c, hdr, hdrlen);
> >
> >       /* Then the data itself.. */
> > +     buf = in_stream->read(in_stream, &len);
> >       stream.next_in = (void *)buf;
> >       stream.avail_in = len;
> >       do {
> > @@ -1960,6 +1981,13 @@ int write_object_file_flags(const void *buf, unsigned long len,
> >  {
> >       char hdr[MAX_HEADER_LEN];
> >       int hdrlen = sizeof(hdr);
> > +     struct input_stream in_stream = {
> > +             .read = feed_simple_input_stream,
> > +             .data = (void *)&(struct simple_input_stream_data) {
> > +                     .buf = buf,
> > +                     .len = len,
> > +             },
> > +     };
> >
> >       /* Normally if we have it in the pack then we do not bother writing
> >        * it out into .git/objects/??/?{38} file.
> > @@ -1968,7 +1996,7 @@ int write_object_file_flags(const void *buf, unsigned long len,
> >                                 &hdrlen);
> >       if (freshen_packed_object(oid) || freshen_loose_object(oid))
> >               return 0;
> > -     return write_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
> > +     return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, flags);
> >  }
> >
> >  int hash_object_file_literally(const void *buf, unsigned long len,
> > @@ -1977,6 +2005,13 @@ int hash_object_file_literally(const void *buf, unsigned long len,
> >  {
> >       char *header;
> >       int hdrlen, status = 0;
> > +     struct input_stream in_stream = {
> > +             .read = feed_simple_input_stream,
> > +             .data = (void *)&(struct simple_input_stream_data) {
> > +                     .buf = buf,
> > +                     .len = len,
> > +             },
> > +     };
> >
> >       /* type string, SP, %lu of the length plus NUL must fit this */
> >       hdrlen = strlen(type) + MAX_HEADER_LEN;
> > @@ -1988,7 +2023,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
> >               goto cleanup;
> >       if (freshen_packed_object(oid) || freshen_loose_object(oid))
> >               goto cleanup;
> > -     status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0);
> > +     status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
> >
> >  cleanup:
> >       free(header);
> > @@ -2003,14 +2038,21 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
> >       char hdr[MAX_HEADER_LEN];
> >       int hdrlen;
> >       int ret;
> > +     struct simple_input_stream_data data;
> > +     struct input_stream in_stream = {
> > +             .read = feed_simple_input_stream,
> > +             .data = &data,
> > +     };
> >
> >       if (has_loose_object(oid))
> >               return 0;
> >       buf = read_object(the_repository, oid, &type, &len);
> >       if (!buf)
> >               return error(_("cannot read object for %s"), oid_to_hex(oid));
> > +     data.buf = buf;
> > +     data.len = len;
> >       hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
> > -     ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
> > +     ret = write_loose_object(oid, hdr, hdrlen, &in_stream, mtime, 0);
> >       free(buf);
> >
> >       return ret;
> > diff --git a/object-store.h b/object-store.h
> > index 952efb6a4b..ccc1fc9c1a 100644
> > --- a/object-store.h
> > +++ b/object-store.h
> > @@ -34,6 +34,11 @@ struct object_directory {
> >       char *path;
> >  };
> >
> > +struct input_stream {
> > +     const void *(*read)(struct input_stream *, unsigned long *len);
> > +     void *data;
> > +};
> > +
> >  KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
> >       struct object_directory *, 1, fspathhash, fspatheq)