On Fri, Dec 10 2021, Han Xin wrote: > From: Han Xin <hanxin.hx@xxxxxxxxxxxxxxx> > > When streaming a large blob object to "write_loose_object()", we have no > chance to run "write_object_file_prepare()" to calculate the oid in > advance. So we need to handle undetermined oid in function > "write_loose_object()". > > In the original implementation, we know the oid and we can write the > temporary file in the same directory as the final object, but for an > object with an undetermined oid, we don't know the exact directory for > the object, so we have to save the temporary file in ".git/objects/" > directory instead. > > The promise that "oid" is constant in "write_loose_object()" has been > removed because it will be filled after reading all stream data. > > Helped-by: Ævar Arnfjörð Bjarmason <avarab@xxxxxxxxx> > Helped-by: Jiang Xin <zhiyou.jx@xxxxxxxxxxxxxxx> > Signed-off-by: Han Xin <hanxin.hx@xxxxxxxxxxxxxxx> > --- > object-file.c | 48 +++++++++++++++++++++++++++++++++++++++--------- > 1 file changed, 39 insertions(+), 9 deletions(-) > > diff --git a/object-file.c b/object-file.c > index 06375a90d6..41099b137f 100644 > --- a/object-file.c > +++ b/object-file.c > @@ -1860,11 +1860,11 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename) > return fd; > } > > -static int write_loose_object(const struct object_id *oid, char *hdr, > +static int write_loose_object(struct object_id *oid, char *hdr, > int hdrlen, const void *buf, unsigned long len, > time_t mtime, unsigned flags) > { > - int fd, ret; > + int fd, ret, err = 0; > unsigned char compressed[4096]; > git_zstream stream; > git_hash_ctx c; > @@ -1872,16 +1872,21 @@ static int write_loose_object(const struct object_id *oid, char *hdr, > static struct strbuf tmp_file = STRBUF_INIT; > static struct strbuf filename = STRBUF_INIT; > > - loose_object_path(the_repository, &filename, oid); > + if (flags & HASH_STREAM) > + /* When oid is not determined, save tmp file to odb path. */ > + strbuf_addf(&filename, "%s/", get_object_directory()); > + else > + loose_object_path(the_repository, &filename, oid); > > fd = create_tmpfile(&tmp_file, filename.buf); > if (fd < 0) { > if (flags & HASH_SILENT) > - return -1; > + err = -1; > else if (errno == EACCES) > - return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory()); > + err = error(_("insufficient permission for adding an object to repository database %s"), get_object_directory()); > else > - return error_errno(_("unable to create temporary file")); > + err = error_errno(_("unable to create temporary file")); > + goto cleanup; > } > > /* Set it up */ > @@ -1923,12 +1928,34 @@ static int write_loose_object(const struct object_id *oid, char *hdr, > die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid), > ret); > the_hash_algo->final_oid_fn(¶no_oid, &c); > - if (!oideq(oid, ¶no_oid)) > + if (!(flags & HASH_STREAM) && !oideq(oid, ¶no_oid)) > die(_("confused by unstable object source data for %s"), > oid_to_hex(oid)); Here we don't have a meaningful "const" OID anymore, but still if we die we use the "oid". > close_loose_object(fd); > > + if (flags & HASH_STREAM) { > + int dirlen; > + > + oidcpy((struct object_id *)oid, ¶no_oid); This cast isn't needed anymore now that you stripped the "const" off, but more on that later... > + loose_object_path(the_repository, &filename, oid); > + > + /* We finally know the object path, and create the missing dir. */ > + dirlen = directory_size(filename.buf); > + if (dirlen) { > + struct strbuf dir = STRBUF_INIT; > + strbuf_add(&dir, filename.buf, dirlen - 1); > + if (mkdir(dir.buf, 0777) && errno != EEXIST) > + err = -1; > + else if (adjust_shared_perm(dir.buf)) > + err = -1; > + else > + strbuf_release(&dir); > + if (err < 0) > + goto cleanup; Can't we use one of the existing utility functions for this? Testing locally I could replace this with: diff --git a/object-file.c b/object-file.c index 7c93db11b2d..05e1fae893d 100644 --- a/object-file.c +++ b/object-file.c @@ -1952,14 +1952,11 @@ static int write_loose_object(struct object_id *oid, char *hdr, if (dirlen) { struct strbuf dir = STRBUF_INIT; strbuf_add(&dir, filename.buf, dirlen - 1); - if (mkdir(dir.buf, 0777) && errno != EEXIST) + + if (mkdir_in_gitdir(dir.buf) < 0) { err = -1; - else if (adjust_shared_perm(dir.buf)) - err = -1; - else - strbuf_release(&dir); - if (err < 0) goto cleanup; + } } } And your tests still pass. Maybe they have a blind spot, or maybe we can just use the existing function. > + } > + } > + > if (mtime) { > struct utimbuf utb; > utb.actime = mtime; > @@ -1938,7 +1965,10 @@ static int write_loose_object(const struct object_id *oid, char *hdr, > warning_errno(_("failed utime() on %s"), tmp_file.buf); > } > > - return finalize_object_file(tmp_file.buf, filename.buf); > + err = finalize_object_file(tmp_file.buf, filename.buf); > +cleanup: > + strbuf_release(&filename); > + return err; > } Reading this series is an odd mixture of of things that would really be much easier to understand if they were combined, e.g. 1/6 adding APIs that aren't used by anything, but then adding one codepath (also unused), that we then use later. Could just add it at the same time as the use and the patch would be easier to read.... ...and then this, which *is* something that could be split up into an earlier cleanup step, i.e. the strbuf leak here exists before this series, fixing it is good, but splitting that up into its own patch would make this diff smaller & the actual behavior changes easier to reason about. > static int freshen_loose_object(const struct object_id *oid) > @@ -2015,7 +2045,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime) > if (!buf) > return error(_("cannot read object for %s"), oid_to_hex(oid)); > hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1; > - ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0); > + ret = write_loose_object((struct object_id*) oid, hdr, hdrlen, buf, len, mtime, 0); > free(buf); > > return ret; ...on the "more on that later", here we're casting the "oid" from const for a function that's never going to be involved in the streaming codepath. I know I suggested the HASH_STREAM flag, but what I was really going for was "let's share more of the code?", looking at this v5 (which is already much better than v4) I think a better approach is to split up write_loose_object(). I.e. it already calls close_loose_object() and finalize_object_file() to do some of its work, but around that we have: 1. Figuring out a path for the (temp) object file 2. Creating the tempfile 3. Setting up zlib 4. Once zlib is set up inspect its state, die with a message about oid_to_hex(oid) if we failed 5. Optionally, do HASH_STREAM stuff Maybe force a loose object if "mtime". I think if that's split up so that each of those is its own little function what's now write_loose_object() can call those in sequence, and a new stream_loose_object() can just do #1 differentl, followed by the same #2 and #4, but do #4 differently etc. You'll still be able to re-use the write_object_file_prepare() etc. logic. As an example your 5/6 copy/pastes the xsnprintf() formatting of the object header. It's just one line, but it's also code that's very central to git, so I think instead of just copy/pasting it a prep step of factoring it out would make sense, and that would be a prep cleanup that would help later readability. E.g.: diff --git a/object-file.c b/object-file.c index eac67f6f5f9..a7dcbd929e9 100644 --- a/object-file.c +++ b/object-file.c @@ -1009,6 +1009,13 @@ void *xmmap(void *start, size_t length, return ret; } +static int generate_object_header(char *buf, int bufsz, const char *type_name, + unsigned long size) +{ + return xsnprintf(buf, bufsz, "%s %"PRIuMAX , type_name, + (uintmax_t)size) + 1; +} + /* * With an in-core object data in "map", rehash it to make sure the * object name actually matches "oid" to detect object corruption. @@ -1037,7 +1044,7 @@ int check_object_signature(struct repository *r, const struct object_id *oid, return -1; /* Generate the header */ - hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(obj_type), (uintmax_t)size) + 1; + hdrlen = generate_object_header(hdr, sizeof(hdr), type_name(obj_type), size); /* Sha1.. */ r->hash_algo->init_fn(&c); @@ -1737,7 +1744,7 @@ static void write_object_file_prepare(const struct git_hash_algo *algo, git_hash_ctx c; /* Generate the header */ - *hdrlen = xsnprintf(hdr, *hdrlen, "%s %"PRIuMAX , type, (uintmax_t)len)+1; + *hdrlen = generate_object_header(hdr, *hdrlen, type, len); /* Sha1.. */ algo->init_fn(&c); @@ -2009,7 +2016,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime) buf = read_object(the_repository, oid, &type, &len); if (!buf) return error(_("cannot read object for %s"), oid_to_hex(oid)); - hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1; + hdrlen = generate_object_header(hdr, sizeof(hdr), type_name(type), len); ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0); free(buf); Then in your change on top you just call that generate_object_header(), or better yet your amended write_object_file_flags() can just call a similarly amended write_object_file_prepare() directly.