Re: [PATCH v5 2/6] object-file.c: handle undetermined oid in write_loose_object()

Ævar Arnfjörð Bjarmason <avarab@xxxxxxxxx> · Mon, 13 Dec 2021 08:32:58 +0100

On Fri, Dec 10 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@xxxxxxxxxxxxxxx>
>
> When streaming a large blob object to "write_loose_object()", we have no
> chance to run "write_object_file_prepare()" to calculate the oid in
> advance. So we need to handle undetermined oid in function
> "write_loose_object()".
>
> In the original implementation, we know the oid and we can write the
> temporary file in the same directory as the final object, but for an
> object with an undetermined oid, we don't know the exact directory for
> the object, so we have to save the temporary file in ".git/objects/"
> directory instead.
>
> The promise that "oid" is constant in "write_loose_object()" has been
> removed because it will be filled after reading all stream data.
>
> Helped-by: Ævar Arnfjörð Bjarmason <avarab@xxxxxxxxx>
> Helped-by: Jiang Xin <zhiyou.jx@xxxxxxxxxxxxxxx>
> Signed-off-by: Han Xin <hanxin.hx@xxxxxxxxxxxxxxx>
> ---
>  object-file.c | 48 +++++++++++++++++++++++++++++++++++++++---------
>  1 file changed, 39 insertions(+), 9 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index 06375a90d6..41099b137f 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1860,11 +1860,11 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>  	return fd;
>  }
>  
> -static int write_loose_object(const struct object_id *oid, char *hdr,
> +static int write_loose_object(struct object_id *oid, char *hdr,
>  			      int hdrlen, const void *buf, unsigned long len,
>  			      time_t mtime, unsigned flags)
>  {
> -	int fd, ret;
> +	int fd, ret, err = 0;
>  	unsigned char compressed[4096];
>  	git_zstream stream;
>  	git_hash_ctx c;
> @@ -1872,16 +1872,21 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  	static struct strbuf tmp_file = STRBUF_INIT;
>  	static struct strbuf filename = STRBUF_INIT;
>  
> -	loose_object_path(the_repository, &filename, oid);
> +	if (flags & HASH_STREAM)
> +		/* When oid is not determined, save tmp file to odb path. */
> +		strbuf_addf(&filename, "%s/", get_object_directory());
> +	else
> +		loose_object_path(the_repository, &filename, oid);
>  
>  	fd = create_tmpfile(&tmp_file, filename.buf);
>  	if (fd < 0) {
>  		if (flags & HASH_SILENT)
> -			return -1;
> +			err = -1;
>  		else if (errno == EACCES)
> -			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
> +			err = error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
>  		else
> -			return error_errno(_("unable to create temporary file"));
> +			err = error_errno(_("unable to create temporary file"));
> +		goto cleanup;
>  	}
>  
>  	/* Set it up */
> @@ -1923,12 +1928,34 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
>  		    ret);
>  	the_hash_algo->final_oid_fn(&parano_oid, &c);
> -	if (!oideq(oid, &parano_oid))
> +	if (!(flags & HASH_STREAM) && !oideq(oid, &parano_oid))
>  		die(_("confused by unstable object source data for %s"),
>  		    oid_to_hex(oid));

Here we don't have a meaningful "const" OID anymore, but still if we die
we use the "oid". 

>  	close_loose_object(fd);
>  
> +	if (flags & HASH_STREAM) {
> +		int dirlen;
> +
> +		oidcpy((struct object_id *)oid, &parano_oid);

This cast isn't needed anymore now that you stripped the "const" off,
but more on that later...

> +		loose_object_path(the_repository, &filename, oid);
> +
> +		/* We finally know the object path, and create the missing dir. */
> +		dirlen = directory_size(filename.buf);
> +		if (dirlen) {
> +			struct strbuf dir = STRBUF_INIT;
> +			strbuf_add(&dir, filename.buf, dirlen - 1);
> +			if (mkdir(dir.buf, 0777) && errno != EEXIST)
> +				err = -1;
> +			else if (adjust_shared_perm(dir.buf))
> +				err = -1;
> +			else
> +				strbuf_release(&dir);
> +			if (err < 0)
> +				goto cleanup;

Can't we use one of the existing utility functions for this? Testing
locally I could replace this with:

	diff --git a/object-file.c b/object-file.c
	index 7c93db11b2d..05e1fae893d 100644
	--- a/object-file.c
	+++ b/object-file.c
	@@ -1952,14 +1952,11 @@ static int write_loose_object(struct object_id *oid, char *hdr,
	 		if (dirlen) {
	 			struct strbuf dir = STRBUF_INIT;
	 			strbuf_add(&dir, filename.buf, dirlen - 1);
	-			if (mkdir(dir.buf, 0777) && errno != EEXIST)
	+			
	+			if (mkdir_in_gitdir(dir.buf) < 0) {
	 				err = -1;
	-			else if (adjust_shared_perm(dir.buf))
	-				err = -1;
	-			else
	-				strbuf_release(&dir);
	-			if (err < 0)
	 				goto cleanup;
	+			}
	 		}
	 	}

And your tests still pass. Maybe they have a blind spot, or maybe we can
just use the existing function.

> +		}
> +	}
> +
>  	if (mtime) {
>  		struct utimbuf utb;
>  		utb.actime = mtime;
> @@ -1938,7 +1965,10 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  			warning_errno(_("failed utime() on %s"), tmp_file.buf);
>  	}
>  
> -	return finalize_object_file(tmp_file.buf, filename.buf);
> +	err = finalize_object_file(tmp_file.buf, filename.buf);
> +cleanup:
> +	strbuf_release(&filename);
> +	return err;
>  }

Reading this series is an odd mixture of of things that would really be
much easier to understand if they were combined, e.g. 1/6 adding APIs
that aren't used by anything, but then adding one codepath (also
unused), that we then use later. Could just add it at the same time as
the use and the patch would be easier to read....

...and then this, which *is* something that could be split up into an
earlier cleanup step, i.e. the strbuf leak here exists before this
series, fixing it is good, but splitting that up into its own patch
would make this diff smaller & the actual behavior changes easier to
reason about.

>  static int freshen_loose_object(const struct object_id *oid)
> @@ -2015,7 +2045,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
>  	if (!buf)
>  		return error(_("cannot read object for %s"), oid_to_hex(oid));
>  	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
> -	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
> +	ret = write_loose_object((struct object_id*) oid, hdr, hdrlen, buf, len, mtime, 0);
>  	free(buf);
>  
>  	return ret;

 ...on the "more on that later", here we're casting the "oid" from const
 for a function that's never going to be involved in the streaming
 codepath.

I know I suggested the HASH_STREAM flag, but what I was really going for
was "let's share more of the code?", looking at this v5 (which is
already much better than v4) I think a better approach is to split up
write_loose_object().

I.e. it already calls close_loose_object() and finalize_object_file() to
do some of its work, but around that we have:

 1. Figuring out a path for the (temp) object file
 2. Creating the tempfile
 3. Setting up zlib
 4. Once zlib is set up inspect its state, die with a message
    about oid_to_hex(oid) if we failed
 5. Optionally, do HASH_STREAM stuff
    Maybe force a loose object if "mtime".

I think if that's split up so that each of those is its own little
function what's now write_loose_object() can call those in sequence, and
a new stream_loose_object() can just do #1 differentl, followed by the
same #2 and #4, but do #4 differently etc.

You'll still be able to re-use the write_object_file_prepare()
etc. logic.

As an example your 5/6 copy/pastes the xsnprintf() formatting of the
object header. It's just one line, but it's also code that's very
central to git, so I think instead of just copy/pasting it a prep step
of factoring it out would make sense, and that would be a prep cleanup
that would help later readability. E.g.:

	diff --git a/object-file.c b/object-file.c
	index eac67f6f5f9..a7dcbd929e9 100644
	--- a/object-file.c
	+++ b/object-file.c
	@@ -1009,6 +1009,13 @@ void *xmmap(void *start, size_t length,
	 	return ret;
	 }

	+static int generate_object_header(char *buf, int bufsz, const char *type_name,
	+				  unsigned long size)
	+{
	+	return xsnprintf(buf, bufsz, "%s %"PRIuMAX , type_name,
	+			 (uintmax_t)size) + 1;
	+}
	+
	 /*
	  * With an in-core object data in "map", rehash it to make sure the
	  * object name actually matches "oid" to detect object corruption.
	@@ -1037,7 +1044,7 @@ int check_object_signature(struct repository *r, const struct object_id *oid,
	 		return -1;

	 	/* Generate the header */
	-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(obj_type), (uintmax_t)size) + 1;
	+	hdrlen = generate_object_header(hdr, sizeof(hdr), type_name(obj_type), size);

	 	/* Sha1.. */
	 	r->hash_algo->init_fn(&c);
	@@ -1737,7 +1744,7 @@ static void write_object_file_prepare(const struct git_hash_algo *algo,
	 	git_hash_ctx c;

	 	/* Generate the header */
	-	*hdrlen = xsnprintf(hdr, *hdrlen, "%s %"PRIuMAX , type, (uintmax_t)len)+1;
	+	*hdrlen = generate_object_header(hdr, *hdrlen, type, len);

	 	/* Sha1.. */
	 	algo->init_fn(&c);
	@@ -2009,7 +2016,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
	 	buf = read_object(the_repository, oid, &type, &len);
	 	if (!buf)
	 		return error(_("cannot read object for %s"), oid_to_hex(oid));
	-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
	+	hdrlen = generate_object_header(hdr, sizeof(hdr), type_name(type), len);
	 	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
	 	free(buf);

Then in your change on top you just call that generate_object_header(),
or better yet your amended write_object_file_flags() can just call a
similarly amended write_object_file_prepare() directly.