Re: [PATCH v8 2/6] object-file.c: refactor write_loose_object() to several steps

René Scharfe <l.s.r@xxxxxx> · Sat, 8 Jan 2022 13:28:35 +0100

Am 08.01.22 um 09:54 schrieb Han Xin:
> From: Han Xin <hanxin.hx@xxxxxxxxxxxxxxx>
>
> When writing a large blob using "write_loose_object()", we have to pass
> a buffer with the whole content of the blob, and this behavior will
> consume lots of memory and may cause OOM. We will introduce a stream
> version function ("stream_loose_object()") in latter commit to resolve
> this issue.
>
> Before introducing a stream vesion function for writing loose object,
> do some refactoring on "write_loose_object()" to reuse code for both
> versions.
>
> Rewrite "write_loose_object()" as follows:
>
>  1. Figure out a path for the (temp) object file. This step is only
>     used in "write_loose_object()".
>
>  2. Move common steps for starting to write loose objects into a new
>     function "start_loose_object_common()".
>
>  3. Compress data.
>
>  4. Move common steps for ending zlib stream into a new funciton
>     "end_loose_object_common()".
>
>  5. Close fd and finalize the object file.
>
> Helped-by: Ævar Arnfjörð Bjarmason <avarab@xxxxxxxxx>
> Helped-by: Jiang Xin <zhiyou.jx@xxxxxxxxxxxxxxx>
> Signed-off-by: Han Xin <hanxin.hx@xxxxxxxxxxxxxxx>
> ---
>  object-file.c | 149 +++++++++++++++++++++++++++++++++++---------------
>  1 file changed, 105 insertions(+), 44 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index eb1426f98c..5d163081b1 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1743,6 +1743,25 @@ static void write_object_file_prepare(const struct git_hash_algo *algo,
>  	algo->final_oid_fn(oid, &c);
>  }
>
> +/*
> + * Move the just written object with proper mtime into its final resting place.
> + */
> +static int finalize_object_file_with_mtime(const char *tmpfile,
> +					   const char *filename,
> +					   time_t mtime,
> +					   unsigned flags)

This function is called only once after your series.  Should it be used by
stream_loose_object()?  Probably not -- the latter doesn't have a way to
force a certain modification time and its caller doesn't need one.  So
creating finalize_object_file_with_mtime() seems unnecessary for this
series.

> +{
> +	struct utimbuf utb;
> +
> +	if (mtime) {
> +		utb.actime = mtime;
> +		utb.modtime = mtime;
> +		if (utime(tmpfile, &utb) < 0 && !(flags & HASH_SILENT))
> +			warning_errno(_("failed utime() on %s"), tmpfile);
> +	}
> +	return finalize_object_file(tmpfile, filename);
> +}
> +
>  /*
>   * Move the just written object into its final resting place.
>   */
> @@ -1828,7 +1847,8 @@ static inline int directory_size(const char *filename)
>   * We want to avoid cross-directory filename renames, because those
>   * can have problems on various filesystems (FAT, NFS, Coda).
>   */
> -static int create_tmpfile(struct strbuf *tmp, const char *filename)
> +static int create_tmpfile(struct strbuf *tmp, const char *filename,
> +			  unsigned flags)

create_tmpfile() is not mentioned in the commit message, yet it's
changed here.  Hrm.

>  {
>  	int fd, dirlen = directory_size(filename);
>
> @@ -1836,7 +1856,9 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>  	strbuf_add(tmp, filename, dirlen);
>  	strbuf_addstr(tmp, "tmp_obj_XXXXXX");
>  	fd = git_mkstemp_mode(tmp->buf, 0444);
> -	if (fd < 0 && dirlen && errno == ENOENT) {
> +	do {
> +		if (fd >= 0 || !dirlen || errno != ENOENT)
> +			break;

Why turn this branch into a loop?  Is this done to mkdir multiple
components, e.g. with filename being "a/b/c/file" to create "a", "a/b",
and "a/b/c"?  It's only used for loose objects, so a fan-out directory
(e.g. ".git/objects/ff") can certainly be missing, but can their parent
be missing as well sometimes?  If that's the point then such a fix
would be worth its own patch.  (Which probably would benefit from using
safe_create_leading_directories()).

>  		/*
>  		 * Make sure the directory exists; note that the contents
>  		 * of the buffer are undefined after mkstemp returns an
> @@ -1846,17 +1868,72 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>  		strbuf_reset(tmp);
>  		strbuf_add(tmp, filename, dirlen - 1);
>  		if (mkdir(tmp->buf, 0777) && errno != EEXIST)
> -			return -1;
> +			break;
>  		if (adjust_shared_perm(tmp->buf))
> -			return -1;
> +			break;

Or is it just to replace these returns with a jump to the new error
reporting section?

>
>  		/* Try again */
>  		strbuf_addstr(tmp, "/tmp_obj_XXXXXX");
>  		fd = git_mkstemp_mode(tmp->buf, 0444);

In that case a break would be missing here.

> +	} while (0);
> +
> +	if (fd < 0 && !(flags & HASH_SILENT)) {
> +		if (errno == EACCES)
> +			return error(_("insufficient permission for adding an "
> +				       "object to repository database %s"),
> +				     get_object_directory());
> +		else
> +			return error_errno(_("unable to create temporary file"));
>  	}

Why move this error reporting code into create_tmpfile()?  This function
has a single caller both before and after your series, so the code could
just as well stay at its call-site, avoiding the need to add the flags
parameter.

> +
>  	return fd;
>  }
>
> +static int start_loose_object_common(struct strbuf *tmp_file,
> +				     const char *filename, unsigned flags,
> +				     git_zstream *stream,
> +				     unsigned char *buf, size_t buflen,
> +				     git_hash_ctx *c,
> +				     enum object_type type, size_t len,

The parameters type and len are not used by this function and thus can
be dropped.

> +				     char *hdr, int hdrlen)
> +{
> +	int fd;
> +
> +	fd = create_tmpfile(tmp_file, filename, flags);
> +	if (fd < 0)
> +		return -1;
> +
> +	/*  Setup zlib stream for compression */
> +	git_deflate_init(stream, zlib_compression_level);
> +	stream->next_out = buf;
> +	stream->avail_out = buflen;
> +	the_hash_algo->init_fn(c);
> +
> +	/*  Start to feed header to zlib stream */
> +	stream->next_in = (unsigned char *)hdr;
> +	stream->avail_in = hdrlen;
> +	while (git_deflate(stream, 0) == Z_OK)
> +		; /* nothing */
> +	the_hash_algo->update_fn(c, hdr, hdrlen);
> +
> +	return fd;
> +}
> +
> +static void end_loose_object_common(int ret, git_hash_ctx *c,
> +				    git_zstream *stream,
> +				    struct object_id *parano_oid,
> +				    const struct object_id *expected_oid,
> +				    const char *die_msg1_fmt,
> +				    const char *die_msg2_fmt)

Hmm, the signature needs as many lines as the function body.

> +{
> +	if (ret != Z_STREAM_END)
> +		die(_(die_msg1_fmt), ret, expected_oid);
> +	ret = git_deflate_end_gently(stream);
> +	if (ret != Z_OK)
> +		die(_(die_msg2_fmt), ret, expected_oid);

These format strings cannot be checked by the compiler.

Considering those two together I think I'd either unify the error
messages and move their strings here (losing the ability for users
to see if streaming was used) or not extract the function and
duplicate its few shared lines.  Just a feeling, though.

> +	the_hash_algo->final_oid_fn(parano_oid, c);
> +}
> +
>  static int write_loose_object(const struct object_id *oid, char *hdr,
>  			      int hdrlen, const void *buf, unsigned long len,
>  			      time_t mtime, unsigned flags)
> @@ -1871,28 +1948,18 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>
>  	loose_object_path(the_repository, &filename, oid);
>
> -	fd = create_tmpfile(&tmp_file, filename.buf);
> -	if (fd < 0) {
> -		if (flags & HASH_SILENT)
> -			return -1;
> -		else if (errno == EACCES)
> -			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
> -		else
> -			return error_errno(_("unable to create temporary file"));
> -	}
> -
> -	/* Set it up */
> -	git_deflate_init(&stream, zlib_compression_level);
> -	stream.next_out = compressed;
> -	stream.avail_out = sizeof(compressed);
> -	the_hash_algo->init_fn(&c);
> -
> -	/* First header.. */
> -	stream.next_in = (unsigned char *)hdr;
> -	stream.avail_in = hdrlen;
> -	while (git_deflate(&stream, 0) == Z_OK)
> -		; /* nothing */
> -	the_hash_algo->update_fn(&c, hdr, hdrlen);
> +	/* Common steps for write_loose_object and stream_loose_object to
> +	 * start writing loose oject:
> +	 *
> +	 *  - Create tmpfile for the loose object.
> +	 *  - Setup zlib stream for compression.
> +	 *  - Start to feed header to zlib stream.
> +	 */
> +	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
> +				       &stream, compressed, sizeof(compressed),
> +				       &c, OBJ_NONE, 0, hdr, hdrlen);
> +	if (fd < 0)
> +		return -1;
>
>  	/* Then the data itself.. */
>  	stream.next_in = (void *)buf;
> @@ -1907,30 +1974,24 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  		stream.avail_out = sizeof(compressed);
>  	} while (ret == Z_OK);
>
> -	if (ret != Z_STREAM_END)
> -		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
> -		    ret);
> -	ret = git_deflate_end_gently(&stream);
> -	if (ret != Z_OK)
> -		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
> -		    ret);
> -	the_hash_algo->final_oid_fn(&parano_oid, &c);
> +	/* Common steps for write_loose_object and stream_loose_object to
> +	 * end writing loose oject:
> +	 *
> +	 *  - End the compression of zlib stream.
> +	 *  - Get the calculated oid to "parano_oid".
> +	 */
> +	end_loose_object_common(ret, &c, &stream, &parano_oid, oid,
> +				N_("unable to deflate new object %s (%d)"),
> +				N_("deflateEnd on object %s failed (%d)"));
> +
>  	if (!oideq(oid, &parano_oid))
>  		die(_("confused by unstable object source data for %s"),
>  		    oid_to_hex(oid));
>
>  	close_loose_object(fd);
>
> -	if (mtime) {
> -		struct utimbuf utb;
> -		utb.actime = mtime;
> -		utb.modtime = mtime;
> -		if (utime(tmp_file.buf, &utb) < 0 &&
> -		    !(flags & HASH_SILENT))
> -			warning_errno(_("failed utime() on %s"), tmp_file.buf);
> -	}
> -
> -	return finalize_object_file(tmp_file.buf, filename.buf);
> +	return finalize_object_file_with_mtime(tmp_file.buf, filename.buf,
> +					       mtime, flags);
>  }
>
>  static int freshen_loose_object(const struct object_id *oid)