Taylor Blau <me@xxxxxxxxxxxx> writes:

>  bulk-checkin.c | 118 +++++++++++++++++++++++++++++++++++++++++++++++++
>  bulk-checkin.h |   4 ++
>  2 files changed, 122 insertions(+)

Unlike the previous four, which were very clear refactorings to create
reusable helper functions, this step leaves a bad aftertaste even after
reading it twice. I think what is disturbing is that many of the new
lines are copied pretty much literally from stream_blob_to_pack().

I wonder if we can introduce an "input source" abstraction that
replaces the "fd" and "size" (and "path" for error reporting)
parameters to stream_blob_to_pack(), so that the bulk of the
implementation of stream_blob_to_pack() can call its .read() method to
read bytes up to "size" from such an abstracted interface. That would
be a good-sized first half of this change. Then in the second half, you
can add another input source that works with an in-core "buf" and
"size", whose .read() method would merely be a memcpy(). That way, we
would have two functions, one for stream_obj_to_pack() that reads from
an open file descriptor, and the other for stream_obj_to_pack_incore()
that reads from an in-core buffer, sharing the bulk of the
implementation extracted from the current code, which hopefully will be
easier to audit. A rough sketch of what I have in mind appears at the
end of this message.

> diff --git a/bulk-checkin.c b/bulk-checkin.c
> index f4914fb6d1..25cd1ffa25 100644
> --- a/bulk-checkin.c
> +++ b/bulk-checkin.c
> @@ -140,6 +140,69 @@ static int already_written(struct bulk_checkin_packfile *state, struct object_id
>  	return 0;
>  }
>
> +static int stream_obj_to_pack_incore(struct bulk_checkin_packfile *state,
> +				     git_hash_ctx *ctx,
> +				     off_t *already_hashed_to,
> +				     const void *buf, size_t size,
> +				     enum object_type type,
> +				     const char *path, unsigned flags)
> +{
> +	git_zstream s;
> +	unsigned char obuf[16384];
> +	unsigned hdrlen;
> +	int status = Z_OK;
> +	int write_object = (flags & HASH_WRITE_OBJECT);
> +
> +	git_deflate_init(&s, pack_compression_level);
> +
> +	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), type, size);
> +	s.next_out = obuf + hdrlen;
> +	s.avail_out = sizeof(obuf) - hdrlen;
> +
> +	if (*already_hashed_to < size) {
> +		size_t hsize = size - *already_hashed_to;
> +		if (hsize) {
> +			the_hash_algo->update_fn(ctx, buf, hsize);
> +		}
> +		*already_hashed_to = size;
> +	}
> +	s.next_in = (void *)buf;
> +	s.avail_in = size;
> +
> +	while (status != Z_STREAM_END) {
> +		status = git_deflate(&s, Z_FINISH);
> +		if (!s.avail_out || status == Z_STREAM_END) {
> +			if (write_object) {
> +				size_t written = s.next_out - obuf;
> +
> +				/* would we bust the size limit? */
> +				if (state->nr_written &&
> +				    pack_size_limit_cfg &&
> +				    pack_size_limit_cfg < state->offset + written) {
> +					git_deflate_abort(&s);
> +					return -1;
> +				}
> +
> +				hashwrite(state->f, obuf, written);
> +				state->offset += written;
> +			}
> +			s.next_out = obuf;
> +			s.avail_out = sizeof(obuf);
> +		}
> +
> +		switch (status) {
> +		case Z_OK:
> +		case Z_BUF_ERROR:
> +		case Z_STREAM_END:
> +			continue;
> +		default:
> +			die("unexpected deflate failure: %d", status);
> +		}
> +	}
> +	git_deflate_end(&s);
> +	return 0;
> +}
> +
>  /*
>   * Read the contents from fd for size bytes, streaming it to the
>   * packfile in state while updating the hash in ctx. Signal a failure
> @@ -316,6 +379,50 @@ static void finalize_checkpoint(struct bulk_checkin_packfile *state,
>  	}
>  }
>
> +static int deflate_obj_contents_to_pack_incore(struct bulk_checkin_packfile *state,
> +					       git_hash_ctx *ctx,
> +					       struct hashfile_checkpoint *checkpoint,
> +					       struct object_id *result_oid,
> +					       const void *buf, size_t size,
> +					       enum object_type type,
> +					       const char *path, unsigned flags)
> +{
> +	struct pack_idx_entry *idx = NULL;
> +	off_t already_hashed_to = 0;
> +
> +	/* Note: idx is non-NULL when we are writing */
> +	if (flags & HASH_WRITE_OBJECT)
> +		CALLOC_ARRAY(idx, 1);
> +
> +	while (1) {
> +		prepare_checkpoint(state, checkpoint, idx, flags);
> +		if (!stream_obj_to_pack_incore(state, ctx, &already_hashed_to,
> +					       buf, size, type, path, flags))
> +			break;
> +		truncate_checkpoint(state, checkpoint, idx);
> +	}
> +
> +	finalize_checkpoint(state, ctx, checkpoint, idx, result_oid);
> +
> +	return 0;
> +}
> +
> +static int deflate_blob_to_pack_incore(struct bulk_checkin_packfile *state,
> +				       struct object_id *result_oid,
> +				       const void *buf, size_t size,
> +				       const char *path, unsigned flags)
> +{
> +	git_hash_ctx ctx;
> +	struct hashfile_checkpoint checkpoint = {0};
> +
> +	format_object_header_hash(the_hash_algo, &ctx, &checkpoint, OBJ_BLOB,
> +				  size);
> +
> +	return deflate_obj_contents_to_pack_incore(state, &ctx, &checkpoint,
> +						   result_oid, buf, size,
> +						   OBJ_BLOB, path, flags);
> +}
> +
>  static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
>  				struct object_id *result_oid,
>  				int fd, size_t size,
> @@ -396,6 +503,17 @@ int index_blob_bulk_checkin(struct object_id *oid,
>  	return status;
>  }
>
> +int index_blob_bulk_checkin_incore(struct object_id *oid,
> +				   const void *buf, size_t size,
> +				   const char *path, unsigned flags)
> +{
> +	int status = deflate_blob_to_pack_incore(&bulk_checkin_packfile, oid,
> +						 buf, size, path, flags);
> +	if (!odb_transaction_nesting)
> +		flush_bulk_checkin_packfile(&bulk_checkin_packfile);
> +	return status;
> +}
> +
>  void begin_odb_transaction(void)
>  {
>  	odb_transaction_nesting += 1;
> diff --git a/bulk-checkin.h b/bulk-checkin.h
> index aa7286a7b3..1b91daeaee 100644
> --- a/bulk-checkin.h
> +++ b/bulk-checkin.h
> @@ -13,6 +13,10 @@ int index_blob_bulk_checkin(struct object_id *oid,
>  			    int fd, size_t size,
>  			    const char *path, unsigned flags);
>
> +int index_blob_bulk_checkin_incore(struct object_id *oid,
> +				   const void *buf, size_t size,
> +				   const char *path, unsigned flags);
> +
>  /*
>   * Tell the object database to optimize for adding
>   * multiple objects. end_odb_transaction must be called
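
To make the "input source" idea above a bit more concrete, here is a
rough sketch. This is totally untested, and every name in it
(bulk_checkin_source, from_fd, incore, and the two read helpers) is
made up purely for illustration; read_in_full() is the helper the
current stream_blob_to_pack() code already uses.

	struct bulk_checkin_source {
		/*
		 * Read up to "nr" bytes into "buf", returning the
		 * number of bytes actually read, or a negative value
		 * on error.  A source has "size" bytes in total to
		 * give out.
		 */
		ssize_t (*read)(struct bulk_checkin_source *,
				void *, size_t);
		off_t size;
		const char *path;	/* for error reporting */
		union {
			struct {
				int fd;
			} from_fd;
			struct {
				const void *buf;
				size_t read;	/* bytes given out so far */
			} incore;
		} data;
	};

	/* read from a file descriptor, as the current code does */
	static ssize_t bulk_checkin_source_read_fd(struct bulk_checkin_source *source,
						   void *buf, size_t nr)
	{
		return read_in_full(source->data.from_fd.fd, buf, nr);
	}

	/* reading from an in-core buffer is merely a memcpy() */
	static ssize_t bulk_checkin_source_read_incore(struct bulk_checkin_source *source,
						       void *buf, size_t nr)
	{
		size_t remaining = source->size - source->data.incore.read;

		if (remaining < nr)
			nr = remaining;
		memcpy(buf, (const char *)source->data.incore.buf +
		       source->data.incore.read, nr);
		source->data.incore.read += nr;
		return nr;
	}

The shared half of the implementation would then take a single "struct
bulk_checkin_source *" in place of the "fd", "size", and "path"
parameters, and say something like

	read_result = source->read(source, ibuf, rsize);

where the current code says "read_result = read_in_full(fd, ibuf,
rsize)", leaving the from-file and in-core variants as thin wrappers
around it.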