Re: [PATCH v5 0/5] merge-ort: implement support for packing objects together

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, Oct 23, 2023 at 06:44:49PM -0400, Taylor Blau wrote:
> (Rebased onto the current tip of 'master', which is ceadf0f3cf (The
> twentieth batch, 2023-10-20)).
> 
> This series implements support for a new merge-tree option,
> `--write-pack`, which causes any newly-written objects to be packed
> together instead of being stored individually as loose.
> 
> This is a minor follow-up that could be taken instead of v4 (though the
> changes between these two most recent rounds are stylistic and a matter
> of subjective opinion).
> 
> This moves us the bulk_checkin_source structure introduced in response
> to Junio's suggestion during the last round further in the OOP
> direction. Instead of switching on the enum type of the source, have
> function pointers for read() and seek() respectively.
> 
> The functionality at the end is the same, but this avoids some of the
> namespacing issues that Peff pointed out while looking at v4. But I
> think that this approach ended up being less heavy-weight than I had
> originally imagined, so I think that this version is a worthwhile
> improvement over v4.
> 
> Beyond that, the changes since last time can be viewed in the range-diff
> below. Thanks in advance for any review!

Overall this version looks good to me. I've only got two smallish nits
and one question regarding the tests.

Thanks!

Patrick

> [1]: https://lore.kernel.org/git/xmqq34y7plj4.fsf@gitster.g/
> 
> Taylor Blau (5):
>   bulk-checkin: extract abstract `bulk_checkin_source`
>   bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types
>   bulk-checkin: introduce `index_blob_bulk_checkin_incore()`
>   bulk-checkin: introduce `index_tree_bulk_checkin_incore()`
>   builtin/merge-tree.c: implement support for `--write-pack`
> 
>  Documentation/git-merge-tree.txt |   4 +
>  builtin/merge-tree.c             |   5 +
>  bulk-checkin.c                   | 197 +++++++++++++++++++++++++++----
>  bulk-checkin.h                   |   8 ++
>  merge-ort.c                      |  42 +++++--
>  merge-recursive.h                |   1 +
>  t/t4301-merge-tree-write-tree.sh |  93 +++++++++++++++
>  7 files changed, 316 insertions(+), 34 deletions(-)
> 
> Range-diff against v4:
> 1:  97bb6e9f59 ! 1:  696aa027e4 bulk-checkin: extract abstract `bulk_checkin_source`
>     @@ bulk-checkin.c: static int already_written(struct bulk_checkin_packfile *state,
>       }
>       
>      +struct bulk_checkin_source {
>     -+	enum { SOURCE_FILE } type;
>     ++	off_t (*read)(struct bulk_checkin_source *, void *, size_t);
>     ++	off_t (*seek)(struct bulk_checkin_source *, off_t);
>      +
>     -+	/* SOURCE_FILE fields */
>     -+	int fd;
>     ++	union {
>     ++		struct {
>     ++			int fd;
>     ++		} from_fd;
>     ++	} data;
>      +
>     -+	/* common fields */
>      +	size_t size;
>      +	const char *path;
>      +};
>      +
>     -+static off_t bulk_checkin_source_seek_to(struct bulk_checkin_source *source,
>     -+					 off_t offset)
>     ++static off_t bulk_checkin_source_read_from_fd(struct bulk_checkin_source *source,
>     ++					      void *buf, size_t nr)
>      +{
>     -+	switch (source->type) {
>     -+	case SOURCE_FILE:
>     -+		return lseek(source->fd, offset, SEEK_SET);
>     -+	default:
>     -+		BUG("unknown bulk-checkin source: %d", source->type);
>     -+	}
>     ++	return read_in_full(source->data.from_fd.fd, buf, nr);
>      +}
>      +
>     -+static ssize_t bulk_checkin_source_read(struct bulk_checkin_source *source,
>     -+					void *buf, size_t nr)
>     ++static off_t bulk_checkin_source_seek_from_fd(struct bulk_checkin_source *source,
>     ++					      off_t offset)
>      +{
>     -+	switch (source->type) {
>     -+	case SOURCE_FILE:
>     -+		return read_in_full(source->fd, buf, nr);
>     -+	default:
>     -+		BUG("unknown bulk-checkin source: %d", source->type);
>     -+	}
>     ++	return lseek(source->data.from_fd.fd, offset, SEEK_SET);
>     ++}
>     ++
>     ++static void init_bulk_checkin_source_from_fd(struct bulk_checkin_source *source,
>     ++					     int fd, size_t size,
>     ++					     const char *path)
>     ++{
>     ++	memset(source, 0, sizeof(struct bulk_checkin_source));
>     ++
>     ++	source->read = bulk_checkin_source_read_from_fd;
>     ++	source->seek = bulk_checkin_source_seek_from_fd;
>     ++
>     ++	source->data.from_fd.fd = fd;
>     ++
>     ++	source->size = size;
>     ++	source->path = path;
>      +}
>      +
>       /*
>     @@ bulk-checkin.c: static int stream_blob_to_pack(struct bulk_checkin_packfile *sta
>      -			ssize_t read_result = read_in_full(fd, ibuf, rsize);
>      +			ssize_t read_result;
>      +
>     -+			read_result = bulk_checkin_source_read(source, ibuf,
>     -+							       rsize);
>     ++			read_result = source->read(source, ibuf, rsize);
>       			if (read_result < 0)
>      -				die_errno("failed to read from '%s'", path);
>      +				die_errno("failed to read from '%s'",
>     @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *st
>       	unsigned header_len;
>       	struct hashfile_checkpoint checkpoint = {0};
>       	struct pack_idx_entry *idx = NULL;
>     -+	struct bulk_checkin_source source = {
>     -+		.type = SOURCE_FILE,
>     -+		.fd = fd,
>     -+		.size = size,
>     -+		.path = path,
>     -+	};
>     ++	struct bulk_checkin_source source;
>     ++
>     ++	init_bulk_checkin_source_from_fd(&source, fd, size, path);
>       
>       	seekback = lseek(fd, 0, SEEK_CUR);
>       	if (seekback == (off_t) -1)
>     @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *st
>       		state->offset = checkpoint.offset;
>       		flush_bulk_checkin_packfile(state);
>      -		if (lseek(fd, seekback, SEEK_SET) == (off_t) -1)
>     -+		if (bulk_checkin_source_seek_to(&source, seekback) == (off_t)-1)
>     ++		if (source.seek(&source, seekback) == (off_t)-1)
>       			return error("cannot seek back");
>       	}
>       	the_hash_algo->final_oid_fn(result_oid, &ctx);
> 2:  9d633df339 < -:  ---------- bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types
> 3:  d5bbd7810e ! 2:  596bd028a7 bulk-checkin: refactor deflate routine to accept a `bulk_checkin_source`
>     @@ Metadata
>      Author: Taylor Blau <me@xxxxxxxxxxxx>
>      
>       ## Commit message ##
>     -    bulk-checkin: refactor deflate routine to accept a `bulk_checkin_source`
>     +    bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types
>      
>     -    Prepare for a future change where we will want to use a routine very
>     -    similar to the existing `deflate_blob_to_pack()` but over arbitrary
>     -    sources (i.e. either open file-descriptors, or a location in memory).
>     +    The existing `stream_blob_to_pack()` function is named based on the fact
>     +    that it knows only how to stream blobs into a bulk-checkin pack.
>      
>     -    Extract out a common "deflate_obj_to_pack()" routine that acts on a
>     -    bulk_checkin_source, instead of a (int, size_t) pair. Then rewrite
>     -    `deflate_blob_to_pack()` in terms of it.
>     +    But there is no longer anything in this function which prevents us from
>     +    writing objects of arbitrary types to the bulk-checkin pack. Prepare to
>     +    write OBJ_TREEs by removing this assumption, adding an `enum
>     +    object_type` parameter to this function's argument list, and renaming it
>     +    to `stream_obj_to_pack()` accordingly.
>      
>          Signed-off-by: Taylor Blau <me@xxxxxxxxxxxx>
>      
>       ## bulk-checkin.c ##
>     +@@ bulk-checkin.c: static void init_bulk_checkin_source_from_fd(struct bulk_checkin_source *source,
>     +  * status before calling us just in case we ask it to call us again
>     +  * with a new pack.
>     +  */
>     +-static int stream_blob_to_pack(struct bulk_checkin_packfile *state,
>     +-			       git_hash_ctx *ctx, off_t *already_hashed_to,
>     +-			       struct bulk_checkin_source *source,
>     +-			       unsigned flags)
>     ++static int stream_obj_to_pack(struct bulk_checkin_packfile *state,
>     ++			      git_hash_ctx *ctx, off_t *already_hashed_to,
>     ++			      struct bulk_checkin_source *source,
>     ++			      enum object_type type, unsigned flags)
>     + {
>     + 	git_zstream s;
>     + 	unsigned char ibuf[16384];
>     +@@ bulk-checkin.c: static int stream_blob_to_pack(struct bulk_checkin_packfile *state,
>     + 
>     + 	git_deflate_init(&s, pack_compression_level);
>     + 
>     +-	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), OBJ_BLOB,
>     +-					      size);
>     ++	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), type, size);
>     + 	s.next_out = obuf + hdrlen;
>     + 	s.avail_out = sizeof(obuf) - hdrlen;
>     + 
>      @@ bulk-checkin.c: static void prepare_to_stream(struct bulk_checkin_packfile *state,
>       		die_errno("unable to write pack header");
>       }
>     @@ bulk-checkin.c: static void prepare_to_stream(struct bulk_checkin_packfile *stat
>       	unsigned header_len;
>       	struct hashfile_checkpoint checkpoint = {0};
>       	struct pack_idx_entry *idx = NULL;
>     --	struct bulk_checkin_source source = {
>     --		.type = SOURCE_FILE,
>     --		.fd = fd,
>     --		.size = size,
>     --		.path = path,
>     --	};
>     +-	struct bulk_checkin_source source;
>       
>     +-	init_bulk_checkin_source_from_fd(&source, fd, size, path);
>     +-
>      -	seekback = lseek(fd, 0, SEEK_CUR);
>      -	if (seekback == (off_t) -1)
>      -		return error("cannot find the current offset");
>     @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *st
>       		prepare_to_stream(state, flags);
>       		if (idx) {
>      @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
>     + 			idx->offset = state->offset;
>       			crc32_begin(state->f);
>       		}
>     - 		if (!stream_obj_to_pack(state, &ctx, &already_hashed_to,
>     --					&source, OBJ_BLOB, flags))
>     +-		if (!stream_blob_to_pack(state, &ctx, &already_hashed_to,
>     +-					 &source, flags))
>     ++		if (!stream_obj_to_pack(state, &ctx, &already_hashed_to,
>      +					source, type, flags))
>       			break;
>       		/*
>     @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *st
>       		hashfile_truncate(state->f, &checkpoint);
>       		state->offset = checkpoint.offset;
>       		flush_bulk_checkin_packfile(state);
>     --		if (bulk_checkin_source_seek_to(&source, seekback) == (off_t)-1)
>     -+		if (bulk_checkin_source_seek_to(source, seekback) == (off_t)-1)
>     +-		if (source.seek(&source, seekback) == (off_t)-1)
>     ++		if (source->seek(source, seekback) == (off_t)-1)
>       			return error("cannot seek back");
>       	}
>       	the_hash_algo->final_oid_fn(result_oid, &ctx);
>     @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *st
>      +				int fd, size_t size,
>      +				const char *path, unsigned flags)
>      +{
>     -+	struct bulk_checkin_source source = {
>     -+		.type = SOURCE_FILE,
>     -+		.fd = fd,
>     -+		.size = size,
>     -+		.path = path,
>     -+	};
>     -+	off_t seekback = lseek(fd, 0, SEEK_CUR);
>     ++	struct bulk_checkin_source source;
>     ++	off_t seekback;
>     ++
>     ++	init_bulk_checkin_source_from_fd(&source, fd, size, path);
>     ++
>     ++	seekback = lseek(fd, 0, SEEK_CUR);
>      +	if (seekback == (off_t) -1)
>      +		return error("cannot find the current offset");
>      +
> 4:  e427fe6ad3 < -:  ---------- bulk-checkin: implement `SOURCE_INCORE` mode for `bulk_checkin_source`
> 5:  48095afe80 ! 3:  d8cf8e4395 bulk-checkin: introduce `index_blob_bulk_checkin_incore()`
>     @@ Metadata
>       ## Commit message ##
>          bulk-checkin: introduce `index_blob_bulk_checkin_incore()`
>      
>     -    Now that we have factored out many of the common routines necessary to
>     -    index a new object into a pack created by the bulk-checkin machinery, we
>     -    can introduce a variant of `index_blob_bulk_checkin()` that acts on
>     -    blobs whose contents we can fit in memory.
>     +    Introduce `index_blob_bulk_checkin_incore()` which allows streaming
>     +    arbitrary blob contents from memory into the bulk-checkin pack.
>     +
>     +    In order to support streaming from a location in memory, we must
>     +    implement a new kind of bulk_checkin_source that does just that. These
>     +    implementation in spread out across:
>     +
>     +      - init_bulk_checkin_source_incore()
>     +      - bulk_checkin_source_read_incore()
>     +      - bulk_checkin_source_seek_incore()
>     +
>     +    Note that, unlike file descriptors, which manage their own offset
>     +    internally, we have to keep track of how many bytes we've read out of
>     +    the buffer, and make sure we don't read past the end of the buffer.
>      
>          This will be useful in a couple of more commits in order to provide the
>          `merge-tree` builtin with a mechanism to create a new pack containing
>     @@ Commit message
>          Signed-off-by: Taylor Blau <me@xxxxxxxxxxxx>
>      
>       ## bulk-checkin.c ##
>     +@@ bulk-checkin.c: struct bulk_checkin_source {
>     + 		struct {
>     + 			int fd;
>     + 		} from_fd;
>     ++		struct {
>     ++			const void *buf;
>     ++			size_t nr_read;
>     ++		} incore;
>     + 	} data;
>     + 
>     + 	size_t size;
>     +@@ bulk-checkin.c: static off_t bulk_checkin_source_seek_from_fd(struct bulk_checkin_source *source
>     + 	return lseek(source->data.from_fd.fd, offset, SEEK_SET);
>     + }
>     + 
>     ++static off_t bulk_checkin_source_read_incore(struct bulk_checkin_source *source,
>     ++					     void *buf, size_t nr)
>     ++{
>     ++	const unsigned char *src = source->data.incore.buf;
>     ++
>     ++	if (source->data.incore.nr_read > source->size)
>     ++		BUG("read beyond bulk-checkin source buffer end "
>     ++		    "(%"PRIuMAX" > %"PRIuMAX")",
>     ++		    (uintmax_t)source->data.incore.nr_read,
>     ++		    (uintmax_t)source->size);
>     ++
>     ++	if (nr > source->size - source->data.incore.nr_read)
>     ++		nr = source->size - source->data.incore.nr_read;
>     ++
>     ++	src += source->data.incore.nr_read;
>     ++
>     ++	memcpy(buf, src, nr);
>     ++	source->data.incore.nr_read += nr;
>     ++	return nr;
>     ++}
>     ++
>     ++static off_t bulk_checkin_source_seek_incore(struct bulk_checkin_source *source,
>     ++					     off_t offset)
>     ++{
>     ++	if (!(0 <= offset && offset < source->size))
>     ++		return (off_t)-1;
>     ++	source->data.incore.nr_read = offset;
>     ++	return source->data.incore.nr_read;
>     ++}
>     ++
>     + static void init_bulk_checkin_source_from_fd(struct bulk_checkin_source *source,
>     + 					     int fd, size_t size,
>     + 					     const char *path)
>     +@@ bulk-checkin.c: static void init_bulk_checkin_source_from_fd(struct bulk_checkin_source *source,
>     + 	source->path = path;
>     + }
>     + 
>     ++static void init_bulk_checkin_source_incore(struct bulk_checkin_source *source,
>     ++					    const void *buf, size_t size,
>     ++					    const char *path)
>     ++{
>     ++	memset(source, 0, sizeof(struct bulk_checkin_source));
>     ++
>     ++	source->read = bulk_checkin_source_read_incore;
>     ++	source->seek = bulk_checkin_source_seek_incore;
>     ++
>     ++	source->data.incore.buf = buf;
>     ++	source->data.incore.nr_read = 0;
>     ++
>     ++	source->size = size;
>     ++	source->path = path;
>     ++}
>     ++
>     + /*
>     +  * Read the contents from 'source' for 'size' bytes, streaming it to the
>     +  * packfile in state while updating the hash in ctx. Signal a failure
>      @@ bulk-checkin.c: static int deflate_obj_to_pack(struct bulk_checkin_packfile *state,
>       	return 0;
>       }
>     @@ bulk-checkin.c: static int deflate_obj_to_pack(struct bulk_checkin_packfile *sta
>      +				       const char *path, enum object_type type,
>      +				       unsigned flags)
>      +{
>     -+	struct bulk_checkin_source source = {
>     -+		.type = SOURCE_INCORE,
>     -+		.buf = buf,
>     -+		.size = size,
>     -+		.read = 0,
>     -+		.path = path,
>     -+	};
>     ++	struct bulk_checkin_source source;
>     ++
>     ++	init_bulk_checkin_source_incore(&source, buf, size, path);
>      +
>      +	return deflate_obj_to_pack(state, result_oid, &source, type, 0, flags);
>      +}
> 6:  60568f9281 = 4:  2670192802 bulk-checkin: introduce `index_tree_bulk_checkin_incore()`
> 7:  b9be9df122 ! 5:  3595db76a5 builtin/merge-tree.c: implement support for `--write-pack`
>     @@ Documentation/git-merge-tree.txt: OPTIONS
>      
>       ## builtin/merge-tree.c ##
>      @@
>     - #include "quote.h"
>       #include "tree.h"
>       #include "config.h"
>     + #include "strvec.h"
>      +#include "bulk-checkin.h"
>       
>       static int line_termination = '\n';
>       
>      @@ builtin/merge-tree.c: struct merge_tree_options {
>     - 	int show_messages;
>       	int name_only;
>       	int use_stdin;
>     + 	struct merge_options merge_options;
>      +	int write_pack;
>       };
>       
>       static int real_merge(struct merge_tree_options *o,
>      @@ builtin/merge-tree.c: static int real_merge(struct merge_tree_options *o,
>     - 	init_merge_options(&opt, the_repository);
>     + 				 _("not something we can merge"));
>       
>       	opt.show_rename_progress = 0;
>      +	opt.write_pack = o->write_pack;
>     @@ builtin/merge-tree.c: static int real_merge(struct merge_tree_options *o,
>       	opt.branch1 = branch1;
>       	opt.branch2 = branch2;
>      @@ builtin/merge-tree.c: int cmd_merge_tree(int argc, const char **argv, const char *prefix)
>     - 			   &merge_base,
>     - 			   N_("commit"),
>       			   N_("specify a merge-base for the merge")),
>     + 		OPT_STRVEC('X', "strategy-option", &xopts, N_("option=value"),
>     + 			N_("option for selected merge strategy")),
>      +		OPT_BOOL(0, "write-pack", &o.write_pack,
>      +			 N_("write new objects to a pack instead of as loose")),
>       		OPT_END()
> 
> base-commit: ceadf0f3cf51550166a387ec8508bb55e7883057
> -- 
> 2.42.0.425.g963d08ddb3.dirty

Attachment: signature.asc
Description: PGP signature


[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]

  Powered by Linux